diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..70e40d6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.dist_test/ +*.avi +ckpts/ +data/bench2drive/ +data/infos +data_carla +viz/ +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# output & ckpts +output/ +test/ +ckpts/ +ckpts + +# work_dirs +**/work_dirs + +batchscript* +phoenix* + +debug/ +*projs/ + +INFO +pyrightconfig.json +.vscode/ +*.pth +*.log +tmp_ckpts/ +val/ +*.ipynb \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..59c5fd8 --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ + +

+# Bench2DriveZoo
+
+# Introduction
+
+- We implement training and open-loop evaluation for [BEVFormer](https://github.com/fundamentalvision/BEVFormer), [UniAD](https://github.com/OpenDriveLab/UniAD), and [VAD](https://github.com/hustvl/VAD) on the [Bench2Drive](https://github.com/Thinklab-SJTU/Bench2Drive) dataset.
+- We implement closed-loop evaluation in CARLA for UniAD and VAD on Bench2Drive.
+- We simplify the code framework by merging dependencies such as mmcv, mmseg, mmdet, and mmdet3d into a single library and support the latest PyTorch (2.3.1), which greatly facilitates installation and development.
+
+# Getting Started
+
+- [Installation](docs/INSTALL.md)
+- [Prepare Dataset](docs/INSTALL.md)
+- [Train and Open-Loop Eval](docs/TRAIN_EVAL.md)
+- [Closed-Loop Eval in CARLA](docs/EVAL_IN_CARLA.md)
+- [Convert Codes from nuScenes to Bench2Drive](docs/CONVERT_GUIDE.md)
+
+# Results and Pre-trained Models
+
+## UniAD and VAD
+
+| Method | L2 (m) 2s | Driving Score | Success Rate (%) | Config | Download |
+| :---: | :---: | :---: | :---: | :---: | :---: |
+| UniAD-Tiny | 0.80 | 32.00 | 9.54 | [config](adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/bevformer_tiny_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/1psr7AKYHD7CitZ30Bz-9sA?pwd=1234) |
+| UniAD-Base | 0.73 | 37.72 | 9.54 | [config](adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/uniad_base_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/11p9IUGqTax1f4W_qsdLCRw?pwd=1234) |
+| VAD | 0.91 | 39.4 | 10.0 | [config](adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/vad_b2d_base.pth)/[Baidu Cloud](https://pan.baidu.com/s/11p9IUGqTax1f4W_qsdLCRw?pwd=1234) |
+
+## BEVFormer
+
+| Method | mAP | NDS | Config | Download |
+| :---: | :---: | :---: | :---: | :---: |
+| BEVFormer-Tiny | 0.37 | 0.43 | [config](adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/bevformer_tiny_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/1TWMs9YgKYm2DF5YfXF8i3g?pwd=1234) |
+| BEVFormer-Base | 0.63 | 0.67 | [config](adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py) | [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/bevformer_base_b2d.pth)/[Baidu Cloud](https://pan.baidu.com/s/1Y4VkE1gc8BU0zJ4z2fmIkQ?pwd=1234) |
+
+# Related Resources
+
+- [Bench2Drive](https://github.com/Thinklab-SJTU/Bench2Drive)
+- [BEVFormer](https://github.com/fundamentalvision/BEVFormer)
+- [UniAD](https://github.com/OpenDriveLab/UniAD)
+- [VAD](https://github.com/hustvl/VAD) diff --git a/adzoo/__init__.py b/adzoo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adzoo/bevformer/analysis_tools/__init__.py b/adzoo/bevformer/analysis_tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adzoo/bevformer/analysis_tools/analyze_logs.py b/adzoo/bevformer/analysis_tools/analyze_logs.py new file mode 100755 index 0000000..806175f --- /dev/null +++ b/adzoo/bevformer/analysis_tools/analyze_logs.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved.
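The `analyze_logs.py` script that follows parses MMDet-style JSON training logs, one JSON object per line. A minimal sketch of one such record, with field names inferred from `load_json_logs`/`plot_curve` below and purely illustrative values:

```python
# Illustrative only: one line of the .log.json file consumed by load_json_logs().
# 'epoch' is popped and used as the grouping key; every remaining key becomes a
# plottable metric, and the per-iteration 'time' values feed cal_train_time.
example_record = {
    "mode": "train",   # 'train' or 'val'; plot_curve drops the trailing 'val' entry
    "epoch": 1,
    "iter": 50,
    "lr": 2e-4,        # hypothetical values
    "memory": 10240,
    "loss": 1.234,
    "time": 0.85,
}
```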
+import argparse +import json +import numpy as np +import seaborn as sns +from collections import defaultdict +from matplotlib import pyplot as plt + + +def cal_train_time(log_dicts, args): + for i, log_dict in enumerate(log_dicts): + print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') + all_times = [] + for epoch in log_dict.keys(): + if args.include_outliers: + all_times.append(log_dict[epoch]['time']) + else: + all_times.append(log_dict[epoch]['time'][1:]) + all_times = np.array(all_times) + epoch_ave_time = all_times.mean(-1) + slowest_epoch = epoch_ave_time.argmax() + fastest_epoch = epoch_ave_time.argmin() + std_over_epoch = epoch_ave_time.std() + print(f'slowest epoch {slowest_epoch + 1}, ' + f'average time is {epoch_ave_time[slowest_epoch]:.4f}') + print(f'fastest epoch {fastest_epoch + 1}, ' + f'average time is {epoch_ave_time[fastest_epoch]:.4f}') + print(f'time std over epochs is {std_over_epoch:.4f}') + print(f'average iter time: {np.mean(all_times):.4f} s/iter') + print() + + +def plot_curve(log_dicts, args): + if args.backend is not None: + plt.switch_backend(args.backend) + sns.set_style(args.style) + # if legend is None, use {filename}_{key} as legend + legend = args.legend + if legend is None: + legend = [] + for json_log in args.json_logs: + for metric in args.keys: + legend.append(f'{json_log}_{metric}') + assert len(legend) == (len(args.json_logs) * len(args.keys)) + metrics = args.keys + + num_metrics = len(metrics) + for i, log_dict in enumerate(log_dicts): + epochs = list(log_dict.keys()) + for j, metric in enumerate(metrics): + print(f'plot curve of {args.json_logs[i]}, metric is {metric}') + if metric not in log_dict[epochs[args.interval - 1]]: + raise KeyError( + f'{args.json_logs[i]} does not contain metric {metric}') + + if args.mode == 'eval': + if min(epochs) == args.interval: + x0 = args.interval + else: + # if current training is resumed from previous checkpoint + # we lost information in early epochs + # `xs` should start according to `min(epochs)` + if min(epochs) % args.interval == 0: + x0 = min(epochs) + else: + # find the first epoch that do eval + x0 = min(epochs) + args.interval - \ + min(epochs) % args.interval + xs = np.arange(x0, max(epochs) + 1, args.interval) + ys = [] + for epoch in epochs[args.interval - 1::args.interval]: + ys += log_dict[epoch][metric] + + # if training is aborted before eval of the last epoch + # `xs` and `ys` will have different length and cause an error + # check if `ys[-1]` is empty here + if not log_dict[epoch][metric]: + xs = xs[:-1] + + ax = plt.gca() + ax.set_xticks(xs) + plt.xlabel('epoch') + plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') + else: + xs = [] + ys = [] + num_iters_per_epoch = \ + log_dict[epochs[args.interval-1]]['iter'][-1] + for epoch in epochs[args.interval - 1::args.interval]: + iters = log_dict[epoch]['iter'] + if log_dict[epoch]['mode'][-1] == 'val': + iters = iters[:-1] + xs.append( + np.array(iters) + (epoch - 1) * num_iters_per_epoch) + ys.append(np.array(log_dict[epoch][metric][:len(iters)])) + xs = np.concatenate(xs) + ys = np.concatenate(ys) + plt.xlabel('iter') + plt.plot( + xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) + plt.legend() + if args.title is not None: + plt.title(args.title) + if args.out is None: + plt.show() + else: + print(f'save curve to: {args.out}') + plt.savefig(args.out) + plt.cla() + + +def add_plot_parser(subparsers): + parser_plt = subparsers.add_parser( + 'plot_curve', help='parser for plotting curves') + 
parser_plt.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_plt.add_argument( + '--keys', + type=str, + nargs='+', + default=['mAP_0.25'], + help='the metric that you want to plot') + parser_plt.add_argument('--title', type=str, help='title of figure') + parser_plt.add_argument( + '--legend', + type=str, + nargs='+', + default=None, + help='legend of each plot') + parser_plt.add_argument( + '--backend', type=str, default=None, help='backend of plt') + parser_plt.add_argument( + '--style', type=str, default='dark', help='style of plt') + parser_plt.add_argument('--out', type=str, default=None) + parser_plt.add_argument('--mode', type=str, default='train') + parser_plt.add_argument('--interval', type=int, default=1) + + +def add_time_parser(subparsers): + parser_time = subparsers.add_parser( + 'cal_train_time', + help='parser for computing the average time per training iteration') + parser_time.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_time.add_argument( + '--include-outliers', + action='store_true', + help='include the first value of every epoch when computing ' + 'the average time') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Analyze Json Log') + # currently only support plot curve and calculate average train time + subparsers = parser.add_subparsers(dest='task', help='task parser') + add_plot_parser(subparsers) + add_time_parser(subparsers) + args = parser.parse_args() + return args + + +def load_json_logs(json_logs): + # load and convert json_logs to log_dict, key is epoch, value is a sub dict + # keys of sub dict is different metrics, e.g. memory, bbox_mAP + # value of sub dict is a list of corresponding values of all iterations + log_dicts = [dict() for _ in json_logs] + for json_log, log_dict in zip(json_logs, log_dicts): + with open(json_log, 'r') as log_file: + for line in log_file: + log = json.loads(line.strip()) + # skip lines without `epoch` field + if 'epoch' not in log: + continue + epoch = log.pop('epoch') + if epoch not in log_dict: + log_dict[epoch] = defaultdict(list) + for k, v in log.items(): + log_dict[epoch][k].append(v) + return log_dicts + + +def main(): + args = parse_args() + + json_logs = args.json_logs + for json_log in json_logs: + assert json_log.endswith('.json') + + log_dicts = load_json_logs(json_logs) + + eval(args.task)(log_dicts, args) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/analysis_tools/benchmark.py b/adzoo/bevformer/analysis_tools/benchmark.py new file mode 100755 index 0000000..487a348 --- /dev/null +++ b/adzoo/bevformer/analysis_tools/benchmark.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
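The benchmark script whose body follows measures pure inference throughput by skipping a few warm-up iterations and synchronizing CUDA around each forward pass. A condensed sketch of that timing loop, assuming a model and data loader that follow the same mmdet-style test call convention:

```python
import time
import torch

def measure_fps(model, data_loader, num_warmup=5, max_samples=2000):
    """Sketch of the timing loop used below: skip warm-up iterations,
    synchronize CUDA around every forward pass, and report images/second."""
    model.eval()
    pure_inf_time = 0.0
    for i, data in enumerate(data_loader):
        torch.cuda.synchronize()
        start = time.perf_counter()
        with torch.no_grad():
            model(return_loss=False, rescale=True, **data)  # mmdet-style test call
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start
        if i >= num_warmup:
            pure_inf_time += elapsed
        if (i + 1) == max_samples:
            return (i + 1 - num_warmup) / pure_inf_time
    return None  # loader exhausted before reaching max_samples
```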
+import argparse +import time +import torch +from mmcv import Config +from mmcv.parallel import MMDataParallel +from mmcv.runner import load_checkpoint, wrap_fp16_model +import sys +sys.path.append('.') +from projects.mmdet3d_plugin.datasets.builder import build_dataloader +from projects.mmdet3d_plugin.datasets import custom_build_dataset +# from mmdet3d.datasets import build_dataloader, build_dataset +from mmdet3d.models import build_detector +#from tools.misc.fuse_conv_bn import fuse_module + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMDet benchmark a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument('--samples', default=2000, help='samples to benchmark') + parser.add_argument( + '--log-interval', default=50, help='interval of logging') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # build the dataloader + # TODO: support multiple images per gpu (only minor changes are needed) + print(cfg.data.test) + dataset = custom_build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint, map_location='cpu') + #if args.fuse_conv_bn: + # model = fuse_module(model) + + model = MMDataParallel(model, device_ids=[0]) + + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + + # benchmark with several samples and take the average + for i, data in enumerate(data_loader): + torch.cuda.synchronize() + start_time = time.perf_counter() + with torch.no_grad(): + model(return_loss=False, rescale=True, **data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % args.log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Done image [{i + 1:<3}/ {args.samples}], ' + f'fps: {fps:.1f} img / s') + + if (i + 1) == args.samples: + pure_inf_time += elapsed + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Overall fps: {fps:.1f} img / s') + break + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/analysis_tools/get_params.py b/adzoo/bevformer/analysis_tools/get_params.py new file mode 100644 index 0000000..fb697ad --- /dev/null +++ b/adzoo/bevformer/analysis_tools/get_params.py @@ -0,0 +1,10 @@ +import torch +file_path = './ckpts/bevformer_v4.pth' +model = torch.load(file_path, map_location='cpu') +all = 0 +for key in list(model['state_dict'].keys()): + all += model['state_dict'][key].nelement() +print(all) + +# smaller 63374123 +# v4 69140395 diff --git a/adzoo/bevformer/analysis_tools/visual.py b/adzoo/bevformer/analysis_tools/visual.py new file mode 100644 index 0000000..f711b75 --- 
/dev/null +++ b/adzoo/bevformer/analysis_tools/visual.py @@ -0,0 +1,477 @@ +# Based on https://github.com/nutonomy/nuscenes-devkit +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import mmcv +from nuscenes.nuscenes import NuScenes +from PIL import Image +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from typing import Tuple, List, Iterable +import matplotlib.pyplot as plt +import numpy as np +from PIL import Image +from matplotlib import rcParams +from matplotlib.axes import Axes +from pyquaternion import Quaternion +from PIL import Image +from matplotlib import rcParams +from matplotlib.axes import Axes +from pyquaternion import Quaternion +from tqdm import tqdm +from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.detection.render import visualize_sample + + + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + +import numpy as np +import matplotlib.pyplot as plt +from nuscenes.utils.data_classes import LidarPointCloud, RadarPointCloud, Box +from PIL import Image +from matplotlib import rcParams + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. + """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' + + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. 
+ lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. + axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. 
+ :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. 
+ box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + + + +def lidiar_render(sample_token, data,out_path=None): + bbox_gt_list = [] + bbox_pred_list = [] + anns = nusc.get('sample', sample_token)['anns'] + for ann in anns: + content = nusc.get('sample_annotation', ann) + try: + bbox_gt_list.append(DetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(DetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + print('green is ground truth') + print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, savepath=out_path+'_bev') + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
+ """ + a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker', + 'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller', + 'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris', + 'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle', + 'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance', + 'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface', + 'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation', + 'vehicle.ego'] + class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' + ] + #print(category_name) + if category_name == 'bicycle': + return nusc.colormap['vehicle.bicycle'] + elif category_name == 'construction_vehicle': + return nusc.colormap['vehicle.construction'] + elif category_name == 'traffic_cone': + return nusc.colormap['movable_object.trafficcone'] + + for key in nusc.colormap.keys(): + if category_name in key: + return nusc.colormap[key] + return [0, 0, 0] + + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. 
When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path) + sample = nusc.get('sample', sample_toekn) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + if ax is None: + _, ax = plt.subplots(4, 3, figsize=(24, 18)) + j = 0 + for ind, cam in enumerate(cams): + sample_data_token = sample['data'][cam] + + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + # Load boxes and image. + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + pred_data['results'][sample_toekn] if record['detection_score'] > 0.2] + + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=box_vis_level, pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level) + if ind == 3: + j += 1 + ind = ind % 3 + data = Image.open(data_path) + # mmcv.imwrite(np.array(data)[:,:,::-1], f'{cam}.png') + # Init axes. + + # Show image. + ax[j, ind].imshow(data) + ax[j + 2, ind].imshow(data) + + # Show boxes. + if with_anns: + for box in boxes_pred: + c = np.array(get_color(box.name)) / 255.0 + box.render(ax[j, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + for box in boxes_gt: + c = np.array(get_color(box.name)) / 255.0 + box.render(ax[j + 2, ind], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Limit visible range. 
+ ax[j, ind].set_xlim(0, data.size[0]) + ax[j, ind].set_ylim(data.size[1], 0) + ax[j + 2, ind].set_xlim(0, data.size[0]) + ax[j + 2, ind].set_ylim(data.size[1], 0) + + else: + raise ValueError("Error: Unknown sensor modality!") + + ax[j, ind].axis('off') + ax[j, ind].set_title('PRED: {} {labels_type}'.format( + sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else '')) + ax[j, ind].set_aspect('equal') + + ax[j + 2, ind].axis('off') + ax[j + 2, ind].set_title('GT:{} {labels_type}'.format( + sd_record['channel'], labels_type='(predictions)' if lidarseg_preds_bin_path else '')) + ax[j + 2, ind].set_aspect('equal') + + if out_path is not None: + plt.savefig(out_path+'_camera', bbox_inches='tight', pad_inches=0, dpi=200) + if verbose: + plt.show() + plt.close() + +if __name__ == '__main__': + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + # render_annotation('7603b030b42a4b1caa8c443ccc1a7d52') + bevformer_results = mmcv.load('test/bevformer_base/Thu_Jun__9_16_22_37_2022/pts_bbox/results_nusc.json') + sample_token_list = list(bevformer_results['results'].keys()) + for id in range(0, 10): + render_sample_data(sample_token_list[id], pred_data=bevformer_results, out_path=sample_token_list[id]) diff --git a/adzoo/bevformer/apis/__init__.py b/adzoo/bevformer/apis/__init__.py new file mode 100644 index 0000000..15520b2 --- /dev/null +++ b/adzoo/bevformer/apis/__init__.py @@ -0,0 +1,2 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector \ No newline at end of file diff --git a/adzoo/bevformer/apis/mmdet_train.py b/adzoo/bevformer/apis/mmdet_train.py new file mode 100644 index 0000000..3372f16 --- /dev/null +++ b/adzoo/bevformer/apis/mmdet_train.py @@ -0,0 +1,193 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from torch.nn import DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel +from mmcv.runner import HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_runner +from mmcv.utils import build_from_cfg, get_root_logger + +from mmcv.core import EvalHook +from mmcv.optims import build_optimizer +from mmcv.datasets import build_dataset, replace_ImageToTensor +import time +import os.path as osp +from mmcv.datasets.builder import build_dataloader +from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook + +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = DistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = DistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = DataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = DataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 
'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/adzoo/bevformer/apis/test.py b/adzoo/bevformer/apis/test.py new file mode 100644 index 0000000..7667395 --- /dev/null +++ b/adzoo/bevformer/apis/test.py @@ -0,0 +1,163 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.utils import get_dist_info + +from mmcv.core import encode_mask_results +from mmcv.fileio.io import dump, load +from mmcv.utils import mkdir_or_exist, ProgressBar + +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. 
+ tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(data, return_loss=False, rescale=True) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' + # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return bbox_results + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mkdir_or_exist(tmpdir) + # dump the part result to the dir + dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) \ No newline at end of file diff --git 
a/adzoo/bevformer/apis/train.py b/adzoo/bevformer/apis/train.py new file mode 100644 index 0000000..dcae402 --- /dev/null +++ b/adzoo/bevformer/apis/train.py @@ -0,0 +1,65 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .mmdet_train import custom_train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/adzoo/bevformer/configs/_base_/datasets/coco_instance.py b/adzoo/bevformer/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000..f6ea4f4 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/adzoo/bevformer/configs/_base_/datasets/kitti-3d-3class.py b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-3class.py new file mode 100644 index 0000000..1822af4 
--- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-3class.py @@ -0,0 +1,140 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/kitti-3d-car.py b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-car.py new file mode 100644 index 0000000..1e81226 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/kitti-3d-car.py @@ -0,0 +1,138 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/lyft-3d.py b/adzoo/bevformer/configs/_base_/datasets/lyft-3d.py new file mode 100644 index 0000000..71baff0 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/nuim_instance.py b/adzoo/bevformer/configs/_base_/datasets/nuim_instance.py new file mode 100644 index 0000000..82fce56 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/nuim_instance.py @@ -0,0 +1,59 @@ +dataset_type = 'CocoDataset' +data_root = 'data/nuimages/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1280, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-train.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/adzoo/bevformer/configs/_base_/datasets/nus-3d.py b/adzoo/bevformer/configs/_base_/datasets/nus-3d.py new file mode 100644 index 0000000..1548171 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,142 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/nus-mono3d.py b/adzoo/bevformer/configs/_base_/datasets/nus-mono3d.py new file mode 100644 index 0000000..1363a94 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/nus-mono3d.py @@ -0,0 +1,100 @@ +dataset_type = 'CustomNuScenesMonoDataset' +data_root = 'data/nuscenes/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/adzoo/bevformer/configs/_base_/datasets/range100_lyft-3d.py b/adzoo/bevformer/configs/_base_/datasets/range100_lyft-3d.py new file mode 100644 index 0000000..efa63ea --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/range100_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-100, -100, -5, 100, 100, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/s3dis-3d-5class.py b/adzoo/bevformer/configs/_base_/datasets/s3dis-3d-5class.py new file mode 100644 index 0000000..2422766 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/s3dis-3d-5class.py @@ -0,0 +1,114 @@ +# dataset settings +dataset_type = 'S3DISDataset' +data_root = './data/s3dis/' +class_names = ('table', 'chair', 'sofa', 'bookcase', 'board') +train_area = [1, 2, 3, 4, 6] +test_area = 5 + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + # following ScanNet dataset the rotation range is 5 degrees + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{i}.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + box_type_3d='Depth') for i in train_area + ], + separate_eval=False)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/s3dis_seg-3d-13class.py b/adzoo/bevformer/configs/_base_/datasets/s3dis_seg-3d-13class.py new file mode 100644 index 0000000..39bf556 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/s3dis_seg-3d-13class.py @@ -0,0 +1,139 @@ +# dataset settings +dataset_type = 'S3DISSegDataset' +data_root = './data/s3dis/' +class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') +num_points = 4096 +train_area = [1, 2, 3, 4, 6] +test_area = 5 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.0, + ignore_index=len(class_names), + use_normalized_coord=True, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! 
+eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + # train on area 1, 2, 3, 4, 6 + # test on area 5 + train=dict( + type=dataset_type, + data_root=data_root, + ann_files=[ + data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area + ], + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=[ + data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' + for i in train_area + ]), + val=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names), + scene_idxs=data_root + + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/scannet-3d-18class.py b/adzoo/bevformer/configs/_base_/datasets/scannet-3d-18class.py new file mode 100644 index 0000000..93da1e5 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/scannet-3d-18class.py @@ -0,0 +1,128 @@ +# dataset settings +dataset_type = 'ScanNetDataset' +data_root = './data/scannet/' +class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_mask_3d=True, + with_seg_3d=True), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='PointSegClassMapping', + valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, + 36, 39), + max_cat_id=40), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', + 'pts_instance_mask' + ]) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + 
flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/scannet_seg-3d-20class.py b/adzoo/bevformer/configs/_base_/datasets/scannet_seg-3d-20class.py new file mode 100644 index 0000000..cf73b09 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/scannet_seg-3d-20class.py @@ -0,0 +1,132 @@ +# dataset settings +dataset_type = 'ScanNetSegDataset' +data_root = './data/scannet/' +class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', + 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', + 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', + 'bathtub', 'otherfurniture') +num_points = 8192 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.5, + ignore_index=len(class_names), + use_normalized_coord=False, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + 
translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names)), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/sunrgbd-3d-10class.py b/adzoo/bevformer/configs/_base_/datasets/sunrgbd-3d-10class.py new file mode 100644 index 0000000..7121b75 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/sunrgbd-3d-10class.py @@ -0,0 +1,107 @@ +dataset_type = 'SUNRGBDDataset' +data_root = 'data/sunrgbd/' +class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='LoadAnnotations3D'), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.523599, 0.523599], + scale_ratio_range=[0.85, 1.15], + shift_height=True), + dict(type='PointSample', num_points=20000), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict(type='PointSample', num_points=20000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt 
loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-3class.py b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-3class.py new file mode 100644 index 0000000..920ac15 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-3class.py @@ -0,0 +1,145 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'LidarWaymoDataset' +data_root = 'data/waymo-full/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-car.py b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-car.py new file mode 100644 index 0000000..02e2627 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/datasets/waymoD5-3d-car.py @@ -0,0 +1,143 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/_base_/default_runtime.py b/adzoo/bevformer/configs/_base_/default_runtime.py new file mode 100644 index 0000000..4e85b69 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/default_runtime.py @@ -0,0 +1,18 @@ +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = None +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/adzoo/bevformer/configs/_base_/models/3dssd.py b/adzoo/bevformer/configs/_base_/models/3dssd.py new file mode 100644 index 0000000..55344c7 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/3dssd.py @@ -0,0 +1,77 @@ +model = dict( + type='SSD3DNet', + backbone=dict( + type='PointNet2SAMSG', + in_channels=4, + num_points=(4096, 512, (256, 256)), + radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), + num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), + sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), + ((64, 64, 128), (64, 64, 128), (64, 96, 128)), + ((128, 128, 256), (128, 192, 256), (128, 256, 256))), + aggregation_channels=(64, 128, 256), + fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), + fps_sample_range_lists=((-1), (-1), (512, -1)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + bbox_head=dict( + type='SSD3DHead', + in_channels=256, + vote_module_cfg=dict( + in_channels=256, + num_points=256, + gt_per_seed=1, + conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + with_res_feat=False, + vote_xyz_range=(3.0, 3.0, 2.0)), + vote_aggregation_cfg=dict( + type='PointSAModuleMSG', + num_point=256, + radii=(4.8, 6.4), + sample_nums=(16, 32), + mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), + norm_cfg=dict(type='BN2d', eps=1e-3, 
momentum=0.1), + use_xyz=True, + normalize_xyz=False, + bias=True), + pred_layer_cfg=dict( + in_channels=1536, + shared_conv_channels=(512, 128), + cls_conv_channels=(128, ), + reg_conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + objectness_loss=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + corner_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), + test_cfg=dict( + nms_cfg=dict(type='nms', iou_thr=0.1), + sample_mod='spec', + score_thr=0.0, + per_class_proposal=True, + max_output_num=100)) diff --git a/adzoo/bevformer/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/adzoo/bevformer/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000..fb9e0a8 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,200 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + 
type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/adzoo/bevformer/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/adzoo/bevformer/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py new file mode 100644 index 0000000..efdce59 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.1, 0.1, 0.2] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[41, 1024, 1024], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + 
layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([256, 256]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[1024, 1024, 40], + voxel_size=voxel_size, + out_size_factor=8, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/adzoo/bevformer/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/adzoo/bevformer/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py new file mode 100644 index 0000000..311d763 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.2, 0.2, 8] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=(0.2, 0.2, 8), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + legacy=False), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([128, 128, 128]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, 
class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=4, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + out_size_factor=4, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=4, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/adzoo/bevformer/configs/_base_/models/fcos3d.py b/adzoo/bevformer/configs/_base_/models/fcos3d.py new file mode 100644 index 0000000..92ea907 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/fcos3d.py @@ -0,0 +1,74 @@ +model = dict( + type='FCOSMono3D', + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSMono3DHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(256, ), + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + () # velo + ), + dir_branch=(256, ), + attr_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True), + train_cfg=dict( + allowed_border=0, + code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.8, + score_thr=0.05, + min_bbox_size=0, + max_per_img=200)) diff --git a/adzoo/bevformer/configs/_base_/models/groupfree3d.py b/adzoo/bevformer/configs/_base_/models/groupfree3d.py new file mode 100644 index 
0000000..077d049 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/groupfree3d.py @@ -0,0 +1,71 @@ +model = dict( + type='GroupFree3DNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=3, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 288)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='GroupFree3DHead', + in_channels=288, + num_decoder_layers=6, + num_proposal=256, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='GroupFree3DMHA', + embed_dims=288, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + embed_dims=288, + feedforward_channels=2048, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', + 'norm')), + pred_layer_cfg=dict( + in_channels=288, shared_conv_channels=(288, 288), bias=True), + sampling_objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=8.0), + objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(sample_mod='kps'), + test_cfg=dict( + sample_mod='kps', + nms_thr=0.25, + score_thr=0.0, + per_class_proposal=True, + prediction_stages='last')) diff --git a/adzoo/bevformer/configs/_base_/models/h3dnet.py b/adzoo/bevformer/configs/_base_/models/h3dnet.py new file mode 100644 index 0000000..7605667 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/h3dnet.py @@ -0,0 +1,341 @@ +primitive_z_cfg = dict( + type='PrimitiveHead', + num_dims=2, + num_classes=18, + primitive_mode='z', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + 
var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_xy_cfg = dict( + type='PrimitiveHead', + num_dims=1, + num_classes=18, + primitive_mode='xy', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_line_cfg = dict( + type='PrimitiveHead', + num_dims=0, + num_classes=18, + primitive_mode='line', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +model = dict( + type='H3DNet', + backbone=dict( + type='MultiBackbone', + num_streams=4, + suffixes=['net0', 'net1', 'net2', 'net3'], + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), + act_cfg=dict(type='ReLU'), + backbones=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True))), + rpn_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + 
conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + roi_head=dict( + type='H3DRoIHead', + primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], + bbox_head=dict( + type='H3DBboxHead', + gt_per_seed=3, + num_proposal=256, + suface_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 6, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 6, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + line_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 12, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 12, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + primitive_refine_channels=[128, 128, 128], + upper_thresh=100.0, + surface_thresh=0.5, + line_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + cues_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + cues_semantic_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + proposal_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='none', + loss_weight=5.0), + primitive_center_loss=dict( + type='MSELoss', reduction='none', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + rpn_proposal=dict(use_nms=False), + rcnn=dict( + pos_distance_thr=0.3, + neg_distance_thr=0.6, + sample_mod='vote', + far_threshold=0.6, + near_threshold=0.3, + mask_surface_threshold=0.3, + label_surface_threshold=0.3, + mask_line_threshold=0.3, + label_line_threshold=0.3)), + test_cfg=dict( + rpn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + 
per_class_proposal=True, + use_nms=False), + rcnn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True))) diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_lyft.py new file mode 100644 index 0000000..87c7fe0 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-80, -80, -5, 80, 80, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), + pts_middle_encoder=dict(output_shape=[640, 640]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_nus.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_nus.py new file mode 100644 index 0000000..e153f6c --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_nus.py @@ -0,0 +1,96 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
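The warning in the comment above is concrete in this file: the BEV grid that `PointPillarsScatter` fills is simply the point-cloud range divided by the voxel size, so the three settings have to move together. A minimal sketch of that bookkeeping, using the values defined just below (illustrative Python, not part of the config file):

# Sketch: how output_shape follows from point_cloud_range and voxel_size.
point_cloud_range = [-50, -50, -5, 50, 50, 3]  # [x_min, y_min, z_min, x_max, y_max, z_max]
voxel_size = [0.25, 0.25, 8]                   # [dx, dy, dz] in meters

grid_x = int(round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0]))  # 400
grid_y = int(round((point_cloud_range[4] - point_cloud_range[1]) / voxel_size[1]))  # 400

# Matches pts_middle_encoder output_shape=[400, 400] below; changing the range or the
# voxel size without updating output_shape would silently misalign the BEV feature map.
assert (grid_x, grid_y) == (400, 400)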
+voxel_size = [0.25, 0.25, 8] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=64, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + voxel_size=voxel_size, + max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=4, + feat_channels=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='FPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + act_cfg=dict(type='ReLU'), + in_channels=[64, 128, 256], + out_channels=256, + start_level=0, + num_outs=3), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=10, + in_channels=256, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-50, -50, -1.8, 50, 50, -1.8]], + scales=[1, 2, 4], + sizes=[ + [0.8660, 2.5981, 1.], # 1.5/sqrt(3) + [0.5774, 1.7321, 1.], # 1/sqrt(3) + [1., 1., 1.], + [0.4, 0.4, 1], + ], + custom_values=[0, 0], + rotations=[0, 1.57], + reshape_out=True), + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=dict( + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_num=500))) diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py new file mode 100644 index 0000000..9cd200f --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
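This file, like `hv_pointpillars_fpn_lyft.py` above, only lists the keys it overrides; everything else is pulled in from the file named in `_base_` and merged recursively at load time. A hedged sketch of inspecting that merge with the upstream mmcv-style Config API (in this repo the merged in-tree packages may expose the same helper under a different import path, so adjust the import if needed):

# Sketch: load a child config and check inherited vs. overridden keys.
from mmcv import Config  # assumption: the upstream mmcv-style Config is importable

cfg = Config.fromfile(
    'adzoo/bevformer/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py')

print(cfg.model.type)                               # 'MVXFasterRCNN' (inherited from the nuScenes base)
print(cfg.model.pts_voxel_layer.point_cloud_range)  # [-100, -100, -5, 100, 100, 3] (overridden here)
print(cfg.model.pts_bbox_head.num_classes)          # 9 Lyft classes instead of 10 nuScenes classes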
+model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-100, -100, -5, 100, 100, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), + pts_middle_encoder=dict(output_shape=[800, 800]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_kitti.py new file mode 100644 index 0000000..85076d0 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_kitti.py @@ -0,0 +1,93 @@ +voxel_size = [0.16, 0.16, 4] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=32, # max_points_per_voxel + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=4, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), + middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -1.78, 70.4, 39.68, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_waymo.py new file mode 100644 index 0000000..14873ea --- /dev/null 
+++ b/adzoo/bevformer/configs/_base_/models/hv_pointpillars_secfpn_waymo.py @@ -0,0 +1,108 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.32, 0.32, 6] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + voxel_size=voxel_size, + max_voxels=(32000, 32000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], + [-74.88, -74.88, 0, 74.88, 74.88, 0]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 1.81, 1.77], # cyclist + [0.84, 0.91, 1.74] # pedestrian + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500))) diff --git a/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_kitti.py b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_kitti.py new file mode 100644 index 0000000..6bf18ab --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_kitti.py @@ -0,0 +1,89 @@ +voxel_size = [0.05, 0.05, 0.1] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, + point_cloud_range=[0, -40, 
-3, 70.4, 40, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000)), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_waymo.py b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_waymo.py new file mode 100644 index 0000000..eb9bd3a --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/hv_second_secfpn_waymo.py @@ -0,0 +1,100 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
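The same consistency rule holds for the sparse-voxel configs in this group: `SparseEncoder`'s `sparse_shape` is derived from the point-cloud range and voxel size, with one extra bin along z in these configs. A quick check against the Waymo settings that follow (illustrative Python, not part of the config file):

# Sketch: sparse_shape = [z_bins + 1, y_bins, x_bins] for the Waymo SECOND config below.
point_cloud_range = [-76.8, -51.2, -2, 76.8, 51.2, 4]
voxel_size = [0.08, 0.08, 0.1]

z_bins = int(round((point_cloud_range[5] - point_cloud_range[2]) / voxel_size[2]))  # 60
y_bins = int(round((point_cloud_range[4] - point_cloud_range[1]) / voxel_size[1]))  # 1280
x_bins = int(round((point_cloud_range[3] - point_cloud_range[0]) / voxel_size[0]))  # 1920

# Matches middle_encoder sparse_shape=[61, 1280, 1920]; the KITTI config above
# follows the same pattern with sparse_shape=[41, 1600, 1408].
assert [z_bins + 1, y_bins, x_bins] == [61, 1280, 1920]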
+voxel_size = [0.08, 0.08, 0.1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=10, + point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], + voxel_size=voxel_size, + max_voxels=(80000, 90000)), + voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[61, 1280, 1920], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=384, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], + [-76.8, -51.2, 0, 76.8, 51.2, 0], + [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 0.91, 1.74], # pedestrian + [0.84, 1.81, 1.77] # cyclist + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1) + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500)) diff --git a/adzoo/bevformer/configs/_base_/models/imvotenet_image.py b/adzoo/bevformer/configs/_base_/models/imvotenet_image.py new file mode 100644 index 0000000..981f8bc --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/imvotenet_image.py @@ -0,0 +1,108 @@ +model = dict( + type='ImVoteNet', + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + img_rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', 
loss_weight=1.0)), + img_roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=10, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + img_rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + img_rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + img_rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/adzoo/bevformer/configs/_base_/models/mask_rcnn_r50_fpn.py b/adzoo/bevformer/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000..c5d5e32 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,124 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + 
mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/adzoo/bevformer/configs/_base_/models/paconv_cuda_ssg.py b/adzoo/bevformer/configs/_base_/models/paconv_cuda_ssg.py new file mode 100644 index 0000000..f513bd4 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/paconv_cuda_ssg.py @@ -0,0 +1,7 @@ +_base_ = './paconv_ssg.py' + +model = dict( + backbone=dict( + sa_cfg=dict( + type='PAConvCUDASAModule', + scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) diff --git a/adzoo/bevformer/configs/_base_/models/paconv_ssg.py b/adzoo/bevformer/configs/_base_/models/paconv_ssg.py new file mode 100644 index 0000000..1d4f1ed --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/paconv_ssg.py @@ -0,0 +1,49 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=9, # [xyz, rgb, normalized_xyz] + num_points=(1024, 256, 64, 16), + radius=(None, None, None, None), # use kNN instead of ball query + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d', momentum=0.1), + sa_cfg=dict( + type='PAConvSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False, + paconv_num_kernels=[16, 16, 16], + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False))), + decode_head=dict( + type='PAConvHead', + # PAConv model's decoder takes skip connections from beckbone + # different from PointNet++, it also concats input features in the last + # level of decoder, leading to `128 + 6` as the channel number + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128 + 6, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # correlation loss to regularize PAConv's 
kernel weights + loss_regularization=dict( + type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/adzoo/bevformer/configs/_base_/models/parta2.py b/adzoo/bevformer/configs/_base_/models/parta2.py new file mode 100644 index 0000000..6c5ae9a --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/parta2.py @@ -0,0 +1,201 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] + +model = dict( + type='PartA2', + voxel_layer=dict( + max_num_points=5, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseUNet', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + rpn_head=dict( + type='PartA2RPNHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78]], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + roi_head=dict( + type='PartAggregationROIHead', + num_classes=3, + semantic_head=dict( + type='PointwiseSemanticHead', + in_channels=16, + extra_width=0.2, + seg_score_thr=0.3, + num_classes=3, + loss_seg=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_part=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + seg_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='max')), + part_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='avg')), + bbox_head=dict( + type='PartA2BboxHead', + num_classes=3, + seg_in_channels=16, + part_in_channels=4, + seg_conv_channels=[64, 64], + part_conv_channels=[64, 64], + merge_conv_channels=[128, 128], + down_conv_channels=[128, 256], + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + shared_fc_channels=[256, 512, 512, 512], + cls_channels=[256, 256], + reg_channels=[256, 256], + dropout_ratio=0.1, + roi_feat_size=14, + with_corner_loss=True, + loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + 
pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) + ], + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=9000, + nms_post=512, + max_num=512, + nms_thr=0.8, + score_thr=0, + use_rotate_nms=False), + rcnn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1) + ], + sampler=dict( + type='IoUNegPiecewiseSampler', + num=128, + pos_fraction=0.55, + neg_piece_fractions=[0.8, 0.2], + neg_iou_piece_thrs=[0.55, 0.1], + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=True), + cls_pos_thr=0.75, + cls_neg_thr=0.25)), + test_cfg=dict( + rpn=dict( + nms_pre=1024, + nms_post=100, + max_num=100, + nms_thr=0.7, + score_thr=0, + use_rotate_nms=True), + rcnn=dict( + use_rotate_nms=True, + use_raw_score=True, + nms_thr=0.01, + score_thr=0.1))) diff --git a/adzoo/bevformer/configs/_base_/models/pointnet2_msg.py b/adzoo/bevformer/configs/_base_/models/pointnet2_msg.py new file mode 100644 index 0000000..222ab88 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/pointnet2_msg.py @@ -0,0 +1,28 @@ +_base_ = './pointnet2_ssg.py' + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='PointNet2SAMSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), + num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), + sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, + 128)), + ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), + (256, 384, 512))), + aggregation_channels=(None, None, None, None), + fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), + fps_sample_range_lists=((-1), (-1), (-1), (-1)), + dilated_group=(False, False, False, False), + out_indices=(0, 1, 2, 3), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), + (128, 128, 128, 128)))) diff --git a/adzoo/bevformer/configs/_base_/models/pointnet2_ssg.py b/adzoo/bevformer/configs/_base_/models/pointnet2_ssg.py new file mode 100644 index 0000000..58b4c24 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/pointnet2_ssg.py @@ -0,0 +1,35 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radius=(0.1, 0.2, 0.4, 0.8), + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + 
norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + type='PointNet2Head', + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/adzoo/bevformer/configs/_base_/models/votenet.py b/adzoo/bevformer/configs/_base_/models/votenet.py new file mode 100644 index 0000000..129339d --- /dev/null +++ b/adzoo/bevformer/configs/_base_/models/votenet.py @@ -0,0 +1,73 @@ +model = dict( + type='VoteNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + test_cfg=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True)) diff --git a/adzoo/bevformer/configs/_base_/schedules/cosine.py b/adzoo/bevformer/configs/_base_/schedules/cosine.py new file mode 100644 index 0000000..69cb7df --- /dev/null +++ b/adzoo/bevformer/configs/_base_/schedules/cosine.py @@ -0,0 +1,20 @@ +# This schedule is mainly used by models with dynamic voxelization +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is change during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) + +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + min_lr_ratio=1e-5) + 
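Read numerically, the schedule above is fully determined by three ratios; under the upstream mmcv LR-hook semantics, `warmup_ratio` scales the starting learning rate and `min_lr_ratio` scales the floor that the cosine decay approaches. A small illustrative calculation (not part of the config file):

# Sketch: endpoints implied by lr=0.003, warmup_ratio=1/10, min_lr_ratio=1e-5.
base_lr = 0.003
warmup_start_lr = base_lr * (1.0 / 10)  # 3e-4, ramped linearly up to 3e-3 over 1000 iters
cosine_floor_lr = base_lr * 1e-5        # 3e-8, the value the cosine decay anneals towards
print(warmup_start_lr, cosine_floor_lr)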
+momentum_config = None
+
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/adzoo/bevformer/configs/_base_/schedules/cyclic_20e.py b/adzoo/bevformer/configs/_base_/schedules/cyclic_20e.py
new file mode 100644
index 0000000..704740e
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/cyclic_20e.py
@@ -0,0 +1,24 @@
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 24 epochs by default, we set evaluation
+# interval to be 20. Please change the interval accordingly if you do not
+# use a default schedule.
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='cyclic',
+    target_ratio=(10, 1e-4),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+momentum_config = dict(
+    policy='cyclic',
+    target_ratio=(0.85 / 0.95, 1),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/adzoo/bevformer/configs/_base_/schedules/cyclic_40e.py b/adzoo/bevformer/configs/_base_/schedules/cyclic_40e.py
new file mode 100644
index 0000000..4a711ac
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/cyclic_40e.py
@@ -0,0 +1,31 @@
+# The schedule is usually used by models trained on KITTI dataset
+
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
+# the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4
+lr = 0.0018
+# The optimizer follows the setting in SECOND.Pytorch, but here we use
+# the official AdamW optimizer implemented by PyTorch.
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
+# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa
+# We implement them in mmcv, for more details, please refer to
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa
+lr_config = dict(
+    policy='cyclic',
+    target_ratio=(10, 1e-4),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+momentum_config = dict(
+    policy='cyclic',
+    target_ratio=(0.85 / 0.95, 1),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+# Although the max_epochs is 40, this schedule is usually used with
+# RepeatDataset with repeat ratio N, thus the actual max epoch
+# number could be Nx40
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/adzoo/bevformer/configs/_base_/schedules/mmdet_schedule_1x.py b/adzoo/bevformer/configs/_base_/schedules/mmdet_schedule_1x.py
new file mode 100644
index 0000000..13b3783
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/mmdet_schedule_1x.py
@@ -0,0 +1,11 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/adzoo/bevformer/configs/_base_/schedules/schedule_2x.py b/adzoo/bevformer/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000..afde799
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,14 @@
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[20, 23])
+momentum_config = None
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/adzoo/bevformer/configs/_base_/schedules/schedule_3x.py b/adzoo/bevformer/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000..115cd26
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used by models on indoor datasets,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008 # max learning rate
+optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+lr_config = dict(policy='step', warmup=None, step=[24, 32])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/adzoo/bevformer/configs/_base_/schedules/seg_cosine_150e.py b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_150e.py
new file mode 100644
index 0000000..04b44e5
--- /dev/null
+++ b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_150e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001,
momentum=0.9) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=150) diff --git a/adzoo/bevformer/configs/_base_/schedules/seg_cosine_200e.py b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_200e.py new file mode 100644 index 0000000..6a49484 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_200e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on ScanNet dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/adzoo/bevformer/configs/_base_/schedules/seg_cosine_50e.py b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_50e.py new file mode 100644 index 0000000..975a8f9 --- /dev/null +++ b/adzoo/bevformer/configs/_base_/schedules/seg_cosine_50e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) diff --git a/adzoo/bevformer/configs/bevformer/bevformer_base.py b/adzoo/bevformer/configs/bevformer/bevformer_base.py new file mode 100644 index 0000000..c67c978 --- /dev/null +++ b/adzoo/bevformer/configs/bevformer/bevformer_base.py @@ -0,0 +1,260 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. 
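The shorthand variables above pin down the BEV representation that the rest of this config builds on; a small illustrative calculation of what they imply (not part of the config file):

# Sketch: quantities implied by the BEVFormer-base settings above.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
_dim_, bev_h_, bev_w_, queue_length = 256, 200, 200, 4

bev_cell_m = (point_cloud_range[3] - point_cloud_range[0]) / bev_w_  # 0.512 m per BEV query cell
_pos_dim_ = _dim_ // 2   # 128-d row/column positional embeddings, concatenated back to 256
_ffn_dim_ = _dim_ * 2    # 512-d feed-forward width inside the transformer layers
print(bev_cell_m, _pos_dim_, _ffn_dim_, queue_length)  # queue_length frames feed each training sample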
+num_cams = 6 +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='BEVFormerPerceptionTransformer', + num_cams=num_cams, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + num_cams=num_cams, + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +anno_root = 'data/infos/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=anno_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' +log_config = dict( + interval=1, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py b/adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py new file mode 100644 index 0000000..f987e1b --- /dev/null +++ b/adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py @@ -0,0 +1,363 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 
51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + + +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + + #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + "walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + "walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + 
"static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + + + + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. + +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN2d', requires_grad=False), + norm_eval=True, + style='caffe', + dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True)), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=4, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=len(class_names), + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='BEVFormerPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=len(class_names)), + 
positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range)))) + +dataset_type = "B2D_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=6, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + sample_interval=5, + name_mapping=NameMapping, + eval_cfg=eval_cfg, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1,sample_interval=5, name_mapping=NameMapping,eval_cfg=eval_cfg,), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality,sample_interval=5, + name_mapping=NameMapping,eval_cfg=eval_cfg,), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +load_from = 'ckpts/r101_dcn_fcos3d_pretrain.pth' +log_config = dict( + interval=1, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/adzoo/bevformer/configs/bevformer/bevformer_tiny.py b/adzoo/bevformer/configs/bevformer/bevformer_tiny.py new file mode 100644 index 0000000..78858ee --- /dev/null +++ b/adzoo/bevformer/configs/bevformer/bevformer_tiny.py @@ -0,0 +1,270 @@ +# BEvFormer-tiny consumes at lease 6700M GPU memory +# compared to bevformer_base, bevformer_tiny has +# smaller backbone: R101-DCN -> R50 +# smaller BEV: 200*200 -> 50*50 +# less encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> 800*450 +# multi-scale feautres -> single scale features (C5) + + +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 50 +bev_w_ = 50 +queue_length = 3 # each sequence contains `queue_length` frames. 
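The "800*450" input mentioned in the header comment comes from the RandomScaleImageMultiViewImage step with scales=[0.5] applied to the native 1600 x 900 nuScenes images, after which PadMultiViewImage pads height and width up to a multiple of 32; the 50 x 50 BEV grid covers the same 102.4 m range at about 2.05 m per cell, four times coarser per axis than the base model. A quick sketch of the resulting per-camera input size (illustration only):

import math

orig_w, orig_h = 1600, 900
scale = 0.5                                       # RandomScaleImageMultiViewImage scales=[0.5]
w, h = int(orig_w * scale), int(orig_h * scale)   # 800 x 450
divisor = 32                                      # PadMultiViewImage size_divisor=32
w_pad = math.ceil(w / divisor) * divisor          # 800
h_pad = math.ceil(h / divisor) * divisor          # 480
print((w, h), "->", (w_pad, h_pad))               # image shape fed to the R50 backbone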
+ +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=1) diff --git a/adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py b/adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py new file mode 100644 index 0000000..d4f92f7 --- /dev/null +++ b/adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py @@ -0,0 +1,360 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + + +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + 
"/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + + #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + "walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + "walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + "static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + + + + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 100 +bev_w_ = 100 +queue_length = 3 # each sequence contains `queue_length` frames. 
+ +model = dict( + type='BEVFormer', + use_grid_mask=True, + video_test_mode=True, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1,2,3), + frozen_stages=4, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=len(class_names), + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='BEVFormerPerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=_dim_//32, + dropout=0.0), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=len(class_names)), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = "B2D_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=6, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + sample_interval=5, + name_mapping=NameMapping, + eval_cfg=eval_cfg, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1,sample_interval=5, name_mapping=NameMapping,eval_cfg=eval_cfg,), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality,sample_interval=5, + name_mapping=NameMapping,eval_cfg=eval_cfg,), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + by_epoch=False, + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 1 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +checkpoint_config = dict(interval=3000, by_epoch=False) diff --git a/adzoo/bevformer/configs/bevformer_fp16/bevformer_tiny_fp16.py b/adzoo/bevformer/configs/bevformer_fp16/bevformer_tiny_fp16.py new file mode 100644 index 0000000..aa1e043 --- /dev/null +++ b/adzoo/bevformer/configs/bevformer_fp16/bevformer_tiny_fp16.py @@ -0,0 +1,272 @@ +# BEvFormer-tiny consumes at lease 6700M GPU memory +# compared to bevformer_base, bevformer_tiny has +# smaller backbone: R101-DCN -> R50 +# smaller BEV: 200*200 -> 50*50 +# less encoder layers: 6 -> 3 +# smaller input size: 1600*900 -> 800*450 +# multi-scale feautres -> single scale features (C5) + + +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] + + + + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) + +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 50 +bev_w_ = 50 +queue_length = 3 # each sequence contains `queue_length` frames. 
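These configs anneal the learning rate with a cosine schedule after a short linear warmup (warmup_iters=500, warmup_ratio=1/3, min_lr_ratio=1e-3; the tiny_b2d schedule above additionally sets by_epoch=False so the policy is stepped per iteration). A rough sketch of how such a policy evolves the learning rate, following the usual mmcv-style formulation; the actual behaviour comes from the LR updater hook, so treat this as approximate:

import math

base_lr = 2e-4
warmup_iters, warmup_ratio = 500, 1.0 / 3
min_lr = base_lr * 1e-3            # min_lr_ratio = 1e-3
total_iters = 10000                # hypothetical total number of training iterations

def lr_at(it):
    # Linear warmup toward base_lr, then cosine annealing down to min_lr.
    if it < warmup_iters:
        return base_lr * (warmup_ratio + (1 - warmup_ratio) * it / warmup_iters)
    progress = (it - warmup_iters) / max(total_iters - warmup_iters, 1)
    return min_lr + 0.5 * (base_lr - min_lr) * (1 + math.cos(math.pi * progress))

for it in (0, 250, 500, 5000, 10000):
    print(it, f"{lr_at(it):.2e}")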
+ +model = dict( + type='BEVFormer_fp16', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformer', + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range)))) + +dataset_type = 'CustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.5]), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=8, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2.8e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) +total_epochs = 24 +evaluation = dict(interval=1, pipeline=test_pipeline) + +runner = dict(type='EpochBasedRunner_video', max_epochs=total_epochs) + +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) + +fp16 = dict(loss_scale=512.) 
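fp16 = dict(loss_scale=512.) enables mixed-precision training with a static loss scale: the loss is multiplied by 512 before backward so that small gradients survive in fp16, and the gradients are divided by the same factor before the optimizer step (in practice this is handled by the runner's fp16 optimizer hook). A toy sketch of the scale/unscale arithmetic only, not of the hook itself:

import torch

model = torch.nn.Linear(8, 1)
opt = torch.optim.SGD(model.parameters(), lr=1e-2)
loss_scale = 512.0

x = torch.randn(4, 8)
loss = model(x).mean()
(loss * loss_scale).backward()       # scale up so fp16 gradients do not underflow
for p in model.parameters():
    if p.grad is not None:
        p.grad.div_(loss_scale)      # unscale before the optimizer step
opt.step()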
+checkpoint_config = dict(interval=1) +custom_hooks = [dict(type='TransferWeight',priority='LOWEST')] \ No newline at end of file diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-24ep.py new file mode 100644 index 0000000..594f34b --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-24ep.py @@ -0,0 +1,360 @@ +# mAP: 0.3805 +# mATE: 0.7198 +# mASE: 0.2805 +# mAOE: 0.4131 +# mAVE: 0.7652 +# mAAE: 0.1951 +# NDS: 0.4529 +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (0,) +group_detr = 11 +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2) + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": True, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='GlobalRotScaleTransImage', + rot_range=[-22.5, 22.5], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + reverse_angle=True, + training=True, + flip_dx_ratio=0.5, + flip_dy_ratio=0.5, + only_gt=True,), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + 
frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead_GroupDETR', + group_detr=group_detr, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='GroupMultiheadAttention', + group=group_detr, + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + 
canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + [3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[20, ]) +total_epochs = 24 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-48ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-48ep.py new file mode 100644 index 0000000..a720051 --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-48ep.py @@ -0,0 +1,360 @@ +# mAP: 0.3953 +# mATE: 0.6941 +# mASE: 0.2765 +# mAOE: 0.4199 +# mAVE: 0.7537 +# mAAE: 0.1866 +# NDS: 0.4646 +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (0,) +group_detr = 11 +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2) + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": True, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='GlobalRotScaleTransImage', + rot_range=[-22.5, 22.5], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + reverse_angle=True, + training=True, + flip_dx_ratio=0.5, + flip_dy_ratio=0.5, + only_gt=True,), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + 
frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead_GroupDETR', + group_detr=group_detr, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='GroupMultiheadAttention', + group=group_detr, + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + 
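+            # canon_box_sizes below gives one canonical box-size prior per class,
+            # in the same order as class_names above; the three values per row
+            # appear to be (width, length, height) in meters.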
canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + [3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[44, ]) +total_epochs = 48 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py new file mode 100644 index 0000000..10330cf --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-24ep.py @@ -0,0 +1,349 @@ +# mAP: 0.3512 +# mATE: 0.7534 +# mASE: 0.2863 +# mAOE: 0.4665 +# mAVE: 0.8070 +# mAAE: 0.1861 +# NDS: 0.4257 + +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (0,) +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
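+
+# img_norm_cfg above is Caffe-style preprocessing: per-channel BGR mean
+# subtraction with std = 1 and to_rgb=False, consistent with the style='caffe'
+# ResNet backbone and the FCOS COCO checkpoint (load_from) further below.
+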
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + 
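+        # in_channels match ResNet-50 stages C3-C5 (out_indices=(1, 2, 3) above
+        # yield 512/1024/2048 channels); num_outs below is _num_mono_levels_ = 5,
+        # so the monocular head sees five feature maps while the BEV branch uses
+        # the first _num_levels_ = 4 of them.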
in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + 
[3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[20, ]) +total_epochs = 24 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py new file mode 100644 index 0000000..9c6d3cc --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t1-base-48ep.py @@ -0,0 +1,349 @@ +# mAP: 0.3594 +# mATE: 0.7327 +# mASE: 0.2814 +# mAOE: 0.4074 +# mAVE: 0.7831 +# mAAE: 0.1983 +# NDS: 0.4394 + +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (0,) +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
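+
+# This 48-epoch "base" config appears to differ from the 24-epoch one above only
+# in the header metrics and the schedule at the bottom of the file
+# (step=[44, ], total_epochs = 48).
+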
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + 
in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + 
[3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[44, ]) +total_epochs = 48 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-24ep.py new file mode 100644 index 0000000..05bf708 --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-24ep.py @@ -0,0 +1,360 @@ +# mAP: 0.4199 +# mATE: 0.6689 +# mASE: 0.2814 +# mAOE: 0.3915 +# mAVE: 0.3834 +# mAAE: 0.1928 +# NDS: 0.5182 +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (-1, 0,) +group_detr = 11 +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2) + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": True, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
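+
+# This "t2" variant sets frames = (-1, 0), so BEV features from the previous frame
+# are fused with the current one, and group_detr = 11 turns on Group-DETR-style
+# training for the detection head (several parallel query groups during training;
+# the same value is passed to GroupMultiheadAttention in the decoder below).
+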
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='GlobalRotScaleTransImage', + rot_range=[-22.5, 22.5], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + reverse_angle=True, + training=True, + flip_dx_ratio=0.5, + flip_dy_ratio=0.5, + only_gt=True,), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + 
frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead_GroupDETR', + group_detr=group_detr, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='GroupMultiheadAttention', + group=group_detr, + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + 
canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + [3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[20, ]) +total_epochs = 24 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-48ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-48ep.py new file mode 100644 index 0000000..2c1dab2 --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t2-48ep.py @@ -0,0 +1,360 @@ +# mAP: 0.4313 +# mATE: 0.6557 +# mASE: 0.2775 +# mAOE: 0.3851 +# mAVE: 0.3861 +# mAAE: 0.1882 +# NDS: 0.5264 +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (-1, 0,) +group_detr = 11 +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2) + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": True, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
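+
+# ida_aug_conf above drives the image-space augmentation: training randomly picks
+# a resize target from 512-768 (roughly 0.8x-1.2x of the 640 used at eval), keeps
+# only the image region below row 260 of the 1600 x 900 frame, and may flip
+# horizontally; ida_aug_conf_eval pins the target to 640 with no flip. Note the
+# key spelling "reisze": it must match whatever name CropResizeFlipImage reads,
+# so renaming it only in this config would break the augmentation.
+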
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='GlobalRotScaleTransImage', + rot_range=[-22.5, 22.5], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + reverse_angle=True, + training=True, + flip_dx_ratio=0.5, + flip_dy_ratio=0.5, + only_gt=True,), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + 
frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead_GroupDETR', + group_detr=group_detr, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='GroupMultiheadAttention', + group=group_detr, + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + 
canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + [3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[20, ]) +total_epochs = 24 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t8-24ep.py b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t8-24ep.py new file mode 100644 index 0000000..76cca1e --- /dev/null +++ b/adzoo/bevformer/configs/bevformerv2/bevformerv2-r50-t8-24ep.py @@ -0,0 +1,361 @@ +# mAP: 0.4600 +# mATE: 0.6185 +# mASE: 0.2815 +# mAOE: 0.3660 +# mAVE: 0.3157 +# mAAE: 0.1902 +# NDS: 0.5528 +_base_ = [ + '../_base_/default_runtime.py' +] +# Dataset +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +# For nuScenes we usually do 10-class detection +class_names = [ + 'barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', + 'pedestrian', 'traffic_cone', 'trailer', 'truck' +] +dataset_type = 'CustomNuScenesDatasetV2' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict(mean=[103.53, 116.28, 123.675], std=[1, 1, 1], to_rgb=False) +bev_h_ = 200 +bev_w_ = 200 +frames = (-7,-6,-5,-4,-3,-2,-1,0) +group_detr = 11 +voxel_size = [102.4 / bev_h_, 102.4 / bev_w_, 8] +ida_aug_conf = { + "reisze": [512, 544, 576, 608, 640, 672, 704, 736, 768], # (0.8, 1.2) + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": True, +} +ida_aug_conf_eval = { + "reisze": [640, ], + "crop": (0, 260, 1600, 900), + "H": 900, + "W": 1600, + "rand_flip": False, +} +# file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
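+
+# "t8" uses an eight-frame temporal window, frames = (-7, ..., 0): BEV features
+# from the seven previous frames are fused with the current one, and this config
+# additionally sets inter_channels=_dim_*2 on the transformer below. The longer
+# history comes with correspondingly higher memory and data-loading cost.
+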
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='GlobalRotScaleTransImage', + rot_range=[-22.5, 22.5], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + reverse_angle=True, + training=True, + flip_dx_ratio=0.5, + flip_dy_ratio=0.5, + only_gt=True,), + dict( + type='ObjectRangeFilter', + point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=class_names), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf, training=True, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='CustomCollect3D', + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', + 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', 'lidar2ego_rotation', + 'timestamp', 'mono_input_dict', 'mono_ann_idx', 'aug_param']), + dict(type='DD3DMapper', + is_train=True, + tasks=dict(box2d_on=True, box3d_on=True),) +] +eval_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True, ), + dict(type='CropResizeFlipImage', data_aug_conf=ida_aug_conf_eval, training=False, debug=False), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 640), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', + keys=['img', 'ego2global_translation', 'ego2global_rotation', 'lidar2ego_translation', + 'lidar2ego_rotation', 'timestamp']) + ]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + persistent_workers=True, + train=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + box_type_3d='LiDAR', + mono_cfg=dict( + name='nusc_trainval', + data_root='data/nuscenes/', + min_num_lidar_points=3, + min_box_visibility=0.2)), + val=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality, + samples_per_gpu=1), + test=dict( + type='CustomNuScenesDatasetV2', + frames=frames, + data_root='data/nuscenes/', + ann_file=data_root + 'nuscenes_infos_temporal_val.pkl', + pipeline=eval_pipeline, + classes=class_names, + modality=input_modality), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler')) +evaluation = dict(interval=4, pipeline=eval_pipeline) + +# model +load_from = './ckpts/fcos_r50_coco_2mmdet.pth' +plugin = True +plugin_dir = 'projects/mmdet3d_plugin/' +_dim_ = 256 +_pos_dim_ = 128 +_ffn_dim_ = 512 +_num_levels_ = 4 +_num_mono_levels_ = 5 + +model = dict( + type='BEVFormerV2', + use_grid_mask=True, + video_test_mode=False, + num_levels=_num_levels_, + num_mono_levels=_num_mono_levels_, + mono_loss_weight=1.0, + 
frames=frames, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN'), + norm_eval=False, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_mono_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='BEVFormerHead_GroupDETR', + group_detr=group_detr, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + transformer=dict( + type='PerceptionTransformerV2', + embed_dims=_dim_, + frames=frames, + inter_channels=_dim_*2, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=4), + embed_dims=_dim_) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='GroupMultiheadAttention', + group=group_detr, + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='NMSFreeCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='SmoothL1Loss', loss_weight=0.75, beta=1.0), + loss_iou=dict(type='GIoULoss', loss_weight=0.0)), + fcos3d_bbox_head=dict( + type='NuscenesDD3D', + num_classes=10, + in_channels=_dim_, + strides=[8, 16, 32, 64, 128], + box3d_on=True, + feature_locations_offset='none', + fcos2d_cfg=dict( + num_cls_convs=4, + num_box_convs=4, + norm='SyncBN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0), + fcos2d_loss_cfg=dict( + focal_loss_alpha=0.25, focal_loss_gamma=2.0, loc_loss_type='giou'), + fcos3d_cfg=dict( + num_convs=4, + norm='SyncBN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=[44.921, 20.252, 11.712, 7.166, 8.548], + std_depth_per_level=[24.331, 9.833, 6.223, 4.611, 8.275]), + fcos3d_loss_cfg=dict( + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, 
+ canon_box_sizes=[[2.3524184, 0.5062202, 1.0413622], + [0.61416006, 1.7016163, 1.3054738], + [2.9139307, 10.725025, 3.2832346], + [1.9751819, 4.641267, 1.74352], + [2.772134, 6.565072, 3.2474296], + [0.7800532, 2.138673, 1.4437162], + [0.6667362, 0.7181772, 1.7616143], + [0.40246472, 0.4027083, 1.0084083], + [3.0059454, 12.8197, 4.1213827], + [2.4986045, 6.9310856, 2.8382742]]), + target_assign_cfg=dict( + center_sample=True, + pos_radius=1.5, + sizes_of_interest=((-1, 64), (64, 128), (128, 256), (256, 512), + (512, 100000000.0))), + nusc_loss_weight=dict(attr_loss_weight=0.2, speed_loss_weight=0.2)), + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='SmoothL1Cost', weight=0.75), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=point_cloud_range)))) + +# optimizer +optimizer = dict( + type='AdamW', + lr=4e-4, + paramwise_cfg=dict( + custom_keys=dict( + img_backbone=dict(lr_mult=0.5), + )), + weight_decay=0.01) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=2000, + warmup_ratio=1.0 / 3, + step=[20, ]) +total_epochs = 24 +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) diff --git a/adzoo/bevformer/configs/datasets/custom_lyft-3d.py b/adzoo/bevformer/configs/datasets/custom_lyft-3d.py new file mode 100644 index 0000000..5a95d89 --- /dev/null +++ b/adzoo/bevformer/configs/datasets/custom_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'CustomLyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=True) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
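+
+# Unlike the camera-only BEVFormer configs above, this Lyft dataset config is
+# LiDAR-based (use_lidar=True, use_camera=False) with 9 detection classes and a
+# +/-80 m range; the pipelines below load 5-dim points and aggregate 10 past
+# sweeps per sample.
+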
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
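The `data` dict above wires the Lyft info files into train/val/test splits with their respective pipelines. A minimal sketch of how such a dataset config is typically consumed, assuming the `lyft_infos_*.pkl` files already exist and that the bundled mmcv exposes `Config` and `build_dataset` the same way the converters in this repo do:

```python
from mmcv import Config
from mmcv.datasets import build_dataset  # same entry point used by create_gt_database.py below

cfg = Config.fromfile('adzoo/bevformer/configs/datasets/custom_lyft-3d.py')
train_set = build_dataset(cfg.data.train)
print(len(train_set), train_set.CLASSES)

sample = train_set[0]  # runs train_pipeline: load points + sweeps, augment, filter, format
print(sample.keys())   # typically points, gt_bboxes_3d, gt_labels_3d wrapped in DataContainers
```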
+evaluation = dict(interval=24, pipeline=eval_pipeline) \ No newline at end of file diff --git a/adzoo/bevformer/configs/datasets/custom_nus-3d.py b/adzoo/bevformer/configs/datasets/custom_nus-3d.py new file mode 100644 index 0000000..af81f9b --- /dev/null +++ b/adzoo/bevformer/configs/datasets/custom_nus-3d.py @@ -0,0 +1,141 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset_eval_modified' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/bevformer/configs/datasets/custom_waymo-3d.py b/adzoo/bevformer/configs/datasets/custom_waymo-3d.py new file mode 100644 index 0000000..4100e13 --- /dev/null +++ b/adzoo/bevformer/configs/datasets/custom_waymo-3d.py @@ -0,0 +1,112 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'CustomWaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=False, use_camera=True) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1920, 1280), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + + +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=test_pipeline) \ No newline at end of file diff --git a/adzoo/bevformer/create_data.py b/adzoo/bevformer/create_data.py new file mode 100755 index 0000000..f2b0cc1 --- /dev/null +++ b/adzoo/bevformer/create_data.py @@ -0,0 +1,305 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
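In the Waymo `data` dict above, `load_interval=5` subsamples the training split and `RepeatDataset(times=2)` then stretches one "epoch" over two passes of that subset. A back-of-the-envelope sketch; the total frame count and GPU count are assumptions for illustration only:

```python
# Illustrative only: the exact number depends on your waymo_infos_train.pkl.
total_train_frames = 158_081             # assumed size of the full Waymo training split
kept = total_train_frames // 5           # load_interval=5 -> keep every 5th frame
epoch_len = kept * 2                     # RepeatDataset(times=2)
iters_per_epoch = epoch_len // (2 * 8)   # samples_per_gpu=2, assuming 8 GPUs
print(kept, epoch_len, iters_per_epoch)  # 31616 63232 3952
```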
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +from data_converter.create_gt_database import create_groundtruth_database +from data_converter import nuscenes_converter as nuscenes_converter +from data_converter import lyft_converter as lyft_converter +from data_converter import kitti_converter as kitti +from data_converter import indoor_converter as indoor +import argparse +from os import path as osp +import sys +sys.path.append('.') + + +def kitti_data_prep(root_path, info_prefix, version, out_dir): + """Prepare data related to Kitti dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + out_dir (str): Output directory of the groundtruth database info. + """ + kitti.create_kitti_info_file(root_path, info_prefix) + kitti.create_reduced_point_cloud(root_path, info_prefix) + + info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') + info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') + info_trainval_path = osp.join(root_path, + f'{info_prefix}_infos_trainval.pkl') + info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') + kitti.export_2d_annotation(root_path, info_train_path) + kitti.export_2d_annotation(root_path, info_val_path) + kitti.export_2d_annotation(root_path, info_trainval_path) + kitti.export_2d_annotation(root_path, info_test_path) + + create_groundtruth_database( + 'KittiDataset', + root_path, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + mask_anno_path='instances_train.json', + with_mask=(version == 'mask')) + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. Default: 10 + """ + nuscenes_converter.create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + if version == 'v1.0-test': + info_test_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_test.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_test_path, version=version) + else: + info_train_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_train.pkl') + info_val_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_val.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_train_path, version=version) + nuscenes_converter.export_2d_annotation( + root_path, info_val_path, version=version) + # create_groundtruth_database(dataset_name, root_path, info_prefix, + # f'{out_dir}/{info_prefix}_infos_train.pkl') + + +def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10): + """Prepare data related to Lyft dataset. + + Related data consists of '.pkl' files recording basic infos. + Although the ground truth database and 2D annotations are not used in + Lyft, it can also be generated like nuScenes. 
+ + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + max_sweeps (int, optional): Number of input consecutive frames. + Defaults to 10. + """ + lyft_converter.create_lyft_infos( + root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def s3dis_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for s3dis dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for sunrgbd dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def waymo_data_prep(root_path, + info_prefix, + version, + out_dir, + workers, + max_sweeps=5): + """Prepare the info file for waymo dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + max_sweeps (int): Number of input consecutive frames. Default: 5 \ + Here we store pose information of these frames for later use. 
+ """ + from tools.data_converter import waymo_converter as waymo + + splits = ['training', 'validation', 'testing'] + + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'kitti_format', 'training') + else: + save_dir = osp.join(out_dir, 'kitti_format', split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split == 'test')) + converter.convert() + # Generate waymo infos + out_dir = osp.join(out_dir, 'kitti_format') + kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps) + + create_groundtruth_database( + 'WaymoDataset', + out_dir, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + with_mask=False) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required='False', + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'kitti': + kitti_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir) + elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'lyft': + train_version = f'{args.version}-train' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=train_version, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=test_version, + max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) + elif args.dataset == 'scannet': + scannet_data_prep( + 
root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 's3dis': + s3dis_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 'sunrgbd': + sunrgbd_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) diff --git a/adzoo/bevformer/data_converter/__init__.py b/adzoo/bevformer/data_converter/__init__.py new file mode 100755 index 0000000..ef101fe --- /dev/null +++ b/adzoo/bevformer/data_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/adzoo/bevformer/data_converter/create_gt_database.py b/adzoo/bevformer/data_converter/create_gt_database.py new file mode 100755 index 0000000..6be53ec --- /dev/null +++ b/adzoo/bevformer/data_converter/create_gt_database.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import pickle +from mmcv import track_iter_progress +from mmcv.ops import roi_align +from os import path as osp +from pycocotools import mask as maskUtils +from pycocotools.coco import COCO + +from mmcv.core.bbox import box_np_ops as box_np_ops +from mmcv.datasets import build_dataset +from mmcv.core.evaluation.bbox_overlaps import bbox_overlaps + + +def _poly2mask(mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + +def _parse_coco_ann_info(ann_info): + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_masks_ann.append(ann['segmentation']) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann) + + return ann + + +def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks): + import torch + from torch.nn.modules.utils import _pair + device = pos_proposals.device + num_pos = pos_proposals.size(0) + fake_inds = ( + torch.arange(num_pos, + device=device).to(dtype=pos_proposals.dtype)[:, None]) + rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5 + mask_size = _pair(28) + rois = rois.to(device=device) + gt_masks_th = ( + torch.from_numpy(gt_masks).to(device).index_select( + 0, pos_assigned_gt_inds).to(dtype=rois.dtype)) + # Use RoIAlign could apparently accelerate the training (~0.1s/iter) + targets = ( + roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1)) + return targets + + +def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, 
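The `create_data.py` dispatcher above maps one positional `dataset` argument plus a handful of flags onto the `*_data_prep` helpers. For the standard nuScenes trainval case, the call it issues is equivalent to the sketch below; the paths are placeholders for a conventional local layout, not values taken from the repo:

```python
# Roughly what `python adzoo/bevformer/create_data.py nuscenes --root-path ./data/nuscenes \
#   --canbus ./data --out-dir ./data/nuscenes --extra-tag nuscenes --version v1.0`
# executes for the trainval half of the 'nuscenes' branch (paths are placeholders).
nuscenes_data_prep(
    root_path='./data/nuscenes',
    can_bus_root_path='./data',
    info_prefix='nuscenes',
    version='v1.0-trainval',
    dataset_name='NuScenesDataset',
    out_dir='./data/nuscenes',
    max_sweeps=10)
```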
org_img): + num_pos = pos_proposals.shape[0] + masks = [] + img_patches = [] + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + bbox = pos_proposals[i, :].astype(np.int32) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1 + 1, 1) + h = np.maximum(y2 - y1 + 1, 1) + + mask_patch = gt_mask[y1:y1 + h, x1:x1 + w] + masked_img = gt_mask[..., None] * org_img + img_patch = masked_img[y1:y1 + h, x1:x1 + w] + + img_patches.append(img_patch) + masks.append(mask_patch) + return img_patches, masks + + +def create_groundtruth_database(dataset_class_name, + data_path, + info_prefix, + info_path=None, + mask_anno_path=None, + used_classes=None, + database_save_path=None, + db_info_save_path=None, + relative_path=True, + add_rgb=False, + lidar_only=False, + bev_only=False, + coors_range=None, + with_mask=False): + """Given the raw data, generate the ground truth database. + + Args: + dataset_class_name (str): Name of the input dataset. + data_path (str): Path of the data. + info_prefix (str): Prefix of the info file. + info_path (str): Path of the info file. + Default: None. + mask_anno_path (str): Path of the mask_anno. + Default: None. + used_classes (list[str]): Classes have been used. + Default: None. + database_save_path (str): Path to save database. + Default: None. + db_info_save_path (str): Path to save db_info. + Default: None. + relative_path (bool): Whether to use relative path. + Default: True. + with_mask (bool): Whether to use mask. + Default: False. + """ + print(f'Create GT Database of {dataset_class_name}') + dataset_cfg = dict( + type=dataset_class_name, data_root=data_path, ann_file=info_path) + if dataset_class_name == 'KittiDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=with_mask, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + elif dataset_class_name == 'NuScenesDataset': + dataset_cfg.update( + use_valid_flag=True, + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True) + ]) + + elif dataset_class_name == 'WaymoDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + dataset = build_dataset(dataset_cfg) + + if database_save_path is None: + database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') + if db_info_save_path is None: + db_info_save_path = osp.join(data_path, + f'{info_prefix}_dbinfos_train.pkl') + mmcv.mkdir_or_exist(database_save_path) + all_db_infos = dict() + if with_mask: + coco = COCO(osp.join(data_path, mask_anno_path)) + imgIds = coco.getImgIds() + file2id = dict() + for i in imgIds: + 
info = coco.loadImgs([i])[0] + file2id.update({info['file_name']: i}) + + group_counter = 0 + for j in track_iter_progress(list(range(len(dataset)))): + input_dict = dataset.get_data_info(j) + dataset.pre_pipeline(input_dict) + example = dataset.pipeline(input_dict) + annos = example['ann_info'] + image_idx = example['sample_idx'] + points = example['points'].tensor.numpy() + gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() + names = annos['gt_names'] + group_dict = dict() + if 'group_ids' in annos: + group_ids = annos['group_ids'] + else: + group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) + difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) + if 'difficulty' in annos: + difficulty = annos['difficulty'] + + num_obj = gt_boxes_3d.shape[0] + point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) + + if with_mask: + # prepare masks + gt_boxes = annos['gt_bboxes'] + img_path = osp.split(example['img_info']['filename'])[-1] + if img_path not in file2id.keys(): + print(f'skip image {img_path} for empty mask') + continue + img_id = file2id[img_path] + kins_annIds = coco.getAnnIds(imgIds=img_id) + kins_raw_info = coco.loadAnns(kins_annIds) + kins_ann_info = _parse_coco_ann_info(kins_raw_info) + h, w = annos['img_shape'][:2] + gt_masks = [ + _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] + ] + # get mask inds based on iou mapping + bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) + mask_inds = bbox_iou.argmax(axis=0) + valid_inds = (bbox_iou.max(axis=0) > 0.5) + + # mask the image + # use more precise crop when it is ready + # object_img_patches = np.ascontiguousarray( + # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) + # crop image patches using roi_align + # object_img_patches = crop_image_patch_v2( + # torch.Tensor(gt_boxes), + # torch.Tensor(mask_inds).long(), object_img_patches) + object_img_patches, object_masks = crop_image_patch( + gt_boxes, gt_masks, mask_inds, annos['img']) + + for i in range(num_obj): + filename = f'{image_idx}_{names[i]}_{i}.bin' + abs_filepath = osp.join(database_save_path, filename) + rel_filepath = osp.join(f'{info_prefix}_gt_database', filename) + + # save point clouds and image patches for each object + gt_points = points[point_indices[:, i]] + gt_points[:, :3] -= gt_boxes_3d[i, :3] + + if with_mask: + if object_masks[i].sum() == 0 or not valid_inds[i]: + # Skip object for empty or invalid mask + continue + img_patch_path = abs_filepath + '.png' + mask_patch_path = abs_filepath + '.mask.png' + mmcv.imwrite(object_img_patches[i], img_patch_path) + mmcv.imwrite(object_masks[i], mask_patch_path) + + with open(abs_filepath, 'w') as f: + gt_points.tofile(f) + + if (used_classes is None) or names[i] in used_classes: + db_info = { + 'name': names[i], + 'path': rel_filepath, + 'image_idx': image_idx, + 'gt_idx': i, + 'box3d_lidar': gt_boxes_3d[i], + 'num_points_in_gt': gt_points.shape[0], + 'difficulty': difficulty[i], + } + local_group_id = group_ids[i] + # if local_group_id >= 0: + if local_group_id not in group_dict: + group_dict[local_group_id] = group_counter + group_counter += 1 + db_info['group_id'] = group_dict[local_group_id] + if 'score' in annos: + db_info['score'] = annos['score'][i] + if with_mask: + db_info.update({'box2d_camera': gt_boxes[i]}) + if names[i] in all_db_infos: + all_db_infos[names[i]].append(db_info) + else: + all_db_infos[names[i]] = [db_info] + + for k, v in all_db_infos.items(): + print(f'load {len(v)} {k} database infos') + + with open(db_info_save_path, 'wb') as f: + 
pickle.dump(all_db_infos, f) diff --git a/adzoo/bevformer/data_converter/indoor_converter.py b/adzoo/bevformer/data_converter/indoor_converter.py new file mode 100755 index 0000000..0aa5820 --- /dev/null +++ b/adzoo/bevformer/data_converter/indoor_converter.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import os + +from .s3dis_data_utils import S3DISData, S3DISSegData +from .scannet_data_utils import ScanNetData, ScanNetSegData +from .sunrgbd_data_utils import SUNRGBDData + + +def create_indoor_info_file(data_path, + pkl_prefix='sunrgbd', + save_path=None, + use_v1=False, + workers=4): + """Create indoor information file. + + Get information of the raw data and save it to the pkl file. + + Args: + data_path (str): Path of the data. + pkl_prefix (str): Prefix of the pkl to be saved. Default: 'sunrgbd'. + save_path (str): Path of the pkl to be saved. Default: None. + use_v1 (bool): Whether to use v1. Default: False. + workers (int): Number of threads to be used. Default: 4. + """ + assert os.path.exists(data_path) + assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \ + f'unsupported indoor dataset {pkl_prefix}' + save_path = data_path if save_path is None else save_path + assert os.path.exists(save_path) + + # generate infos for both detection and segmentation task + if pkl_prefix in ['sunrgbd', 'scannet']: + train_filename = os.path.join(save_path, + f'{pkl_prefix}_infos_train.pkl') + val_filename = os.path.join(save_path, f'{pkl_prefix}_infos_val.pkl') + if pkl_prefix == 'sunrgbd': + # SUN RGB-D has a train-val split + train_dataset = SUNRGBDData( + root_path=data_path, split='train', use_v1=use_v1) + val_dataset = SUNRGBDData( + root_path=data_path, split='val', use_v1=use_v1) + else: + # ScanNet has a train-val-test split + train_dataset = ScanNetData(root_path=data_path, split='train') + val_dataset = ScanNetData(root_path=data_path, split='val') + test_dataset = ScanNetData(root_path=data_path, split='test') + test_filename = os.path.join(save_path, + f'{pkl_prefix}_infos_test.pkl') + + infos_train = train_dataset.get_infos( + num_workers=workers, has_label=True) + mmcv.dump(infos_train, train_filename, 'pkl') + print(f'{pkl_prefix} info train file is saved to {train_filename}') + + infos_val = val_dataset.get_infos(num_workers=workers, has_label=True) + mmcv.dump(infos_val, val_filename, 'pkl') + print(f'{pkl_prefix} info val file is saved to {val_filename}') + + if pkl_prefix == 'scannet': + infos_test = test_dataset.get_infos( + num_workers=workers, has_label=False) + mmcv.dump(infos_test, test_filename, 'pkl') + print(f'{pkl_prefix} info test file is saved to {test_filename}') + + # generate infos for the semantic segmentation task + # e.g. re-sampled scene indexes and label weights + # scene indexes are used to re-sample rooms with different number of points + # label weights are used to balance classes with different number of points + if pkl_prefix == 'scannet': + # label weight computation function is adopted from + # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 + train_dataset = ScanNetSegData( + data_root=data_path, + ann_file=train_filename, + split='train', + num_points=8192, + label_weight_func=lambda x: 1.0 / np.log(1.2 + x)) + # TODO: do we need to generate on val set? 
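`create_groundtruth_database()` above crops per-object point clouds (and, with `with_mask=True`, image/mask patches) into a `{prefix}_gt_database` folder plus a `{prefix}_dbinfos_train.pkl` index, which GT-sampling augmentation such as the `db_sampler` in the Waymo config typically consumes. A minimal sketch for KITTI, mirroring the call in `kitti_data_prep()`; the paths assume the conventional `./data/kitti` layout:

```python
# Mirrors kitti_data_prep() in create_data.py; writes
# ./data/kitti/kitti_gt_database/*.bin and ./data/kitti/kitti_dbinfos_train.pkl
create_groundtruth_database(
    'KittiDataset',
    './data/kitti',
    'kitti',
    './data/kitti/kitti_infos_train.pkl',
    relative_path=False,
    mask_anno_path='instances_train.json',
    with_mask=False)  # set True only if KINS-style 2D mask annotations are available
```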
+ val_dataset = ScanNetSegData( + data_root=data_path, + ann_file=val_filename, + split='val', + num_points=8192, + label_weight_func=lambda x: 1.0 / np.log(1.2 + x)) + # no need to generate for test set + train_dataset.get_seg_infos() + val_dataset.get_seg_infos() + elif pkl_prefix == 's3dis': + # S3DIS doesn't have a fixed train-val split + # it has 6 areas instead, so we generate info file for each of them + # in training, we will use dataset to wrap different areas + splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]] + for split in splits: + dataset = S3DISData(root_path=data_path, split=split) + info = dataset.get_infos(num_workers=workers, has_label=True) + filename = os.path.join(save_path, + f'{pkl_prefix}_infos_{split}.pkl') + mmcv.dump(info, filename, 'pkl') + print(f'{pkl_prefix} info {split} file is saved to {filename}') + seg_dataset = S3DISSegData( + data_root=data_path, + ann_file=filename, + split=split, + num_points=4096, + label_weight_func=lambda x: 1.0 / np.log(1.2 + x)) + seg_dataset.get_seg_infos() diff --git a/adzoo/bevformer/data_converter/kitti_converter.py b/adzoo/bevformer/data_converter/kitti_converter.py new file mode 100755 index 0000000..6ac2cef --- /dev/null +++ b/adzoo/bevformer/data_converter/kitti_converter.py @@ -0,0 +1,546 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from collections import OrderedDict +from nuscenes.utils.geometry_utils import view_points +from pathlib import Path + +from mmcv.core.bbox import box_np_ops +from .kitti_data_utils import get_kitti_image_info, get_waymo_image_info +from .nuscenes_converter import post_process_coords + +kitti_categories = ('Pedestrian', 'Cyclist', 'Car') + + +def convert_to_kitti_info_version2(info): + """convert kitti info v1 to v2 if possible. + + Args: + info (dict): Info of the input kitti data. 
+ - image (dict): image info + - calib (dict): calibration info + - point_cloud (dict): point cloud info + """ + if 'image' not in info or 'calib' not in info or 'point_cloud' not in info: + info['image'] = { + 'image_shape': info['img_shape'], + 'image_idx': info['image_idx'], + 'image_path': info['img_path'], + } + info['calib'] = { + 'R0_rect': info['calib/R0_rect'], + 'Tr_velo_to_cam': info['calib/Tr_velo_to_cam'], + 'P2': info['calib/P2'], + } + info['point_cloud'] = { + 'velodyne_path': info['velodyne_path'], + } + + +def _read_imageset_file(path): + with open(path, 'r') as f: + lines = f.readlines() + return [int(line) for line in lines] + + +def _calculate_num_points_in_gt(data_path, + infos, + relative_path, + remove_outside=True, + num_features=4): + for info in mmcv.track_iter_progress(infos): + pc_info = info['point_cloud'] + image_info = info['image'] + calib = info['calib'] + if relative_path: + v_path = str(Path(data_path) / pc_info['velodyne_path']) + else: + v_path = pc_info['velodyne_path'] + points_v = np.fromfile( + v_path, dtype=np.float32, count=-1).reshape([-1, num_features]) + rect = calib['R0_rect'] + Trv2c = calib['Tr_velo_to_cam'] + P2 = calib['P2'] + if remove_outside: + points_v = box_np_ops.remove_outside_points( + points_v, rect, Trv2c, P2, image_info['image_shape']) + + # points_v = points_v[points_v[:, 0] > 0] + annos = info['annos'] + num_obj = len([n for n in annos['name'] if n != 'DontCare']) + # annos = kitti.filter_kitti_anno(annos, ['DontCare']) + dims = annos['dimensions'][:num_obj] + loc = annos['location'][:num_obj] + rots = annos['rotation_y'][:num_obj] + gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + gt_boxes_lidar = box_np_ops.box_camera_to_lidar( + gt_boxes_camera, rect, Trv2c) + indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar) + num_points_in_gt = indices.sum(0) + num_ignored = len(annos['dimensions']) - num_obj + num_points_in_gt = np.concatenate( + [num_points_in_gt, -np.ones([num_ignored])]) + annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32) + + +def create_kitti_info_file(data_path, + pkl_prefix='kitti', + save_path=None, + relative_path=True): + """Create info file of KITTI dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + data_path (str): Path of the data root. + pkl_prefix (str): Prefix of the info file to be generated. + save_path (str): Path to save the info file. + relative_path (bool): Whether to use relative path. + """ + imageset_folder = Path(data_path) / 'ImageSets' + train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) + + val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) + test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) + + print('Generate info. 
this may take several minutes.') + if save_path is None: + save_path = Path(data_path) + else: + save_path = Path(save_path) + kitti_infos_train = get_kitti_image_info( + data_path, + training=True, + velodyne=True, + calib=True, + image_ids=train_img_ids, + relative_path=relative_path) + _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path) + filename = save_path / f'{pkl_prefix}_infos_train.pkl' + print(f'Kitti info train file is saved to {filename}') + mmcv.dump(kitti_infos_train, filename) + kitti_infos_val = get_kitti_image_info( + data_path, + training=True, + velodyne=True, + calib=True, + image_ids=val_img_ids, + relative_path=relative_path) + _calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path) + filename = save_path / f'{pkl_prefix}_infos_val.pkl' + print(f'Kitti info val file is saved to {filename}') + mmcv.dump(kitti_infos_val, filename) + filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' + print(f'Kitti info trainval file is saved to {filename}') + mmcv.dump(kitti_infos_train + kitti_infos_val, filename) + + kitti_infos_test = get_kitti_image_info( + data_path, + training=False, + label_info=False, + velodyne=True, + calib=True, + image_ids=test_img_ids, + relative_path=relative_path) + filename = save_path / f'{pkl_prefix}_infos_test.pkl' + print(f'Kitti info test file is saved to {filename}') + mmcv.dump(kitti_infos_test, filename) + + +def create_waymo_info_file(data_path, + pkl_prefix='waymo', + save_path=None, + relative_path=True, + max_sweeps=5): + """Create info file of waymo dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + data_path (str): Path of the data root. + pkl_prefix (str): Prefix of the info file to be generated. + save_path (str | None): Path to save the info file. + relative_path (bool): Whether to use relative path. + max_sweeps (int): Max sweeps before the detection frame to be used. + """ + imageset_folder = Path(data_path) / 'ImageSets' + train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) + # val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) + # test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) + train_img_ids = [each for each in train_img_ids if each % 5 == 0] + print('Generate info. 
this may take several minutes.') + if save_path is None: + save_path = Path(data_path) + else: + save_path = Path(save_path) + waymo_infos_train = get_waymo_image_info( + data_path, + training=True, + velodyne=True, + calib=True, + pose=True, + image_ids=train_img_ids, + relative_path=relative_path, + max_sweeps=max_sweeps) + _calculate_num_points_in_gt( + data_path, + waymo_infos_train, + relative_path, + num_features=6, + remove_outside=False) + filename = save_path / f'{pkl_prefix}_infos_train.pkl' + print(f'Waymo info train file is saved to {filename}') + mmcv.dump(waymo_infos_train, filename) + # + # waymo_infos_val = get_waymo_image_info( + # data_path, + # training=True, + # velodyne=True, + # calib=True, + # pose=True, + # image_ids=val_img_ids, + # relative_path=relative_path, + # max_sweeps=max_sweeps) + # _calculate_num_points_in_gt( + # data_path, + # waymo_infos_val, + # relative_path, + # num_features=6, + # remove_outside=False) + # filename = save_path / f'{pkl_prefix}_infos_val.pkl' + # print(f'Waymo info val file is saved to {filename}') + # mmcv.dump(waymo_infos_val, filename) + # filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' + # print(f'Waymo info trainval file is saved to {filename}') + # mmcv.dump(waymo_infos_train + waymo_infos_val, filename) + # waymo_infos_test = get_waymo_image_info( + # data_path, + # training=False, + # label_info=False, + # velodyne=True, + # calib=True, + # pose=True, + # image_ids=test_img_ids, + # relative_path=relative_path, + # max_sweeps=max_sweeps) + # filename = save_path / f'{pkl_prefix}_infos_test.pkl' + # print(f'Waymo info test file is saved to {filename}') + # mmcv.dump(waymo_infos_test, filename) + + +def _create_reduced_point_cloud(data_path, + info_path, + save_path=None, + back=False, + num_features=4, + front_camera_id=2): + """Create reduced point clouds for given info. + + Args: + data_path (str): Path of original data. + info_path (str): Path of data info. + save_path (str | None): Path to save reduced point cloud data. + Default: None. + back (bool): Whether to flip the points to back. + num_features (int): Number of point features. Default: 4. + front_camera_id (int): The referenced/front camera ID. Default: 2. + """ + kitti_infos = mmcv.load(info_path) + + for info in mmcv.track_iter_progress(kitti_infos): + pc_info = info['point_cloud'] + image_info = info['image'] + calib = info['calib'] + + v_path = pc_info['velodyne_path'] + v_path = Path(data_path) / v_path + points_v = np.fromfile( + str(v_path), dtype=np.float32, + count=-1).reshape([-1, num_features]) + rect = calib['R0_rect'] + if front_camera_id == 2: + P2 = calib['P2'] + else: + P2 = calib[f'P{str(front_camera_id)}'] + Trv2c = calib['Tr_velo_to_cam'] + # first remove z < 0 points + # keep = points_v[:, -1] > 0 + # points_v = points_v[keep] + # then remove outside. 
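`create_kitti_info_file()` and `create_waymo_info_file()` above emit the `*_infos_*.pkl` files that the dataset configs in this repo point at. A short sketch of the KITTI flow, assuming the standard `ImageSets/{train,val,test}.txt` splits are in place; the data path is a placeholder:

```python
# Writes kitti_infos_train/val/trainval/test.pkl under ./data/kitti, then a
# velodyne_reduced/ copy that keeps only points inside the front-camera frustum.
create_kitti_info_file('./data/kitti', pkl_prefix='kitti', relative_path=True)
create_reduced_point_cloud('./data/kitti', pkl_prefix='kitti')
```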
+ if back: + points_v[:, 0] = -points_v[:, 0] + points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2, + image_info['image_shape']) + if save_path is None: + save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced') + if not save_dir.exists(): + save_dir.mkdir() + save_filename = save_dir / v_path.name + # save_filename = str(v_path) + '_reduced' + if back: + save_filename += '_back' + else: + save_filename = str(Path(save_path) / v_path.name) + if back: + save_filename += '_back' + with open(save_filename, 'w') as f: + points_v.tofile(f) + + +def create_reduced_point_cloud(data_path, + pkl_prefix, + train_info_path=None, + val_info_path=None, + test_info_path=None, + save_path=None, + with_back=False): + """Create reduced point clouds for training/validation/testing. + + Args: + data_path (str): Path of original data. + pkl_prefix (str): Prefix of info files. + train_info_path (str | None): Path of training set info. + Default: None. + val_info_path (str | None): Path of validation set info. + Default: None. + test_info_path (str | None): Path of test set info. + Default: None. + save_path (str | None): Path to save reduced point cloud data. + with_back (bool): Whether to flip the points to back. + """ + if train_info_path is None: + train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl' + if val_info_path is None: + val_info_path = Path(data_path) / f'{pkl_prefix}_infos_val.pkl' + if test_info_path is None: + test_info_path = Path(data_path) / f'{pkl_prefix}_infos_test.pkl' + + print('create reduced point cloud for training set') + _create_reduced_point_cloud(data_path, train_info_path, save_path) + print('create reduced point cloud for validation set') + _create_reduced_point_cloud(data_path, val_info_path, save_path) + print('create reduced point cloud for testing set') + _create_reduced_point_cloud(data_path, test_info_path, save_path) + if with_back: + _create_reduced_point_cloud( + data_path, train_info_path, save_path, back=True) + _create_reduced_point_cloud( + data_path, val_info_path, save_path, back=True) + _create_reduced_point_cloud( + data_path, test_info_path, save_path, back=True) + + +def export_2d_annotation(root_path, info_path, mono3d=True): + """Export 2d annotation from the info file and raw data. + + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + mono3d (bool): Whether to export mono3d annotation. Default: True. 
+ """ + # get bbox annotations for camera + kitti_infos = mmcv.load(info_path) + cat2Ids = [ + dict(id=kitti_categories.index(cat_name), name=cat_name) + for cat_name in kitti_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + from os import path as osp + for info in mmcv.track_iter_progress(kitti_infos): + coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d) + (height, width, + _) = mmcv.imread(osp.join(root_path, + info['image']['image_path'])).shape + coco_2d_dict['images'].append( + dict( + file_name=info['image']['image_path'], + id=info['image']['image_idx'], + Tri2v=info['calib']['Tr_imu_to_velo'], + Trv2c=info['calib']['Tr_velo_to_cam'], + rect=info['calib']['R0_rect'], + cam_intrinsic=info['calib']['P2'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(info, occluded, mono3d=True): + """Get the 2D annotation records for a given info. + + Args: + info: Information of the given sample data. + occluded: Integer (0, 1, 2, 3) indicating occlusion state: \ + 0 = fully visible, 1 = partly occluded, 2 = largely occluded, \ + 3 = unknown, -1 = DontCare + mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + # Get calibration information + P2 = info['calib']['P2'] + + repro_recs = [] + # if no annotations in info (test dataset), then return + if 'annos' not in info: + return repro_recs + + # Get all the annotation with the specified visibilties. + ann_dicts = info['annos'] + mask = [(ocld in occluded) for ocld in ann_dicts['occluded']] + for k in ann_dicts.keys(): + ann_dicts[k] = ann_dicts[k][mask] + + # convert dict of list to list of dict + ann_recs = [] + for i in range(len(ann_dicts['occluded'])): + ann_rec = {} + for k in ann_dicts.keys(): + ann_rec[k] = ann_dicts[k][i] + ann_recs.append(ann_rec) + + for ann_idx, ann_rec in enumerate(ann_recs): + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = \ + f"{info['image']['image_idx']}.{ann_idx}" + ann_rec['sample_data_token'] = info['image']['image_idx'] + sample_data_token = info['image']['image_idx'] + + loc = ann_rec['location'][np.newaxis, :] + dim = ann_rec['dimensions'][np.newaxis, :] + rot = ann_rec['rotation_y'][np.newaxis, np.newaxis] + # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5] + dst = np.array([0.5, 0.5, 0.5]) + src = np.array([0.5, 1.0, 0.5]) + loc = loc + dim * (dst - src) + offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \ + / info['calib']['P2'][0, 0] + loc_3d = np.copy(loc) + loc_3d[0, 0] += offset + gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box_np_ops.center_to_corner_box3d( + gt_bbox_3d[:, :3], + gt_bbox_3d[:, 3:6], + gt_bbox_3d[:, 6], [0.5, 0.5, 0.5], + axis=1) + corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8) + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. 
+ camera_intrinsic = P2 + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + # Generate dictionary record to be included in the .json file. + repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, + sample_data_token, + info['image']['image_path']) + + # If mono3d=True, add 3D annotations in camera coordinates + if mono3d and (repro_rec is not None): + repro_rec['bbox_cam3d'] = np.concatenate( + [loc_3d, dim, rot], + axis=1).astype(np.float32).squeeze().tolist() + repro_rec['velo_cam3d'] = -1 # no velocity in KITTI + + center3d = np.array(loc).reshape([1, 3]) + center2d = box_np_ops.points_cam2img( + center3d, camera_intrinsic, with_depth=True) + repro_rec['center2d'] = center2d.squeeze().tolist() + # normalized center2D + depth + # samples with depth < 0 will be removed + if repro_rec['center2d'][2] <= 0: + continue + + repro_rec['attribute_name'] = -1 # no attribute in KITTI + repro_rec['attribute_id'] = -1 + + repro_recs.append(repro_rec) + + return repro_recs + + +def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename): + """Generate one 2D annotation record given various informations on top of + the 2D bounding box coordinates. + + Args: + ann_rec (dict): Original 3d annotation record. + x1 (float): Minimum value of the x coordinate. + y1 (float): Minimum value of the y coordinate. + x2 (float): Maximum value of the x coordinate. + y2 (float): Maximum value of the y coordinate. + sample_data_token (str): Sample data token. + filename (str):The corresponding image file where the annotation + is present. + + Returns: + dict: A sample 2D annotation record. + - file_name (str): flie name + - image_id (str): sample data token + - area (float): 2d box area + - category_name (str): category name + - category_id (int): category id + - bbox (list[float]): left x, top y, dx, dy of 2d box + - iscrowd (int): whether the area is crowd + """ + repro_rec = OrderedDict() + repro_rec['sample_data_token'] = sample_data_token + coco_rec = dict() + + key_mapping = { + 'name': 'category_name', + 'num_points_in_gt': 'num_lidar_pts', + 'sample_annotation_token': 'sample_annotation_token', + 'sample_data_token': 'sample_data_token', + } + + for key, value in ann_rec.items(): + if key in key_mapping.keys(): + repro_rec[key_mapping[key]] = value + + repro_rec['bbox_corners'] = [x1, y1, x2, y2] + repro_rec['filename'] = filename + + coco_rec['file_name'] = filename + coco_rec['image_id'] = sample_data_token + coco_rec['area'] = (y2 - y1) * (x2 - x1) + + if repro_rec['category_name'] not in kitti_categories: + return None + cat_name = repro_rec['category_name'] + coco_rec['category_name'] = cat_name + coco_rec['category_id'] = kitti_categories.index(cat_name) + coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] + coco_rec['iscrowd'] = 0 + + return coco_rec diff --git a/adzoo/bevformer/data_converter/kitti_data_utils.py b/adzoo/bevformer/data_converter/kitti_data_utils.py new file mode 100755 index 0000000..01538e0 --- /dev/null +++ b/adzoo/bevformer/data_converter/kitti_data_utils.py @@ -0,0 +1,554 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +from collections import OrderedDict +from concurrent import futures as futures +from os import path as osp +from pathlib import Path +from skimage import io + + +def get_image_index_str(img_idx, use_prefix_id=False): + if use_prefix_id: + return '{:07d}'.format(img_idx) + else: + return '{:06d}'.format(img_idx) + + +def get_kitti_info_path(idx, + prefix, + info_type='image_2', + file_tail='.png', + training=True, + relative_path=True, + exist_check=True, + use_prefix_id=False): + img_idx_str = get_image_index_str(idx, use_prefix_id) + img_idx_str += file_tail + prefix = Path(prefix) + if training: + file_path = Path('training') / info_type / img_idx_str + else: + file_path = Path('testing') / info_type / img_idx_str + if exist_check and not (prefix / file_path).exists(): + raise ValueError('file not exist: {}'.format(file_path)) + if relative_path: + return str(file_path) + else: + return str(prefix / file_path) + + +def get_image_path(idx, + prefix, + training=True, + relative_path=True, + exist_check=True, + info_type='image_2', + use_prefix_id=False): + return get_kitti_info_path(idx, prefix, info_type, '.png', training, + relative_path, exist_check, use_prefix_id) + + +def get_label_path(idx, + prefix, + training=True, + relative_path=True, + exist_check=True, + info_type='label_2', + use_prefix_id=False): + return get_kitti_info_path(idx, prefix, info_type, '.txt', training, + relative_path, exist_check, use_prefix_id) + + +def get_velodyne_path(idx, + prefix, + training=True, + relative_path=True, + exist_check=True, + use_prefix_id=False): + return get_kitti_info_path(idx, prefix, 'velodyne', '.bin', training, + relative_path, exist_check, use_prefix_id) + + +def get_calib_path(idx, + prefix, + training=True, + relative_path=True, + exist_check=True, + use_prefix_id=False): + return get_kitti_info_path(idx, prefix, 'calib', '.txt', training, + relative_path, exist_check, use_prefix_id) + + +def get_pose_path(idx, + prefix, + training=True, + relative_path=True, + exist_check=True, + use_prefix_id=False): + return get_kitti_info_path(idx, prefix, 'pose', '.txt', training, + relative_path, exist_check, use_prefix_id) + + +def get_label_anno(label_path): + annotations = {} + annotations.update({ + 'name': [], + 'truncated': [], + 'occluded': [], + 'alpha': [], + 'bbox': [], + 'dimensions': [], + 'location': [], + 'rotation_y': [] + }) + with open(label_path, 'r') as f: + lines = f.readlines() + # if len(lines) == 0 or len(lines[0]) < 15: + # content = [] + # else: + content = [line.strip().split(' ') for line in lines] + num_objects = len([x[0] for x in content if x[0] != 'DontCare']) + annotations['name'] = np.array([x[0] for x in content]) + num_gt = len(annotations['name']) + annotations['truncated'] = np.array([float(x[1]) for x in content]) + annotations['occluded'] = np.array([int(x[2]) for x in content]) + annotations['alpha'] = np.array([float(x[3]) for x in content]) + annotations['bbox'] = np.array([[float(info) for info in x[4:8]] + for x in content]).reshape(-1, 4) + # dimensions will convert hwl format to standard lhw(camera) format. 
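`get_label_anno()` below turns one KITTI `label_2` text file into numpy arrays, reordering `dimensions` from the file's h, w, l into l, h, w as the surrounding comment notes. A quick self-contained check with a single synthetic label line; the numbers are made up and the function is assumed to be in scope (it lives in this `kitti_data_utils.py` module):

```python
import os
import tempfile

# One fake KITTI label line: name trunc occl alpha bbox(4) dims(h w l) loc(x y z) rot_y
line = 'Car 0.00 0 -1.58 587.01 173.33 614.12 200.12 1.65 1.67 3.64 -0.65 1.71 46.70 -1.59'
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write(line + '\n')

anno = get_label_anno(f.name)        # assumes get_label_anno from this module is in scope
print(anno['name'])                  # ['Car']
print(anno['dimensions'])            # [[3.64 1.65 1.67]]  -> l, h, w
print(anno['index'], anno['score'])  # [0] [0.]
os.unlink(f.name)
```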
+ annotations['dimensions'] = np.array([[float(info) for info in x[8:11]] + for x in content + ]).reshape(-1, 3)[:, [2, 0, 1]] + annotations['location'] = np.array([[float(info) for info in x[11:14]] + for x in content]).reshape(-1, 3) + annotations['rotation_y'] = np.array([float(x[14]) + for x in content]).reshape(-1) + if len(content) != 0 and len(content[0]) == 16: # have score + annotations['score'] = np.array([float(x[15]) for x in content]) + else: + annotations['score'] = np.zeros((annotations['bbox'].shape[0], )) + index = list(range(num_objects)) + [-1] * (num_gt - num_objects) + annotations['index'] = np.array(index, dtype=np.int32) + annotations['group_ids'] = np.arange(num_gt, dtype=np.int32) + return annotations + + +def _extend_matrix(mat): + mat = np.concatenate([mat, np.array([[0., 0., 0., 1.]])], axis=0) + return mat + + +def get_kitti_image_info(path, + training=True, + label_info=True, + velodyne=False, + calib=False, + image_ids=7481, + extend_matrix=True, + num_worker=8, + relative_path=True, + with_imageshape=True): + """ + KITTI annotation format version 2: + { + [optional]points: [N, 3+] point cloud + [optional, for kitti]image: { + image_idx: ... + image_path: ... + image_shape: ... + } + point_cloud: { + num_features: 4 + velodyne_path: ... + } + [optional, for kitti]calib: { + R0_rect: ... + Tr_velo_to_cam: ... + P2: ... + } + annos: { + location: [num_gt, 3] array + dimensions: [num_gt, 3] array + rotation_y: [num_gt] angle array + name: [num_gt] ground truth name array + [optional]difficulty: kitti difficulty + [optional]group_ids: used for multi-part object + } + } + """ + root_path = Path(path) + if not isinstance(image_ids, list): + image_ids = list(range(image_ids)) + + def map_func(idx): + info = {} + pc_info = {'num_features': 4} + calib_info = {} + + image_info = {'image_idx': idx} + annotations = None + if velodyne: + pc_info['velodyne_path'] = get_velodyne_path( + idx, path, training, relative_path) + image_info['image_path'] = get_image_path(idx, path, training, + relative_path) + if with_imageshape: + img_path = image_info['image_path'] + if relative_path: + img_path = str(root_path / img_path) + image_info['image_shape'] = np.array( + io.imread(img_path).shape[:2], dtype=np.int32) + if label_info: + label_path = get_label_path(idx, path, training, relative_path) + if relative_path: + label_path = str(root_path / label_path) + annotations = get_label_anno(label_path) + info['image'] = image_info + info['point_cloud'] = pc_info + if calib: + calib_path = get_calib_path( + idx, path, training, relative_path=False) + with open(calib_path, 'r') as f: + lines = f.readlines() + P0 = np.array([float(info) for info in lines[0].split(' ')[1:13] + ]).reshape([3, 4]) + P1 = np.array([float(info) for info in lines[1].split(' ')[1:13] + ]).reshape([3, 4]) + P2 = np.array([float(info) for info in lines[2].split(' ')[1:13] + ]).reshape([3, 4]) + P3 = np.array([float(info) for info in lines[3].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + P0 = _extend_matrix(P0) + P1 = _extend_matrix(P1) + P2 = _extend_matrix(P2) + P3 = _extend_matrix(P3) + R0_rect = np.array([ + float(info) for info in lines[4].split(' ')[1:10] + ]).reshape([3, 3]) + if extend_matrix: + rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) + rect_4x4[3, 3] = 1. 
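# embed the 3x3 rectification matrix into a 4x4 homogeneous transform so it
# composes directly with the extended projection and Tr_velo_to_cam matrices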
+ rect_4x4[:3, :3] = R0_rect + else: + rect_4x4 = R0_rect + + Tr_velo_to_cam = np.array([ + float(info) for info in lines[5].split(' ')[1:13] + ]).reshape([3, 4]) + Tr_imu_to_velo = np.array([ + float(info) for info in lines[6].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) + Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo) + calib_info['P0'] = P0 + calib_info['P1'] = P1 + calib_info['P2'] = P2 + calib_info['P3'] = P3 + calib_info['R0_rect'] = rect_4x4 + calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam + calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo + info['calib'] = calib_info + + if annotations is not None: + info['annos'] = annotations + add_difficulty_to_annos(info) + return info + + with futures.ThreadPoolExecutor(num_worker) as executor: + image_infos = executor.map(map_func, image_ids) + + return list(image_infos) + + +def get_waymo_image_info(path, + training=True, + label_info=True, + velodyne=False, + calib=False, + pose=False, + image_ids=7481, + extend_matrix=True, + num_worker=8, + relative_path=True, + with_imageshape=True, + max_sweeps=5): + """ + Waymo annotation format version like KITTI: + { + [optional]points: [N, 3+] point cloud + [optional, for kitti]image: { + image_idx: ... + image_path: ... + image_shape: ... + } + point_cloud: { + num_features: 6 + velodyne_path: ... + } + [optional, for kitti]calib: { + R0_rect: ... + Tr_velo_to_cam0: ... + P0: ... + } + annos: { + location: [num_gt, 3] array + dimensions: [num_gt, 3] array + rotation_y: [num_gt] angle array + name: [num_gt] ground truth name array + [optional]difficulty: kitti difficulty + [optional]group_ids: used for multi-part object + } + } + """ + root_path = Path(path) + if not isinstance(image_ids, list): + image_ids = list(range(image_ids)) + + def map_func(idx): + info = {} + pc_info = {'num_features': 6} + calib_info = {} + + image_info = {'image_idx': idx} + annotations = None + if velodyne: + pc_info['velodyne_path'] = get_velodyne_path( + idx, path, training, relative_path, use_prefix_id=True) + points = np.fromfile( + Path(path) / pc_info['velodyne_path'], dtype=np.float32) + points = np.copy(points).reshape(-1, pc_info['num_features']) + info['timestamp'] = np.int64(points[0, -1]) + # values of the last dim are all the timestamp + image_info['image_path'] = get_image_path( + idx, + path, + training, + relative_path, + info_type='image_0', + use_prefix_id=True) + if with_imageshape: + img_path = image_info['image_path'] + if relative_path: + img_path = str(root_path / img_path) + image_info['image_shape'] = np.array( + io.imread(img_path).shape[:2], dtype=np.int32) + if label_info: + label_path = get_label_path( + idx, + path, + training, + relative_path, + info_type='label_all', + use_prefix_id=True) + if relative_path: + label_path = str(root_path / label_path) + annotations = get_label_anno(label_path) + info['image'] = image_info + info['point_cloud'] = pc_info + if calib: + calib_path = get_calib_path( + idx, path, training, relative_path=False, use_prefix_id=True) + with open(calib_path, 'r') as f: + lines = f.readlines() + P0 = np.array([float(info) for info in lines[0].split(' ')[1:13] + ]).reshape([3, 4]) + P1 = np.array([float(info) for info in lines[1].split(' ')[1:13] + ]).reshape([3, 4]) + P2 = np.array([float(info) for info in lines[2].split(' ')[1:13] + ]).reshape([3, 4]) + P3 = np.array([float(info) for info in lines[3].split(' ')[1:13] + ]).reshape([3, 4]) + P4 = np.array([float(info) for info in lines[4].split(' ')[1:13] + 
]).reshape([3, 4]) + if extend_matrix: + P0 = _extend_matrix(P0) + P1 = _extend_matrix(P1) + P2 = _extend_matrix(P2) + P3 = _extend_matrix(P3) + P4 = _extend_matrix(P4) + R0_rect = np.array([ + float(info) for info in lines[5].split(' ')[1:10] + ]).reshape([3, 3]) + if extend_matrix: + rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) + rect_4x4[3, 3] = 1. + rect_4x4[:3, :3] = R0_rect + else: + rect_4x4 = R0_rect + + Tr_velo_to_cam = np.array([ + float(info) for info in lines[6].split(' ')[1:13] + ]).reshape([3, 4]) + if extend_matrix: + Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) + calib_info['P0'] = P0 + calib_info['P1'] = P1 + calib_info['P2'] = P2 + calib_info['P3'] = P3 + calib_info['P4'] = P4 + calib_info['R0_rect'] = rect_4x4 + calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam + info['calib'] = calib_info + if pose: + pose_path = get_pose_path( + idx, path, training, relative_path=False, use_prefix_id=True) + info['pose'] = np.loadtxt(pose_path) + + if annotations is not None: + info['annos'] = annotations + info['annos']['camera_id'] = info['annos'].pop('score') + add_difficulty_to_annos(info) + + sweeps = [] + prev_idx = idx + while len(sweeps) < max_sweeps: + prev_info = {} + prev_idx -= 1 + prev_info['velodyne_path'] = get_velodyne_path( + prev_idx, + path, + training, + relative_path, + exist_check=False, + use_prefix_id=True) + if_prev_exists = osp.exists( + Path(path) / prev_info['velodyne_path']) + if if_prev_exists: + prev_points = np.fromfile( + Path(path) / prev_info['velodyne_path'], dtype=np.float32) + prev_points = np.copy(prev_points).reshape( + -1, pc_info['num_features']) + prev_info['timestamp'] = np.int64(prev_points[0, -1]) + prev_pose_path = get_pose_path( + prev_idx, + path, + training, + relative_path=False, + use_prefix_id=True) + prev_info['pose'] = np.loadtxt(prev_pose_path) + sweeps.append(prev_info) + else: + break + info['sweeps'] = sweeps + + return info + + with futures.ThreadPoolExecutor(num_worker) as executor: + image_infos = executor.map(map_func, image_ids) + + return list(image_infos) + + +def kitti_anno_to_label_file(annos, folder): + folder = Path(folder) + for anno in annos: + image_idx = anno['metadata']['image_idx'] + label_lines = [] + for j in range(anno['bbox'].shape[0]): + label_dict = { + 'name': anno['name'][j], + 'alpha': anno['alpha'][j], + 'bbox': anno['bbox'][j], + 'location': anno['location'][j], + 'dimensions': anno['dimensions'][j], + 'rotation_y': anno['rotation_y'][j], + 'score': anno['score'][j], + } + label_line = kitti_result_line(label_dict) + label_lines.append(label_line) + label_file = folder / f'{get_image_index_str(image_idx)}.txt' + label_str = '\n'.join(label_lines) + with open(label_file, 'w') as f: + f.write(label_str) + + +def add_difficulty_to_annos(info): + min_height = [40, 25, + 25] # minimum height for evaluated groundtruth/detections + max_occlusion = [ + 0, 1, 2 + ] # maximum occlusion level of the groundtruth used for evaluation + max_trunc = [ + 0.15, 0.3, 0.5 + ] # maximum truncation level of the groundtruth used for evaluation + annos = info['annos'] + dims = annos['dimensions'] # lhw format + bbox = annos['bbox'] + height = bbox[:, 3] - bbox[:, 1] + occlusion = annos['occluded'] + truncation = annos['truncated'] + diff = [] + easy_mask = np.ones((len(dims), ), dtype=np.bool) + moderate_mask = np.ones((len(dims), ), dtype=np.bool) + hard_mask = np.ones((len(dims), ), dtype=np.bool) + i = 0 + for h, o, t in zip(height, occlusion, truncation): + if o > max_occlusion[0] or h <= min_height[0] or t > 
max_trunc[0]: + easy_mask[i] = False + if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]: + moderate_mask[i] = False + if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]: + hard_mask[i] = False + i += 1 + is_easy = easy_mask + is_moderate = np.logical_xor(easy_mask, moderate_mask) + is_hard = np.logical_xor(hard_mask, moderate_mask) + + for i in range(len(dims)): + if is_easy[i]: + diff.append(0) + elif is_moderate[i]: + diff.append(1) + elif is_hard[i]: + diff.append(2) + else: + diff.append(-1) + annos['difficulty'] = np.array(diff, np.int32) + return diff + + +def kitti_result_line(result_dict, precision=4): + prec_float = '{' + ':.{}f'.format(precision) + '}' + res_line = [] + all_field_default = OrderedDict([ + ('name', None), + ('truncated', -1), + ('occluded', -1), + ('alpha', -10), + ('bbox', None), + ('dimensions', [-1, -1, -1]), + ('location', [-1000, -1000, -1000]), + ('rotation_y', -10), + ('score', 0.0), + ]) + res_dict = [(key, None) for key, val in all_field_default.items()] + res_dict = OrderedDict(res_dict) + for key, val in result_dict.items(): + if all_field_default[key] is None and val is None: + raise ValueError('you must specify a value for {}'.format(key)) + res_dict[key] = val + + for key, val in res_dict.items(): + if key == 'name': + res_line.append(val) + elif key in ['truncated', 'alpha', 'rotation_y', 'score']: + if val is None: + res_line.append(str(all_field_default[key])) + else: + res_line.append(prec_float.format(val)) + elif key == 'occluded': + if val is None: + res_line.append(str(all_field_default[key])) + else: + res_line.append('{}'.format(val)) + elif key in ['bbox', 'dimensions', 'location']: + if val is None: + res_line += [str(v) for v in all_field_default[key]] + else: + res_line += [prec_float.format(v) for v in val] + else: + raise ValueError('unknown key. supported key:{}'.format( + res_dict.keys())) + return ' '.join(res_line) diff --git a/adzoo/bevformer/data_converter/lyft_converter.py b/adzoo/bevformer/data_converter/lyft_converter.py new file mode 100755 index 0000000..db4f0fb --- /dev/null +++ b/adzoo/bevformer/data_converter/lyft_converter.py @@ -0,0 +1,268 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import os +from logging import warning +from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft +from os import path as osp +from pyquaternion import Quaternion + +from mmcv.datasets import LyftDataset +from .nuscenes_converter import (get_2d_boxes, get_available_scenes, + obtain_sensor2top) + +lyft_categories = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', + 'motorcycle', 'bicycle', 'pedestrian', 'animal') + + +def create_lyft_infos(root_path, + info_prefix, + version='v1.01-train', + max_sweeps=10): + """Create info file of lyft dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.01-train' + max_sweeps (int): Max number of sweeps. 
+ Default: 10 + """ + lyft = Lyft( + data_path=osp.join(root_path, version), + json_path=osp.join(root_path, version, version), + verbose=True) + available_vers = ['v1.01-train', 'v1.01-test'] + assert version in available_vers + if version == 'v1.01-train': + train_scenes = mmcv.list_from_file('data/lyft/train.txt') + val_scenes = mmcv.list_from_file('data/lyft/val.txt') + elif version == 'v1.01-test': + train_scenes = mmcv.list_from_file('data/lyft/test.txt') + val_scenes = [] + else: + raise ValueError('unknown') + + # filter existing scenes. + available_scenes = get_available_scenes(lyft) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print(f'test scene: {len(train_scenes)}') + else: + print(f'train scene: {len(train_scenes)}, \ + val scene: {len(val_scenes)}') + train_lyft_infos, val_lyft_infos = _fill_trainval_infos( + lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print(f'test sample: {len(train_lyft_infos)}') + data = dict(infos=train_lyft_infos, metadata=metadata) + info_name = f'{info_prefix}_infos_test' + info_path = osp.join(root_path, f'{info_name}.pkl') + mmcv.dump(data, info_path) + else: + print(f'train sample: {len(train_lyft_infos)}, \ + val sample: {len(val_lyft_infos)}') + data = dict(infos=train_lyft_infos, metadata=metadata) + train_info_name = f'{info_prefix}_infos_train' + info_path = osp.join(root_path, f'{train_info_name}.pkl') + mmcv.dump(data, info_path) + data['infos'] = val_lyft_infos + val_info_name = f'{info_prefix}_infos_val' + info_val_path = osp.join(root_path, f'{val_info_name}.pkl') + mmcv.dump(data, info_val_path) + + +def _fill_trainval_infos(lyft, + train_scenes, + val_scenes, + test=False, + max_sweeps=10): + """Generate the train/val infos from the raw data. + + Args: + lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and + validation set that will be saved to the info file. 
+ """ + train_lyft_infos = [] + val_lyft_infos = [] + + for sample in mmcv.track_iter_progress(lyft.sample): + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = lyft.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token']) + abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token) + # nuScenes devkit returns more convenient relative paths while + # lyft devkit returns absolute paths + abs_lidar_path = str(abs_lidar_path) # absolute path + lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + + mmcv.check_file_exist(lidar_path) + + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'sweeps': [], + 'cams': dict(), + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + } + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token) + cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(lyft, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = lyft.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + lyft.get('sample_annotation', token) + for token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + + names = [b.name for b in boxes] + for i in range(len(names)): + if names[i] in LyftDataset.NameMapping: + names[i] = LyftDataset.NameMapping[names[i]] + names = np.array(names) + + # we need to convert rot to SECOND format. + gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + assert len(gt_boxes) == len( + annotations), f'{len(gt_boxes)}, {len(annotations)}' + info['gt_boxes'] = gt_boxes + info['gt_names'] = names + info['num_lidar_pts'] = np.array( + [a['num_lidar_pts'] for a in annotations]) + info['num_radar_pts'] = np.array( + [a['num_radar_pts'] for a in annotations]) + + if sample['scene_token'] in train_scenes: + train_lyft_infos.append(info) + else: + val_lyft_infos.append(info) + + return train_lyft_infos, val_lyft_infos + + +def export_2d_annotation(root_path, info_path, version): + """Export 2d annotation from the info file and raw data. + + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + version (str): Dataset version. 
+ """ + warning.warn('DeprecationWarning: 2D annotations are not used on the ' + 'Lyft dataset. The function export_2d_annotation will be ' + 'deprecated.') + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + lyft_infos = mmcv.load(info_path)['infos'] + lyft = Lyft( + data_path=osp.join(root_path, version), + json_path=osp.join(root_path, version, version), + verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=lyft_categories.index(cat_name), name=cat_name) + for cat_name in lyft_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in mmcv.track_iter_progress(lyft_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + lyft, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4']) + (height, width, _) = mmcv.imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'], + id=cam_info['sample_data_token'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json') diff --git a/adzoo/bevformer/data_converter/lyft_data_fixer.py b/adzoo/bevformer/data_converter/lyft_data_fixer.py new file mode 100755 index 0000000..4207049 --- /dev/null +++ b/adzoo/bevformer/data_converter/lyft_data_fixer.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import numpy as np +import os + + +def fix_lyft(root_folder='./data/lyft', version='v1.01'): + # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000 # noqa + lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin' + root_folder = os.path.join(root_folder, f'{version}-train') + lidar_path = os.path.join(root_folder, lidar_path) + assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \ + f'dataset and make sure {lidar_path} is present.' + points = np.fromfile(lidar_path, dtype=np.float32, count=-1) + try: + points.reshape([-1, 5]) + print(f'This fix is not required for version {version}.') + except ValueError: + new_points = np.array(list(points) + [100.0, 1.0], dtype='float32') + new_points.tofile(lidar_path) + print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.') + + +parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser') +parser.add_argument( + '--root-folder', + type=str, + default='./data/lyft', + help='specify the root path of Lyft dataset') +parser.add_argument( + '--version', + type=str, + default='v1.01', + help='specify Lyft dataset version') +args = parser.parse_args() + +if __name__ == '__main__': + fix_lyft(root_folder=args.root_folder, version=args.version) diff --git a/adzoo/bevformer/data_converter/nuimage_converter.py b/adzoo/bevformer/data_converter/nuimage_converter.py new file mode 100755 index 0000000..92be1de --- /dev/null +++ b/adzoo/bevformer/data_converter/nuimage_converter.py @@ -0,0 +1,225 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import argparse +import base64 +import mmcv +import numpy as np +from nuimages import NuImages +from nuimages.utils.utils import mask_decode, name_to_index_mapping +from os import path as osp + +nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +NAME_MAPPING = { + 'movable_object.barrier': 'barrier', + 'vehicle.bicycle': 'bicycle', + 'vehicle.bus.bendy': 'bus', + 'vehicle.bus.rigid': 'bus', + 'vehicle.car': 'car', + 'vehicle.construction': 'construction_vehicle', + 'vehicle.motorcycle': 'motorcycle', + 'human.pedestrian.adult': 'pedestrian', + 'human.pedestrian.child': 'pedestrian', + 'human.pedestrian.construction_worker': 'pedestrian', + 'human.pedestrian.police_officer': 'pedestrian', + 'movable_object.trafficcone': 'traffic_cone', + 'vehicle.trailer': 'trailer', + 'vehicle.truck': 'truck', +} + + +def parse_args(): + parser = argparse.ArgumentParser(description='Data converter arg parser') + parser.add_argument( + '--data-root', + type=str, + default='./data/nuimages', + help='specify the root path of dataset') + parser.add_argument( + '--version', + type=str, + nargs='+', + default=['v1.0-mini'], + required=False, + help='specify the dataset version') + parser.add_argument( + '--out-dir', + type=str, + default='./data/nuimages/annotations/', + required=False, + help='path to save the exported json') + parser.add_argument( + '--nproc', + type=int, + default=4, + required=False, + help='workers to process semantic masks') + parser.add_argument('--extra-tag', type=str, default='nuimages') + args = parser.parse_args() + return args + + +def get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root): + """Get semantic segmentation map for an image. + + Args: + nuim (obj:`NuImages`): NuImages dataset object + img_info (dict): Meta information of img + + Returns: + np.ndarray: Semantic segmentation map of the image + """ + sd_token = img_info['token'] + image_id = img_info['id'] + name_to_index = name_to_index_mapping(nuim.category) + + # Get image data. + width, height = img_info['width'], img_info['height'] + semseg_mask = np.zeros((height, width)).astype('uint8') + + # Load stuff / surface regions. + surface_anns = [ + o for o in nuim.surface_ann if o['sample_data_token'] == sd_token + ] + + # Draw stuff / surface regions. + for ann in surface_anns: + # Get color and mask. + category_token = ann['category_token'] + category_name = nuim.get('category', category_token)['name'] + if ann['mask'] is None: + continue + mask = mask_decode(ann['mask']) + + # Draw mask for semantic segmentation. + semseg_mask[mask == 1] = name_to_index[category_name] + + # Load object instances. + object_anns = [ + o for o in nuim.object_ann if o['sample_data_token'] == sd_token + ] + + # Sort by token to ensure that objects always appear in the + # instance mask in the same order. + object_anns = sorted(object_anns, key=lambda k: k['token']) + + # Draw object instances. + # The 0 index is reserved for background; thus, the instances + # should start from index 1. + annotations = [] + for i, ann in enumerate(object_anns, start=1): + # Get color, box, mask and name. + category_token = ann['category_token'] + category_name = nuim.get('category', category_token)['name'] + if ann['mask'] is None: + continue + mask = mask_decode(ann['mask']) + + # Draw masks for semantic segmentation and instance segmentation. 
+ semseg_mask[mask == 1] = name_to_index[category_name] + + if category_name in NAME_MAPPING: + cat_name = NAME_MAPPING[category_name] + cat_id = cat2id[cat_name] + + x_min, y_min, x_max, y_max = ann['bbox'] + # encode calibrated instance mask + mask_anno = dict() + mask_anno['counts'] = base64.b64decode( + ann['mask']['counts']).decode() + mask_anno['size'] = ann['mask']['size'] + + data_anno = dict( + image_id=image_id, + category_id=cat_id, + bbox=[x_min, y_min, x_max - x_min, y_max - y_min], + area=(x_max - x_min) * (y_max - y_min), + segmentation=mask_anno, + iscrowd=0) + annotations.append(data_anno) + + # after process, save semantic masks + img_filename = img_info['file_name'] + seg_filename = img_filename.replace('jpg', 'png') + seg_filename = osp.join(seg_root, seg_filename) + mmcv.imwrite(semseg_mask, seg_filename) + return annotations, np.max(semseg_mask) + + +def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc): + print('Process category information') + categories = [] + categories = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + cat2id = {k_v['name']: k_v['id'] for k_v in categories} + + images = [] + print('Process image meta information...') + for sample_info in mmcv.track_iter_progress(nuim.sample_data): + if sample_info['is_key_frame']: + img_idx = len(images) + images.append( + dict( + id=img_idx, + token=sample_info['token'], + file_name=sample_info['filename'], + width=sample_info['width'], + height=sample_info['height'])) + + seg_root = f'{out_dir}semantic_masks' + mmcv.mkdir_or_exist(seg_root) + mmcv.mkdir_or_exist(osp.join(data_root, 'calibrated')) + + global process_img_anno + + def process_img_anno(img_info): + single_img_annos, max_cls_id = get_img_annos(nuim, img_info, cat2id, + out_dir, data_root, + seg_root) + return single_img_annos, max_cls_id + + print('Process img annotations...') + if nproc > 1: + outputs = mmcv.track_parallel_progress( + process_img_anno, images, nproc=nproc) + else: + outputs = [] + for img_info in mmcv.track_iter_progress(images): + outputs.append(process_img_anno(img_info)) + + # Determine the index of object annotation + print('Process annotation information...') + annotations = [] + max_cls_ids = [] + for single_img_annos, max_cls_id in outputs: + max_cls_ids.append(max_cls_id) + for img_anno in single_img_annos: + img_anno.update(id=len(annotations)) + annotations.append(img_anno) + + max_cls_id = max(max_cls_ids) + print(f'Max ID of class in the semantic map: {max_cls_id}') + + coco_format_json = dict( + images=images, annotations=annotations, categories=categories) + + mmcv.mkdir_or_exist(out_dir) + out_file = osp.join(out_dir, f'{extra_tag}_{version}.json') + print(f'Annotation dumped to {out_file}') + mmcv.dump(coco_format_json, out_file) + + +def main(): + args = parse_args() + for version in args.version: + nuim = NuImages( + dataroot=args.data_root, version=version, verbose=True, lazy=True) + export_nuim_to_coco(nuim, args.data_root, args.out_dir, args.extra_tag, + version, args.nproc) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/data_converter/nuscenes_converter.py b/adzoo/bevformer/data_converter/nuscenes_converter.py new file mode 100755 index 0000000..c3c071e --- /dev/null +++ b/adzoo/bevformer/data_converter/nuscenes_converter.py @@ -0,0 +1,674 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import numpy as np +import os +from collections import OrderedDict +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.geometry_utils import view_points +from os import path as osp +from pyquaternion import Quaternion +from shapely.geometry import MultiPoint, box +from typing import List, Tuple, Union + +from mmcv.core.bbox.box_np_ops import points_cam2img +from mmcv.datasets import NuScenesDataset +from mmcv.fileio.io import dump, load +from mmcv.image.io import imread +from mmcv.utils import is_filepath, check_file_exist, track_iter_progress + +nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +nus_attributes = ('cycle.with_rider', 'cycle.without_rider', + 'pedestrian.moving', 'pedestrian.standing', + 'pedestrian.sitting_lying_down', 'vehicle.moving', + 'vehicle.parked', 'vehicle.stopped', 'None') + + +def create_nuscenes_infos(root_path, + out_path, + can_bus_root_path, + info_prefix, + version='v1.0-trainval', + max_sweeps=10): + """Create info file of nuscene dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.0-trainval' + max_sweeps (int): Max number of sweeps. + Default: 10 + """ + from nuscenes.nuscenes import NuScenes + from nuscenes.can_bus.can_bus_api import NuScenesCanBus + print(version, root_path) + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path) + from nuscenes.utils import splits + available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] + assert version in available_vers + if version == 'v1.0-trainval': + train_scenes = splits.train + val_scenes = splits.val + elif version == 'v1.0-test': + train_scenes = splits.test + val_scenes = [] + elif version == 'v1.0-mini': + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError('unknown') + + # filter existing scenes. 
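# Scenes whose lidar files are not actually present on disk (e.g. from a
# partial download) are dropped here before the info dicts are built.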
+ available_scenes = get_available_scenes(nusc) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print('test scene: {}'.format(len(train_scenes))) + else: + print('train scene: {}, val scene: {}'.format( + len(train_scenes), len(val_scenes))) + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print('test sample: {}'.format(len(train_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_test.pkl'.format(info_prefix)) + dump(data, info_path) + else: + print('train sample: {}, val sample: {}'.format( + len(train_nusc_infos), len(val_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_train.pkl'.format(info_prefix)) + dump(data, info_path) + data['infos'] = val_nusc_infos + info_val_path = osp.join(out_path, + '{}_infos_temporal_val.pkl'.format(info_prefix)) + dump(data, info_val_path) + + +def get_available_scenes(nusc): + """Get available scenes from the input nuscenes class. + + Given the raw data, get the information of available scenes for + further info generation. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + + Returns: + available_scenes (list[dict]): List of basic information for the + available scenes. + """ + available_scenes = [] + print('total scene num: {}'.format(len(nusc.scene))) + for scene in nusc.scene: + scene_token = scene['token'] + scene_rec = nusc.get('scene', scene_token) + sample_rec = nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) + lidar_path = str(lidar_path) + if os.getcwd() in lidar_path: + # path from lyftdataset is absolute path + lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + if not is_filepath(lidar_path): + scene_not_exist = True + break + else: + break + if scene_not_exist: + continue + available_scenes.append(scene) + print('exist scene num: {}'.format(len(available_scenes))) + return available_scenes + + +def _get_can_bus_info(nusc, nusc_can_bus, sample): + scene_name = nusc.get('scene', sample['scene_token'])['name'] + sample_timestamp = sample['timestamp'] + try: + pose_list = nusc_can_bus.get_messages(scene_name, 'pose') + except: + return np.zeros(18) # server scenes do not have can bus information. 
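# The can_bus vector assembled below has 18 entries: translation (3) +
# orientation quaternion (4) + the remaining pose-message fields, i.e.
# acceleration, rotation rate and velocity (3 each), plus two trailing zeros
# kept as placeholders (BEVFormer-style dataset code typically overwrites
# them with the ego patch angle at load time).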
+ can_bus = [] + # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp + last_pose = pose_list[0] + for i, pose in enumerate(pose_list): + if pose['utime'] > sample_timestamp: + break + last_pose = pose + _ = last_pose.pop('utime') # useless + pos = last_pose.pop('pos') + rotation = last_pose.pop('orientation') + can_bus.extend(pos) + can_bus.extend(rotation) + for key in last_pose.keys(): + can_bus.extend(pose[key]) # 16 elements + can_bus.extend([0., 0.]) + return np.array(can_bus) + + +def _fill_trainval_infos(nusc, + nusc_can_bus, + train_scenes, + val_scenes, + test=False, + max_sweeps=10): + """Generate the train/val infos from the raw data. + + Args: + nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and validation set + that will be saved to the info file. + """ + train_nusc_infos = [] + val_nusc_infos = [] + frame_idx = 0 + for sample in track_iter_progress(nusc.sample): + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) + + check_file_exist(lidar_path) + can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) + ## + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'prev': sample['prev'], + 'next': sample['next'], + 'can_bus': can_bus, + 'frame_idx': frame_idx, # temporal related info + 'sweeps': [], + 'cams': dict(), + 'scene_token': sample['scene_token'], # temporal related info + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + } + + if sample['next'] == '': + frame_idx = 0 + else: + frame_idx += 1 + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) + cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = nusc.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + nusc.get('sample_annotation', token) + for 
token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + velocity = np.array( + [nusc.box_velocity(token)[:2] for token in sample['anns']]) + valid_flag = np.array( + [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 + for anno in annotations], + dtype=bool).reshape(-1) + # convert velo from global to lidar + for i in range(len(boxes)): + velo = np.array([*velocity[i], 0.0]) + velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( + l2e_r_mat).T + velocity[i] = velo[:2] + + names = [b.name for b in boxes] + for i in range(len(names)): + if names[i] in NuScenesDataset.NameMapping: + names[i] = NuScenesDataset.NameMapping[names[i]] + names = np.array(names) + # we need to convert rot to SECOND format. + gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + assert len(gt_boxes) == len( + annotations), f'{len(gt_boxes)}, {len(annotations)}' + info['gt_boxes'] = gt_boxes + info['gt_names'] = names + info['gt_velocity'] = velocity.reshape(-1, 2) + info['num_lidar_pts'] = np.array( + [a['num_lidar_pts'] for a in annotations]) + info['num_radar_pts'] = np.array( + [a['num_radar_pts'] for a in annotations]) + info['valid_flag'] = valid_flag + + if sample['scene_token'] in train_scenes: + train_nusc_infos.append(info) + else: + val_nusc_infos.append(info) + + return train_nusc_infos, val_nusc_infos + + +def obtain_sensor2top(nusc, + sensor_token, + l2e_t, + l2e_r_mat, + e2g_t, + e2g_r_mat, + sensor_type='lidar'): + """Obtain the info with RT matric from general sensor to Top LiDAR. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + sensor_token (str): Sample data token corresponding to the + specific sensor type. + l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). + l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego + in shape (3, 3). + e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). + e2g_r_mat (np.ndarray): Rotation matrix from ego to global + in shape (3, 3). + sensor_type (str): Sensor to calibrate. Default: 'lidar'. + + Returns: + sweep (dict): Sweep information after transformation. 
+ """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T + sweep['sensor2lidar_translation'] = T + return sweep + + +def export_2d_annotation(root_path, info_path, version, mono3d=True): + """Export 2d annotation from the info file and raw data. + + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + version (str): Dataset version. + mono3d (bool): Whether to export mono3d annotation. Default: True. 
+ """ + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + nusc_infos = load(info_path)['infos'] + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in track_iter_progress(nusc_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + nusc, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4'], + mono3d=mono3d) + (height, width, _) = imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'].split('data/nuscenes/') + [-1], + id=cam_info['sample_data_token'], + token=info['token'], + cam2ego_rotation=cam_info['sensor2ego_rotation'], + cam2ego_translation=cam_info['sensor2ego_translation'], + ego2global_rotation=info['ego2global_rotation'], + ego2global_translation=info['ego2global_translation'], + cam_intrinsic=cam_info['cam_intrinsic'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(nusc, + sample_data_token: str, + visibilities: List[str], + mono3d=True): + """Get the 2D annotation records for a given `sample_data_token`. + + Args: + sample_data_token (str): Sample data token belonging to a camera \ + keyframe. + visibilities (list[str]): Visibility filter. + mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + + # Get the sample data and the sample corresponding to that sample data. + sd_rec = nusc.get('sample_data', sample_data_token) + + assert sd_rec[ + 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ + ' for camera sample_data!' + if not sd_rec['is_key_frame']: + raise ValueError( + 'The 2D re-projections are available only for keyframes.') + + s_rec = nusc.get('sample', sd_rec['sample_token']) + + # Get the calibrated sensor and ego pose + # record to get the transformation matrices. + cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) + pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) + camera_intrinsic = np.array(cs_rec['camera_intrinsic']) + + # Get all the annotation with the specified visibilties. + ann_recs = [ + nusc.get('sample_annotation', token) for token in s_rec['anns'] + ] + ann_recs = [ + ann_rec for ann_rec in ann_recs + if (ann_rec['visibility_token'] in visibilities) + ] + + repro_recs = [] + + for ann_rec in ann_recs: + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = ann_rec['token'] + ann_rec['sample_data_token'] = sample_data_token + + # Get the box in global coordinates. + box = nusc.get_box(ann_rec['token']) + + # Move them to the ego-pose frame. + box.translate(-np.array(pose_rec['translation'])) + box.rotate(Quaternion(pose_rec['rotation']).inverse) + + # Move them to the calibrated sensor frame. 
+ box.translate(-np.array(cs_rec['translation'])) + box.rotate(Quaternion(cs_rec['rotation']).inverse) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box.corners() + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + # Generate dictionary record to be included in the .json file. + repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, + sample_data_token, sd_rec['filename']) + + # If mono3d=True, add 3D annotations in camera coordinates + if mono3d and (repro_rec is not None): + loc = box.center.tolist() + + dim = box.wlh + dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw + dim = dim.tolist() + + rot = box.orientation.yaw_pitch_roll[0] + rot = [-rot] # convert the rot to our cam coordinate + + global_velo2d = nusc.box_velocity(box.token)[:2] + global_velo3d = np.array([*global_velo2d, 0.0]) + e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix + c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix + cam_velo3d = global_velo3d @ np.linalg.inv( + e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T + velo = cam_velo3d[0::2].tolist() + + repro_rec['bbox_cam3d'] = loc + dim + rot + repro_rec['velo_cam3d'] = velo + + center3d = np.array(loc).reshape([1, 3]) + center2d = points_cam2img( + center3d, camera_intrinsic, with_depth=True) + repro_rec['center2d'] = center2d.squeeze().tolist() + # normalized center2D + depth + # if samples with depth < 0 will be removed + if repro_rec['center2d'][2] <= 0: + continue + + ann_token = nusc.get('sample_annotation', + box.token)['attribute_tokens'] + if len(ann_token) == 0: + attr_name = 'None' + else: + attr_name = nusc.get('attribute', ann_token[0])['name'] + attr_id = nus_attributes.index(attr_name) + repro_rec['attribute_name'] = attr_name + repro_rec['attribute_id'] = attr_id + + repro_recs.append(repro_rec) + + return repro_recs + + +def post_process_coords( + corner_coords: List, imsize: Tuple[int, int] = (1600, 900) +) -> Union[Tuple[float, float, float, float], None]: + """Get the intersection of the convex hull of the reprojected bbox corners + and the image canvas, return None if no intersection. + + Args: + corner_coords (list[int]): Corner coordinates of reprojected + bounding box. + imsize (tuple[int]): Size of the image canvas. + + Return: + tuple [float]: Intersection of the convex hull of the 2D box + corners and the image canvas. 
+ """ + polygon_from_2d_box = MultiPoint(corner_coords).convex_hull + img_canvas = box(0, 0, imsize[0], imsize[1]) + + if polygon_from_2d_box.intersects(img_canvas): + img_intersection = polygon_from_2d_box.intersection(img_canvas) + intersection_coords = np.array( + [coord for coord in img_intersection.exterior.coords]) + + min_x = min(intersection_coords[:, 0]) + min_y = min(intersection_coords[:, 1]) + max_x = max(intersection_coords[:, 0]) + max_y = max(intersection_coords[:, 1]) + + return min_x, min_y, max_x, max_y + else: + return None + + +def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, + sample_data_token: str, filename: str) -> OrderedDict: + """Generate one 2D annotation record given various informations on top of + the 2D bounding box coordinates. + + Args: + ann_rec (dict): Original 3d annotation record. + x1 (float): Minimum value of the x coordinate. + y1 (float): Minimum value of the y coordinate. + x2 (float): Maximum value of the x coordinate. + y2 (float): Maximum value of the y coordinate. + sample_data_token (str): Sample data token. + filename (str):The corresponding image file where the annotation + is present. + + Returns: + dict: A sample 2D annotation record. + - file_name (str): flie name + - image_id (str): sample data token + - area (float): 2d box area + - category_name (str): category name + - category_id (int): category id + - bbox (list[float]): left x, top y, dx, dy of 2d box + - iscrowd (int): whether the area is crowd + """ + repro_rec = OrderedDict() + repro_rec['sample_data_token'] = sample_data_token + coco_rec = dict() + + relevant_keys = [ + 'attribute_tokens', + 'category_name', + 'instance_token', + 'next', + 'num_lidar_pts', + 'num_radar_pts', + 'prev', + 'sample_annotation_token', + 'sample_data_token', + 'visibility_token', + ] + + for key, value in ann_rec.items(): + if key in relevant_keys: + repro_rec[key] = value + + repro_rec['bbox_corners'] = [x1, y1, x2, y2] + repro_rec['filename'] = filename + + coco_rec['file_name'] = filename + coco_rec['image_id'] = sample_data_token + coco_rec['area'] = (y2 - y1) * (x2 - x1) + + if repro_rec['category_name'] not in NuScenesDataset.NameMapping: + return None + cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] + coco_rec['category_name'] = cat_name + coco_rec['category_id'] = nus_categories.index(cat_name) + coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] + coco_rec['iscrowd'] = 0 + + return coco_rec diff --git a/adzoo/bevformer/data_converter/s3dis_data_utils.py b/adzoo/bevformer/data_converter/s3dis_data_utils.py new file mode 100755 index 0000000..d2b6b77 --- /dev/null +++ b/adzoo/bevformer/data_converter/s3dis_data_utils.py @@ -0,0 +1,241 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import os +from concurrent import futures as futures +from os import path as osp + + +class S3DISData(object): + """S3DIS data. + + Generate s3dis infos for s3dis_converter. + + Args: + root_path (str): Root path of the raw data. + split (str): Set split type of the data. Default: 'Area_1'. + """ + + def __init__(self, root_path, split='Area_1'): + self.root_dir = root_path + self.split = split + self.data_dir = osp.join(root_path, + 'Stanford3dDataset_v1.2_Aligned_Version') + + # Following `GSDN `_, use 5 furniture + # classes for detection: table, chair, sofa, bookcase, board. 
+ self.cat_ids = np.array([7, 8, 9, 10, 11]) + self.cat_ids2class = { + cat_id: i + for i, cat_id in enumerate(list(self.cat_ids)) + } + + assert split in [ + 'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6' + ] + self.sample_id_list = os.listdir(osp.join(self.data_dir, + split)) # conferenceRoom_1 + for sample_id in self.sample_id_list: + if os.path.isfile(osp.join(self.data_dir, split, sample_id)): + self.sample_id_list.remove(sample_id) + + def __len__(self): + return len(self.sample_id_list) + + def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): + """Get data infos. + + This method gets information from the raw data. + + Args: + num_workers (int): Number of threads to be used. Default: 4. + has_label (bool): Whether the data has label. Default: True. + sample_id_list (list[int]): Index list of the sample. + Default: None. + + Returns: + infos (list[dict]): Information of the raw data. + """ + + def process_single_scene(sample_idx): + print(f'{self.split} sample_idx: {sample_idx}') + info = dict() + pc_info = { + 'num_features': 6, + 'lidar_idx': f'{self.split}_{sample_idx}' + } + info['point_cloud'] = pc_info + pts_filename = osp.join(self.root_dir, 's3dis_data', + f'{self.split}_{sample_idx}_point.npy') + pts_instance_mask_path = osp.join( + self.root_dir, 's3dis_data', + f'{self.split}_{sample_idx}_ins_label.npy') + pts_semantic_mask_path = osp.join( + self.root_dir, 's3dis_data', + f'{self.split}_{sample_idx}_sem_label.npy') + + points = np.load(pts_filename).astype(np.float32) + pts_instance_mask = np.load(pts_instance_mask_path).astype(np.int) + pts_semantic_mask = np.load(pts_semantic_mask_path).astype(np.int) + + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask')) + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask')) + + points.tofile( + osp.join(self.root_dir, 'points', + f'{self.split}_{sample_idx}.bin')) + pts_instance_mask.tofile( + osp.join(self.root_dir, 'instance_mask', + f'{self.split}_{sample_idx}.bin')) + pts_semantic_mask.tofile( + osp.join(self.root_dir, 'semantic_mask', + f'{self.split}_{sample_idx}.bin')) + + info['pts_path'] = osp.join('points', + f'{self.split}_{sample_idx}.bin') + info['pts_instance_mask_path'] = osp.join( + 'instance_mask', f'{self.split}_{sample_idx}.bin') + info['pts_semantic_mask_path'] = osp.join( + 'semantic_mask', f'{self.split}_{sample_idx}.bin') + info['annos'] = self.get_bboxes(points, pts_instance_mask, + pts_semantic_mask) + + return info + + sample_id_list = sample_id_list if sample_id_list is not None \ + else self.sample_id_list + with futures.ThreadPoolExecutor(num_workers) as executor: + infos = executor.map(process_single_scene, sample_id_list) + return list(infos) + + def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask): + """Convert instance masks to axis-aligned bounding boxes. + + Args: + points (np.array): Scene points of shape (n, 6). + pts_instance_mask (np.ndarray): Instance labels of shape (n,). + pts_semantic_mask (np.ndarray): Semantic labels of shape (n,). + + Returns: + dict: A dict containing detection infos with following keys: + + - gt_boxes_upright_depth (np.ndarray): Bounding boxes + of shape (n, 6) + - class (np.ndarray): Box labels of shape (n,) + - gt_num (int): Number of boxes. 
+ """ + bboxes, labels = [], [] + for i in range(1, pts_instance_mask.max()): + ids = pts_instance_mask == i + # check if all instance points have same semantic label + assert pts_semantic_mask[ids].min() == pts_semantic_mask[ids].max() + label = pts_semantic_mask[ids][0] + # keep only furniture objects + if label in self.cat_ids2class: + labels.append(self.cat_ids2class[pts_semantic_mask[ids][0]]) + pts = points[:, :3][ids] + min_pts = pts.min(axis=0) + max_pts = pts.max(axis=0) + locations = (min_pts + max_pts) / 2 + dimensions = max_pts - min_pts + bboxes.append(np.concatenate((locations, dimensions))) + annotation = dict() + # follow ScanNet and SUN RGB-D keys + annotation['gt_boxes_upright_depth'] = np.array(bboxes) + annotation['class'] = np.array(labels) + annotation['gt_num'] = len(labels) + return annotation + + +class S3DISSegData(object): + """S3DIS dataset used to generate infos for semantic segmentation task. + + Args: + data_root (str): Root path of the raw data. + ann_file (str): The generated scannet infos. + split (str): Set split type of the data. Default: 'train'. + num_points (int): Number of points in each data input. Default: 8192. + label_weight_func (function): Function to compute the label weight. + Default: None. + """ + + def __init__(self, + data_root, + ann_file, + split='Area_1', + num_points=4096, + label_weight_func=None): + self.data_root = data_root + self.data_infos = mmcv.load(ann_file) + self.split = split + self.num_points = num_points + + self.all_ids = np.arange(13) # all possible ids + self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12]) # used for seg task + self.ignore_index = len(self.cat_ids) + + self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \ + self.ignore_index + for i, cat_id in enumerate(self.cat_ids): + self.cat_id2class[cat_id] = i + + # label weighting function is taken from + # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 + self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \ + label_weight_func is None else label_weight_func + + def get_seg_infos(self): + scene_idxs, label_weight = self.get_scene_idxs_and_label_weight() + save_folder = osp.join(self.data_root, 'seg_info') + mmcv.mkdir_or_exist(save_folder) + np.save( + osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'), + scene_idxs) + np.save( + osp.join(save_folder, f'{self.split}_label_weight.npy'), + label_weight) + print(f'{self.split} resampled scene index and label weight saved') + + def _convert_to_label(self, mask): + """Convert class_id in loaded segmentation mask to label.""" + if isinstance(mask, str): + if mask.endswith('npy'): + mask = np.load(mask) + else: + mask = np.fromfile(mask, dtype=np.long) + label = self.cat_id2class[mask] + return label + + def get_scene_idxs_and_label_weight(self): + """Compute scene_idxs for data sampling and label weight for loss \ + calculation. + + We sample more times for scenes with more points. Label_weight is + inversely proportional to number of class points. 
+ """ + num_classes = len(self.cat_ids) + num_point_all = [] + label_weight = np.zeros((num_classes + 1, )) # ignore_index + for data_info in self.data_infos: + label = self._convert_to_label( + osp.join(self.data_root, data_info['pts_semantic_mask_path'])) + num_point_all.append(label.shape[0]) + class_count, _ = np.histogram(label, range(num_classes + 2)) + label_weight += class_count + + # repeat scene_idx for num_scene_point // num_sample_point times + sample_prob = np.array(num_point_all) / float(np.sum(num_point_all)) + num_iter = int(np.sum(num_point_all) / float(self.num_points)) + scene_idxs = [] + for idx in range(len(self.data_infos)): + scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter))) + scene_idxs = np.array(scene_idxs).astype(np.int32) + + # calculate label weight, adopted from PointNet++ + label_weight = label_weight[:-1].astype(np.float32) + label_weight = label_weight / label_weight.sum() + label_weight = self.label_weight_func(label_weight).astype(np.float32) + + return scene_idxs, label_weight diff --git a/adzoo/bevformer/data_converter/scannet_data_utils.py b/adzoo/bevformer/data_converter/scannet_data_utils.py new file mode 100755 index 0000000..a437fe0 --- /dev/null +++ b/adzoo/bevformer/data_converter/scannet_data_utils.py @@ -0,0 +1,293 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import os +from concurrent import futures as futures +from os import path as osp + + +class ScanNetData(object): + """ScanNet data. + + Generate scannet infos for scannet_converter. + + Args: + root_path (str): Root path of the raw data. + split (str): Set split type of the data. Default: 'train'. + """ + + def __init__(self, root_path, split='train'): + self.root_dir = root_path + self.split = split + self.split_dir = osp.join(root_path) + self.classes = [ + 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin' + ] + self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} + self.label2cat = {self.cat2label[t]: t for t in self.cat2label} + self.cat_ids = np.array( + [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39]) + self.cat_ids2class = { + nyu40id: i + for i, nyu40id in enumerate(list(self.cat_ids)) + } + assert split in ['train', 'val', 'test'] + split_file = osp.join(self.root_dir, 'meta_data', + f'scannetv2_{split}.txt') + mmcv.check_file_exist(split_file) + self.sample_id_list = mmcv.list_from_file(split_file) + self.test_mode = (split == 'test') + + def __len__(self): + return len(self.sample_id_list) + + def get_aligned_box_label(self, idx): + box_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_aligned_bbox.npy') + mmcv.check_file_exist(box_file) + return np.load(box_file) + + def get_unaligned_box_label(self, idx): + box_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_unaligned_bbox.npy') + mmcv.check_file_exist(box_file) + return np.load(box_file) + + def get_axis_align_matrix(self, idx): + matrix_file = osp.join(self.root_dir, 'scannet_instance_data', + f'{idx}_axis_align_matrix.npy') + mmcv.check_file_exist(matrix_file) + return np.load(matrix_file) + + def get_images(self, idx): + paths = [] + path = osp.join(self.root_dir, 'posed_images', idx) + for file in sorted(os.listdir(path)): + if file.endswith('.jpg'): + paths.append(osp.join('posed_images', idx, file)) + return paths + + def get_extrinsics(self, idx): 
+ extrinsics = [] + path = osp.join(self.root_dir, 'posed_images', idx) + for file in sorted(os.listdir(path)): + if file.endswith('.txt') and not file == 'intrinsic.txt': + extrinsics.append(np.loadtxt(osp.join(path, file))) + return extrinsics + + def get_intrinsics(self, idx): + matrix_file = osp.join(self.root_dir, 'posed_images', idx, + 'intrinsic.txt') + mmcv.check_file_exist(matrix_file) + return np.loadtxt(matrix_file) + + def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): + """Get data infos. + + This method gets information from the raw data. + + Args: + num_workers (int): Number of threads to be used. Default: 4. + has_label (bool): Whether the data has label. Default: True. + sample_id_list (list[int]): Index list of the sample. + Default: None. + + Returns: + infos (list[dict]): Information of the raw data. + """ + + def process_single_scene(sample_idx): + print(f'{self.split} sample_idx: {sample_idx}') + info = dict() + pc_info = {'num_features': 6, 'lidar_idx': sample_idx} + info['point_cloud'] = pc_info + pts_filename = osp.join(self.root_dir, 'scannet_instance_data', + f'{sample_idx}_vert.npy') + points = np.load(pts_filename) + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) + points.tofile( + osp.join(self.root_dir, 'points', f'{sample_idx}.bin')) + info['pts_path'] = osp.join('points', f'{sample_idx}.bin') + + # update with RGB image paths if exist + if os.path.exists(osp.join(self.root_dir, 'posed_images')): + info['intrinsics'] = self.get_intrinsics(sample_idx) + all_extrinsics = self.get_extrinsics(sample_idx) + all_img_paths = self.get_images(sample_idx) + # some poses in ScanNet are invalid + extrinsics, img_paths = [], [] + for extrinsic, img_path in zip(all_extrinsics, all_img_paths): + if np.all(np.isfinite(extrinsic)): + img_paths.append(img_path) + extrinsics.append(extrinsic) + info['extrinsics'] = extrinsics + info['img_paths'] = img_paths + + if not self.test_mode: + pts_instance_mask_path = osp.join( + self.root_dir, 'scannet_instance_data', + f'{sample_idx}_ins_label.npy') + pts_semantic_mask_path = osp.join( + self.root_dir, 'scannet_instance_data', + f'{sample_idx}_sem_label.npy') + + pts_instance_mask = np.load(pts_instance_mask_path).astype( + np.long) + pts_semantic_mask = np.load(pts_semantic_mask_path).astype( + np.long) + + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask')) + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask')) + + pts_instance_mask.tofile( + osp.join(self.root_dir, 'instance_mask', + f'{sample_idx}.bin')) + pts_semantic_mask.tofile( + osp.join(self.root_dir, 'semantic_mask', + f'{sample_idx}.bin')) + + info['pts_instance_mask_path'] = osp.join( + 'instance_mask', f'{sample_idx}.bin') + info['pts_semantic_mask_path'] = osp.join( + 'semantic_mask', f'{sample_idx}.bin') + + if has_label: + annotations = {} + # box is of shape [k, 6 + class] + aligned_box_label = self.get_aligned_box_label(sample_idx) + unaligned_box_label = self.get_unaligned_box_label(sample_idx) + annotations['gt_num'] = aligned_box_label.shape[0] + if annotations['gt_num'] != 0: + aligned_box = aligned_box_label[:, :-1] # k, 6 + unaligned_box = unaligned_box_label[:, :-1] + classes = aligned_box_label[:, -1] # k + annotations['name'] = np.array([ + self.label2cat[self.cat_ids2class[classes[i]]] + for i in range(annotations['gt_num']) + ]) + # default names are given to aligned bbox for compatibility + # we also save unaligned bbox info with marked names + annotations['location'] = aligned_box[:, :3] + 
annotations['dimensions'] = aligned_box[:, 3:6] + annotations['gt_boxes_upright_depth'] = aligned_box + annotations['unaligned_location'] = unaligned_box[:, :3] + annotations['unaligned_dimensions'] = unaligned_box[:, 3:6] + annotations[ + 'unaligned_gt_boxes_upright_depth'] = unaligned_box + annotations['index'] = np.arange( + annotations['gt_num'], dtype=np.int32) + annotations['class'] = np.array([ + self.cat_ids2class[classes[i]] + for i in range(annotations['gt_num']) + ]) + axis_align_matrix = self.get_axis_align_matrix(sample_idx) + annotations['axis_align_matrix'] = axis_align_matrix # 4x4 + info['annos'] = annotations + return info + + sample_id_list = sample_id_list if sample_id_list is not None \ + else self.sample_id_list + with futures.ThreadPoolExecutor(num_workers) as executor: + infos = executor.map(process_single_scene, sample_id_list) + return list(infos) + + +class ScanNetSegData(object): + """ScanNet dataset used to generate infos for semantic segmentation task. + + Args: + data_root (str): Root path of the raw data. + ann_file (str): The generated scannet infos. + split (str): Set split type of the data. Default: 'train'. + num_points (int): Number of points in each data input. Default: 8192. + label_weight_func (function): Function to compute the label weight. + Default: None. + """ + + def __init__(self, + data_root, + ann_file, + split='train', + num_points=8192, + label_weight_func=None): + self.data_root = data_root + self.data_infos = mmcv.load(ann_file) + self.split = split + assert split in ['train', 'val', 'test'] + self.num_points = num_points + + self.all_ids = np.arange(41) # all possible ids + self.cat_ids = np.array([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, + 39 + ]) # used for seg task + self.ignore_index = len(self.cat_ids) + + self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \ + self.ignore_index + for i, cat_id in enumerate(self.cat_ids): + self.cat_id2class[cat_id] = i + + # label weighting function is taken from + # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 + self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \ + label_weight_func is None else label_weight_func + + def get_seg_infos(self): + if self.split == 'test': + return + scene_idxs, label_weight = self.get_scene_idxs_and_label_weight() + save_folder = osp.join(self.data_root, 'seg_info') + mmcv.mkdir_or_exist(save_folder) + np.save( + osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'), + scene_idxs) + np.save( + osp.join(save_folder, f'{self.split}_label_weight.npy'), + label_weight) + print(f'{self.split} resampled scene index and label weight saved') + + def _convert_to_label(self, mask): + """Convert class_id in loaded segmentation mask to label.""" + if isinstance(mask, str): + if mask.endswith('npy'): + mask = np.load(mask) + else: + mask = np.fromfile(mask, dtype=np.long) + label = self.cat_id2class[mask] + return label + + def get_scene_idxs_and_label_weight(self): + """Compute scene_idxs for data sampling and label weight for loss \ + calculation. + + We sample more times for scenes with more points. Label_weight is + inversely proportional to number of class points. 
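+
+ Labels mapped to ``ignore_index`` (ids outside ``cat_ids``) are counted in a
+ separate bin and dropped before the class weights are normalized.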
+ """ + num_classes = len(self.cat_ids) + num_point_all = [] + label_weight = np.zeros((num_classes + 1, )) # ignore_index + for data_info in self.data_infos: + label = self._convert_to_label( + osp.join(self.data_root, data_info['pts_semantic_mask_path'])) + num_point_all.append(label.shape[0]) + class_count, _ = np.histogram(label, range(num_classes + 2)) + label_weight += class_count + + # repeat scene_idx for num_scene_point // num_sample_point times + sample_prob = np.array(num_point_all) / float(np.sum(num_point_all)) + num_iter = int(np.sum(num_point_all) / float(self.num_points)) + scene_idxs = [] + for idx in range(len(self.data_infos)): + scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter))) + scene_idxs = np.array(scene_idxs).astype(np.int32) + + # calculate label weight, adopted from PointNet++ + label_weight = label_weight[:-1].astype(np.float32) + label_weight = label_weight / label_weight.sum() + label_weight = self.label_weight_func(label_weight).astype(np.float32) + + return scene_idxs, label_weight diff --git a/adzoo/bevformer/data_converter/sunrgbd_data_utils.py b/adzoo/bevformer/data_converter/sunrgbd_data_utils.py new file mode 100755 index 0000000..9f8a502 --- /dev/null +++ b/adzoo/bevformer/data_converter/sunrgbd_data_utils.py @@ -0,0 +1,221 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from concurrent import futures as futures +from os import path as osp +from scipy import io as sio + + +def random_sampling(points, num_points, replace=None, return_choices=False): + """Random sampling. + + Sampling point cloud to a certain number of points. + + Args: + points (ndarray): Point cloud. + num_points (int): The number of samples. + replace (bool): Whether the sample is with or without replacement. + return_choices (bool): Whether to return choices. + + Returns: + points (ndarray): Point cloud after sampling. + """ + + if replace is None: + replace = (points.shape[0] < num_points) + choices = np.random.choice(points.shape[0], num_points, replace=replace) + if return_choices: + return points[choices], choices + else: + return points[choices] + + +class SUNRGBDInstance(object): + + def __init__(self, line): + data = line.split(' ') + data[1:] = [float(x) for x in data[1:]] + self.classname = data[0] + self.xmin = data[1] + self.ymin = data[2] + self.xmax = data[1] + data[3] + self.ymax = data[2] + data[4] + self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax]) + self.centroid = np.array([data[5], data[6], data[7]]) + self.w = data[8] + self.l = data[9] # noqa: E741 + self.h = data[10] + self.orientation = np.zeros((3, )) + self.orientation[0] = data[11] + self.orientation[1] = data[12] + self.heading_angle = -1 * np.arctan2(self.orientation[1], + self.orientation[0]) + self.box3d = np.concatenate([ + self.centroid, + np.array([self.l * 2, self.w * 2, self.h * 2, self.heading_angle]) + ]) + + +class SUNRGBDData(object): + """SUNRGBD data. + + Generate scannet infos for sunrgbd_converter. + + Args: + root_path (str): Root path of the raw data. + split (str): Set split type of the data. Default: 'train'. + use_v1 (bool): Whether to use v1. Default: False. 
+ """ + + def __init__(self, root_path, split='train', use_v1=False): + self.root_dir = root_path + self.split = split + self.split_dir = osp.join(root_path, 'sunrgbd_trainval') + self.classes = [ + 'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub' + ] + self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} + self.label2cat = { + label: self.classes[label] + for label in range(len(self.classes)) + } + assert split in ['train', 'val', 'test'] + split_file = osp.join(self.split_dir, f'{split}_data_idx.txt') + mmcv.check_file_exist(split_file) + self.sample_id_list = map(int, mmcv.list_from_file(split_file)) + self.image_dir = osp.join(self.split_dir, 'image') + self.calib_dir = osp.join(self.split_dir, 'calib') + self.depth_dir = osp.join(self.split_dir, 'depth') + if use_v1: + self.label_dir = osp.join(self.split_dir, 'label_v1') + else: + self.label_dir = osp.join(self.split_dir, 'label') + + def __len__(self): + return len(self.sample_id_list) + + def get_image(self, idx): + img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg') + return mmcv.imread(img_filename) + + def get_image_shape(self, idx): + image = self.get_image(idx) + return np.array(image.shape[:2], dtype=np.int32) + + def get_depth(self, idx): + depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat') + depth = sio.loadmat(depth_filename)['instance'] + return depth + + def get_calibration(self, idx): + calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt') + lines = [line.rstrip() for line in open(calib_filepath)] + Rt = np.array([float(x) for x in lines[0].split(' ')]) + Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32) + K = np.array([float(x) for x in lines[1].split(' ')]) + K = np.reshape(K, (3, 3), order='F').astype(np.float32) + return K, Rt + + def get_label_objects(self, idx): + label_filename = osp.join(self.label_dir, f'{idx:06d}.txt') + lines = [line.rstrip() for line in open(label_filename)] + objects = [SUNRGBDInstance(line) for line in lines] + return objects + + def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): + """Get data infos. + + This method gets information from the raw data. + + Args: + num_workers (int): Number of threads to be used. Default: 4. + has_label (bool): Whether the data has label. Default: True. + sample_id_list (list[int]): Index list of the sample. + Default: None. + + Returns: + infos (list[dict]): Information of the raw data. + """ + + def process_single_scene(sample_idx): + print(f'{self.split} sample_idx: {sample_idx}') + # convert depth to points + SAMPLE_NUM = 50000 + # TODO: Check whether can move the point + # sampling process during training. 
+ pc_upright_depth = self.get_depth(sample_idx) + pc_upright_depth_subsampled = random_sampling( + pc_upright_depth, SAMPLE_NUM) + + info = dict() + pc_info = {'num_features': 6, 'lidar_idx': sample_idx} + info['point_cloud'] = pc_info + + mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) + pc_upright_depth_subsampled.tofile( + osp.join(self.root_dir, 'points', f'{sample_idx:06d}.bin')) + + info['pts_path'] = osp.join('points', f'{sample_idx:06d}.bin') + img_path = osp.join('image', f'{sample_idx:06d}.jpg') + image_info = { + 'image_idx': sample_idx, + 'image_shape': self.get_image_shape(sample_idx), + 'image_path': img_path + } + info['image'] = image_info + + K, Rt = self.get_calibration(sample_idx) + calib_info = {'K': K, 'Rt': Rt} + info['calib'] = calib_info + + if has_label: + obj_list = self.get_label_objects(sample_idx) + annotations = {} + annotations['gt_num'] = len([ + obj.classname for obj in obj_list + if obj.classname in self.cat2label.keys() + ]) + if annotations['gt_num'] != 0: + annotations['name'] = np.array([ + obj.classname for obj in obj_list + if obj.classname in self.cat2label.keys() + ]) + annotations['bbox'] = np.concatenate([ + obj.box2d.reshape(1, 4) for obj in obj_list + if obj.classname in self.cat2label.keys() + ], + axis=0) + annotations['location'] = np.concatenate([ + obj.centroid.reshape(1, 3) for obj in obj_list + if obj.classname in self.cat2label.keys() + ], + axis=0) + annotations['dimensions'] = 2 * np.array([ + [obj.l, obj.w, obj.h] for obj in obj_list + if obj.classname in self.cat2label.keys() + ]) # lwh (depth) format + annotations['rotation_y'] = np.array([ + obj.heading_angle for obj in obj_list + if obj.classname in self.cat2label.keys() + ]) + annotations['index'] = np.arange( + len(obj_list), dtype=np.int32) + annotations['class'] = np.array([ + self.cat2label[obj.classname] for obj in obj_list + if obj.classname in self.cat2label.keys() + ]) + annotations['gt_boxes_upright_depth'] = np.stack( + [ + obj.box3d for obj in obj_list + if obj.classname in self.cat2label.keys() + ], + axis=0) # (K,8) + info['annos'] = annotations + return info + + sample_id_list = sample_id_list if \ + sample_id_list is not None else self.sample_id_list + with futures.ThreadPoolExecutor(num_workers) as executor: + infos = executor.map(process_single_scene, sample_id_list) + return list(infos) diff --git a/adzoo/bevformer/data_converter/waymo_converter.py b/adzoo/bevformer/data_converter/waymo_converter.py new file mode 100755 index 0000000..94fcae1 --- /dev/null +++ b/adzoo/bevformer/data_converter/waymo_converter.py @@ -0,0 +1,519 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Adapted from `Waymo to KITTI converter + `_. +""" + +try: + from waymo_open_dataset import dataset_pb2 +except ImportError: + raise ImportError( + 'Please run "pip install waymo-open-dataset-tf-2-2-0==1.2.0" ' + 'to install the official devkit first.') + +import mmcv +import numpy as np +import tensorflow as tf +from glob import glob +from os.path import join +from waymo_open_dataset.utils import range_image_utils, transform_utils +from waymo_open_dataset.utils.frame_utils import \ + parse_range_image_and_camera_projection + + +class Waymo2KITTI(object): + """Waymo to KITTI converter. + + This class serves as the converter to change the waymo raw data to KITTI + format. + + Args: + load_dir (str): Directory to load waymo raw data. + save_dir (str): Directory to save data in KITTI format. + prefix (str): Prefix of filename. 
In general, 0 for training, 1 for + validation and 2 for testing. + workers (str): Number of workers for the parallel process. + test_mode (bool): Whether in the test_mode. Default: False. + """ + + def __init__(self, + load_dir, + save_dir, + prefix, + workers=64, + test_mode=False): + self.filter_empty_3dboxes = True + self.filter_no_label_zone_points = True + + self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] + + # Only data collected in specific locations will be converted + # If set None, this filter is disabled + # Available options: location_sf (main dataset) + self.selected_waymo_locations = None + self.save_track_id = False + + # turn on eager execution for older tensorflow versions + if int(tf.__version__.split('.')[0]) < 2: + tf.enable_eager_execution() + + self.lidar_list = [ + '_FRONT', '_FRONT_RIGHT', '_FRONT_LEFT', '_SIDE_RIGHT', + '_SIDE_LEFT' + ] + self.type_list = [ + 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST' + ] + self.waymo_to_kitti_class_map = { + 'UNKNOWN': 'DontCare', + 'PEDESTRIAN': 'Pedestrian', + 'VEHICLE': 'Car', + 'CYCLIST': 'Cyclist', + 'SIGN': 'Sign' # not in kitti + } + + self.load_dir = load_dir + self.save_dir = save_dir + self.prefix = prefix + self.workers = int(workers) + self.test_mode = test_mode + + self.tfrecord_pathnames = sorted( + glob(join(self.load_dir, '*.tfrecord'))) + + self.label_save_dir = f'{self.save_dir}/label_' + self.label_all_save_dir = f'{self.save_dir}/label_all' + self.image_save_dir = f'{self.save_dir}/image_' + self.calib_save_dir = f'{self.save_dir}/calib' + self.point_cloud_save_dir = f'{self.save_dir}/velodyne' + self.pose_save_dir = f'{self.save_dir}/pose' + + self.create_folder() + + def convert(self): + """Convert action.""" + print('Start converting ...') + mmcv.track_parallel_progress(self.convert_one, range(len(self)), + self.workers) + print('\nFinished ...') + + def convert_one(self, file_idx): + """Convert action for single file. + + Args: + file_idx (int): Index of the file to be converted. + """ + pathname = self.tfrecord_pathnames[file_idx] + dataset = tf.data.TFRecordDataset(pathname, compression_type='') + + for frame_idx, data in enumerate(dataset): + + if frame_idx % 5 != 0: + continue + # print(frame_idx) + frame = dataset_pb2.Frame() + frame.ParseFromString(bytearray(data.numpy())) + if (self.selected_waymo_locations is not None + and frame.context.stats.location + not in self.selected_waymo_locations): + continue + + self.save_image(frame, file_idx, frame_idx) + self.save_calib(frame, file_idx, frame_idx) + self.save_lidar(frame, file_idx, frame_idx) + self.save_pose(frame, file_idx, frame_idx) + + if not self.test_mode: + self.save_label(frame, file_idx, frame_idx) + + def __len__(self): + """Length of the filename list.""" + return len(self.tfrecord_pathnames) + + def save_image(self, frame, file_idx, frame_idx): + """Parse and save the images in png format. + + Args: + frame (:obj:`Frame`): Open dataset frame proto. + file_idx (int): Current file index. + frame_idx (int): Current frame index. + """ + for img in frame.images: + img_path = f'{self.image_save_dir}{str(img.name - 1)}/' + \ + f'{self.prefix}{str(file_idx).zfill(3)}' + \ + f'{str(frame_idx).zfill(3)}.png' + img = mmcv.imfrombytes(img.image) + mmcv.imwrite(img, img_path) + + def save_calib(self, frame, file_idx, frame_idx): + """Parse and save the calibration data. + + Args: + frame (:obj:`Frame`): Open dataset frame proto. + file_idx (int): Current file index. + frame_idx (int): Current frame index. 
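+
+ The resulting text file follows the KITTI calib layout: projection matrices
+ P0-P4, R0_rect and Tr_velo_to_cam_0 ... Tr_velo_to_cam_4.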
+ """ + # waymo front camera to kitti reference camera + T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], + [1.0, 0.0, 0.0]]) + camera_calibs = [] + R0_rect = [f'{i:e}' for i in np.eye(3).flatten()] + Tr_velo_to_cams = [] + calib_context = '' + + for camera in frame.context.camera_calibrations: + # extrinsic parameters + T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( + 4, 4) + T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) + Tr_velo_to_cam = \ + self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam + if camera.name == 1: # FRONT = 1, see dataset.proto for details + self.T_velo_to_front_cam = Tr_velo_to_cam.copy() + Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, )) + Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam]) + + # intrinsic parameters + camera_calib = np.zeros((3, 4)) + camera_calib[0, 0] = camera.intrinsic[0] + camera_calib[1, 1] = camera.intrinsic[1] + camera_calib[0, 2] = camera.intrinsic[2] + camera_calib[1, 2] = camera.intrinsic[3] + camera_calib[2, 2] = 1 + camera_calib = list(camera_calib.reshape(12)) + camera_calib = [f'{i:e}' for i in camera_calib] + camera_calibs.append(camera_calib) + + # all camera ids are saved as id-1 in the result because + # camera 0 is unknown in the proto + for i in range(5): + calib_context += 'P' + str(i) + ': ' + \ + ' '.join(camera_calibs[i]) + '\n' + calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n' + for i in range(5): + calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \ + ' '.join(Tr_velo_to_cams[i]) + '\n' + + with open( + f'{self.calib_save_dir}/{self.prefix}' + + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', + 'w+') as fp_calib: + fp_calib.write(calib_context) + fp_calib.close() + + def save_lidar(self, frame, file_idx, frame_idx): + """Parse and save the lidar data in psd format. + + Args: + frame (:obj:`Frame`): Open dataset frame proto. + file_idx (int): Current file index. + frame_idx (int): Current frame index. + """ + range_images, camera_projections, range_image_top_pose = \ + parse_range_image_and_camera_projection(frame) + + # First return + points_0, cp_points_0, intensity_0, elongation_0 = \ + self.convert_range_image_to_point_cloud( + frame, + range_images, + camera_projections, + range_image_top_pose, + ri_index=0 + ) + points_0 = np.concatenate(points_0, axis=0) + intensity_0 = np.concatenate(intensity_0, axis=0) + elongation_0 = np.concatenate(elongation_0, axis=0) + + # Second return + points_1, cp_points_1, intensity_1, elongation_1 = \ + self.convert_range_image_to_point_cloud( + frame, + range_images, + camera_projections, + range_image_top_pose, + ri_index=1 + ) + points_1 = np.concatenate(points_1, axis=0) + intensity_1 = np.concatenate(intensity_1, axis=0) + elongation_1 = np.concatenate(elongation_1, axis=0) + + points = np.concatenate([points_0, points_1], axis=0) + intensity = np.concatenate([intensity_0, intensity_1], axis=0) + elongation = np.concatenate([elongation_0, elongation_1], axis=0) + timestamp = frame.timestamp_micros * np.ones_like(intensity) + + # concatenate x,y,z, intensity, elongation, timestamp (6-dim) + point_cloud = np.column_stack( + (points, intensity, elongation, timestamp)) + + pc_path = f'{self.point_cloud_save_dir}/{self.prefix}' + \ + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin' + point_cloud.astype(np.float32).tofile(pc_path) + + def save_label(self, frame, file_idx, frame_idx): + """Parse and save the label data in txt format. 
+ The relation between waymo and kitti coordinates is noteworthy: + 1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti) + 2. x-y-z: front-left-up (waymo) -> right-down-front(kitti) + 3. bbox origin at volumetric center (waymo) -> bottom center (kitti) + 4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo) + + Args: + frame (:obj:`Frame`): Open dataset frame proto. + file_idx (int): Current file index. + frame_idx (int): Current frame index. + """ + fp_label_all = open( + f'{self.label_all_save_dir}/{self.prefix}' + + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+') + id_to_bbox = dict() + id_to_name = dict() + for labels in frame.projected_lidar_labels: + name = labels.name + for label in labels.labels: + # TODO: need a workaround as bbox may not belong to front cam + bbox = [ + label.box.center_x - label.box.length / 2, + label.box.center_y - label.box.width / 2, + label.box.center_x + label.box.length / 2, + label.box.center_y + label.box.width / 2 + ] + id_to_bbox[label.id] = bbox + id_to_name[label.id] = name - 1 + + for obj in frame.laser_labels: + bounding_box = None + name = None + id = obj.id + for lidar in self.lidar_list: + if id + lidar in id_to_bbox: + bounding_box = id_to_bbox.get(id + lidar) + name = str(id_to_name.get(id + lidar)) + break + + if bounding_box is None or name is None: + name = '0' + bounding_box = (0, 0, 0, 0) + + my_type = self.type_list[obj.type] + + if my_type not in self.selected_waymo_classes: + continue + + if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: + continue + + my_type = self.waymo_to_kitti_class_map[my_type] + + height = obj.box.height + width = obj.box.width + length = obj.box.length + + x = obj.box.center_x + y = obj.box.center_y + z = obj.box.center_z - height / 2 + + # project bounding box to the virtual reference frame + pt_ref = self.T_velo_to_front_cam @ \ + np.array([x, y, z, 1]).reshape((4, 1)) + x, y, z, _ = pt_ref.flatten().tolist() + + rotation_y = -obj.box.heading - np.pi / 2 + track_id = obj.id + + # not available + truncated = 0 + occluded = 0 + alpha = -10 + + line = my_type + \ + ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format( + round(truncated, 2), occluded, round(alpha, 2), + round(bounding_box[0], 2), round(bounding_box[1], 2), + round(bounding_box[2], 2), round(bounding_box[3], 2), + round(height, 2), round(width, 2), round(length, 2), + round(x, 2), round(y, 2), round(z, 2), + round(rotation_y, 2)) + + if self.save_track_id: + line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n' + else: + line_all = line[:-1] + ' ' + name + '\n' + + fp_label = open( + f'{self.label_save_dir}{name}/{self.prefix}' + + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'a') + fp_label.write(line) + fp_label.close() + + fp_label_all.write(line_all) + + fp_label_all.close() + + def save_pose(self, frame, file_idx, frame_idx): + """Parse and save the pose data. + + Note that SDC's own pose is not included in the regular training + of KITTI dataset. KITTI raw dataset contains ego motion files + but are not often used. Pose is important for algorithms that + take advantage of the temporal information. + + Args: + frame (:obj:`Frame`): Open dataset frame proto. + file_idx (int): Current file index. + frame_idx (int): Current frame index. 
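+
+ The ego pose of the frame is written as a plain-text 4x4 transform matrix,
+ one file per frame.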
+ """ + pose = np.array(frame.pose.transform).reshape(4, 4) + np.savetxt( + join(f'{self.pose_save_dir}/{self.prefix}' + + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), + pose) + + def create_folder(self): + """Create folder for data preprocessing.""" + if not self.test_mode: + dir_list1 = [ + self.label_all_save_dir, self.calib_save_dir, + self.point_cloud_save_dir, self.pose_save_dir + ] + dir_list2 = [self.label_save_dir, self.image_save_dir] + else: + dir_list1 = [ + self.calib_save_dir, self.point_cloud_save_dir, + self.pose_save_dir + ] + dir_list2 = [self.image_save_dir] + for d in dir_list1: + mmcv.mkdir_or_exist(d) + for d in dir_list2: + for i in range(5): + mmcv.mkdir_or_exist(f'{d}{str(i)}') + + def convert_range_image_to_point_cloud(self, + frame, + range_images, + camera_projections, + range_image_top_pose, + ri_index=0): + """Convert range images to point cloud. + + Args: + frame (:obj:`Frame`): Open dataset frame. + range_images (dict): Mapping from laser_name to list of two + range images corresponding with two returns. + camera_projections (dict): Mapping from laser_name to list of two + camera projections corresponding with two returns. + range_image_top_pose (:obj:`Transform`): Range image pixel pose for + top lidar. + ri_index (int): 0 for the first return, 1 for the second return. + Default: 0. + + Returns: + tuple[list[np.ndarray]]: (List of points with shape [N, 3], + camera projections of points with shape [N, 6], intensity + with shape [N, 1], elongation with shape [N, 1]). All the + lists have the length of lidar numbers (5). + """ + calibrations = sorted( + frame.context.laser_calibrations, key=lambda c: c.name) + points = [] + cp_points = [] + intensity = [] + elongation = [] + + frame_pose = tf.convert_to_tensor( + value=np.reshape(np.array(frame.pose.transform), [4, 4])) + # [H, W, 6] + range_image_top_pose_tensor = tf.reshape( + tf.convert_to_tensor(value=range_image_top_pose.data), + range_image_top_pose.shape.dims) + # [H, W, 3, 3] + range_image_top_pose_tensor_rotation = \ + transform_utils.get_rotation_matrix( + range_image_top_pose_tensor[..., 0], + range_image_top_pose_tensor[..., 1], + range_image_top_pose_tensor[..., 2]) + range_image_top_pose_tensor_translation = \ + range_image_top_pose_tensor[..., 3:] + range_image_top_pose_tensor = transform_utils.get_transform( + range_image_top_pose_tensor_rotation, + range_image_top_pose_tensor_translation) + for c in calibrations: + range_image = range_images[c.name][ri_index] + if len(c.beam_inclinations) == 0: + beam_inclinations = range_image_utils.compute_inclination( + tf.constant( + [c.beam_inclination_min, c.beam_inclination_max]), + height=range_image.shape.dims[0]) + else: + beam_inclinations = tf.constant(c.beam_inclinations) + + beam_inclinations = tf.reverse(beam_inclinations, axis=[-1]) + extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4]) + + range_image_tensor = tf.reshape( + tf.convert_to_tensor(value=range_image.data), + range_image.shape.dims) + pixel_pose_local = None + frame_pose_local = None + if c.name == dataset_pb2.LaserName.TOP: + pixel_pose_local = range_image_top_pose_tensor + pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0) + frame_pose_local = tf.expand_dims(frame_pose, axis=0) + range_image_mask = range_image_tensor[..., 0] > 0 + + if self.filter_no_label_zone_points: + nlz_mask = range_image_tensor[..., 3] != 1.0 # 1.0: in NLZ + range_image_mask = range_image_mask & nlz_mask + + range_image_cartesian = \ + 
range_image_utils.extract_point_cloud_from_range_image( + tf.expand_dims(range_image_tensor[..., 0], axis=0), + tf.expand_dims(extrinsic, axis=0), + tf.expand_dims(tf.convert_to_tensor( + value=beam_inclinations), axis=0), + pixel_pose=pixel_pose_local, + frame_pose=frame_pose_local) + + range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0) + points_tensor = tf.gather_nd(range_image_cartesian, + tf.compat.v1.where(range_image_mask)) + + cp = camera_projections[c.name][ri_index] + cp_tensor = tf.reshape( + tf.convert_to_tensor(value=cp.data), cp.shape.dims) + cp_points_tensor = tf.gather_nd( + cp_tensor, tf.compat.v1.where(range_image_mask)) + points.append(points_tensor.numpy()) + cp_points.append(cp_points_tensor.numpy()) + + intensity_tensor = tf.gather_nd(range_image_tensor[..., 1], + tf.where(range_image_mask)) + intensity.append(intensity_tensor.numpy()) + + elongation_tensor = tf.gather_nd(range_image_tensor[..., 2], + tf.where(range_image_mask)) + elongation.append(elongation_tensor.numpy()) + + return points, cp_points, intensity, elongation + + def cart_to_homo(self, mat): + """Convert transformation matrix in Cartesian coordinates to + homogeneous format. + + Args: + mat (np.ndarray): Transformation matrix in Cartesian. + The input matrix shape is 3x3 or 3x4. + + Returns: + np.ndarray: Transformation matrix in homogeneous format. + The matrix shape is 4x4. + """ + ret = np.eye(4) + if mat.shape == (3, 3): + ret[:3, :3] = mat + elif mat.shape == (3, 4): + ret[:3, :] = mat + else: + raise ValueError(mat.shape) + return ret diff --git a/adzoo/bevformer/dist_test.sh b/adzoo/bevformer/dist_test.sh new file mode 100755 index 0000000..8b19a04 --- /dev/null +++ b/adzoo/bevformer/dist_test.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +PORT=${PORT:-29203} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox diff --git a/adzoo/bevformer/dist_train.sh b/adzoo/bevformer/dist_train.sh new file mode 100755 index 0000000..84d7fd7 --- /dev/null +++ b/adzoo/bevformer/dist_train.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-38912} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} diff --git a/adzoo/bevformer/fp16/dist_train.sh b/adzoo/bevformer/fp16/dist_train.sh new file mode 100755 index 0000000..4ac9a15 --- /dev/null +++ b/adzoo/bevformer/fp16/dist_train.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-28508} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic diff --git a/adzoo/bevformer/fp16/train.py b/adzoo/bevformer/fp16/train.py new file mode 100644 index 0000000..eddc349 --- /dev/null +++ b/adzoo/bevformer/fp16/train.py @@ -0,0 +1,271 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
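+# FP16 training entry point. It is normally launched through
+# adzoo/bevformer/fp16/dist_train.sh, e.g.
+#   ./adzoo/bevformer/fp16/dist_train.sh $CONFIG $GPUS
+# which wraps torch.distributed.launch and appends --launcher pytorch
+# --deterministic ($CONFIG and $GPUS are placeholders for the config path and
+# GPU count).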
+from __future__ import division + +import argparse +import copy +import mmcv +import os +import time +import torch +import warnings +from mmcv import Config, DictAction +from mmcv.runner import get_dist_info, init_dist, wrap_fp16_model +from os import path as osp + +from mmdet import __version__ as mmdet_version +from mmdet3d import __version__ as mmdet3d_version +#from mmdet3d.apis import train_model + +from mmdet3d.datasets import build_dataset +from mmdet3d.models import build_model +from mmdet3d.utils import collect_env, get_root_logger +from mmdet.apis import set_random_seed +from mmseg import __version__ as mmseg_version + +from mmcv.utils import TORCH_VERSION, digit_version + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both specified, ' + '--options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. 
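+ # cfg.custom_imports lets a config list extra modules (e.g. plugin packages)
+ # by dotted path so that their registry entries are imported before the model
+ # and datasets are built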
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + from projects.mmdet3d_plugin.bevformer.apis import custom_train_model + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + #if args.resume_from is not None: + + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) != digit_version('1.8.1'): + cfg.optimizer['type'] = 'AdamW' + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + assert False, 'DOT NOT SUPPORT!!!' 
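+ # non-distributed launching is deliberately unsupported in this script; start
+ # training through dist_train.sh so torch.distributed.launch sets up the
+ # process group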
+ distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + # specify logger name, if we still use 'mmdet', the output info will be + # filtered and won't be saved in the log_file + # TODO: ugly workaround to judge whether we are training det or seg model + if cfg.model.type in ['EncoderDecoder3D']: + logger_name = 'mmseg' + else: + logger_name = 'mmdet' + logger = get_root_logger( + log_file=log_file, log_level=cfg.log_level, name=logger_name) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + eval_model_config = copy.deepcopy(cfg.model) + eval_model = build_model( + eval_model_config, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(eval_model) + + #eval_model.init_weights() + eval_model.load_state_dict(model.state_dict()) + + logger.info(f'Model:\n{model}') + from projects.mmdet3d_plugin.datasets import custom_build_dataset + datasets = [custom_build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(custom_build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + mmdet_version=mmdet_version, + mmseg_version=mmseg_version, + mmdet3d_version=mmdet3d_version, + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + eval_model=eval_model, + 
distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/misc/browse_dataset.py b/adzoo/bevformer/misc/browse_dataset.py new file mode 100755 index 0000000..e3419f6 --- /dev/null +++ b/adzoo/bevformer/misc/browse_dataset.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import numpy as np +import warnings +from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress +from os import path as osp + +from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, + DepthInstance3DBoxes, LiDARInstance3DBoxes) +from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, + show_seg_result) +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--skip-type', + type=str, + nargs='+', + default=['Normalize'], + help='skip some useless pipeline') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument( + '--task', + type=str, + choices=['det', 'seg', 'multi_modality-det', 'mono-det'], + help='Determine the visualization method depending on the task.') + parser.add_argument( + '--online', + action='store_true', + help='Whether to perform online visualization. Note that you often ' + 'need a monitor to do so.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def build_data_cfg(config_path, skip_type, cfg_options): + """Build data config for loading visualization data.""" + cfg = Config.fromfile(config_path) + if cfg_options is not None: + cfg.merge_from_dict(cfg_options) + # import modules from string list. 
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # extract inner dataset of `RepeatDataset` as `cfg.data.train` + # so we don't need to worry about it later + if cfg.data.train['type'] == 'RepeatDataset': + cfg.data.train = cfg.data.train.dataset + # use only first dataset for `ConcatDataset` + if cfg.data.train['type'] == 'ConcatDataset': + cfg.data.train = cfg.data.train.datasets[0] + train_data_cfg = cfg.data.train + # eval_pipeline purely consists of loading functions + # use eval_pipeline for data loading + train_data_cfg['pipeline'] = [ + x for x in cfg.eval_pipeline if x['type'] not in skip_type + ] + + return cfg + + +def to_depth_mode(points, bboxes): + """Convert points and bboxes to Depth Coord and Depth Box mode.""" + if points is not None: + points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + if bboxes is not None: + bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, + Box3DMode.DEPTH) + return points, bboxes + + +def show_det_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and 3D bboxes.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor + if dataset.box_mode_3d != Box3DMode.DEPTH: + points, gt_bboxes = to_depth_mode(points, gt_bboxes) + show_result( + points, + gt_bboxes.clone(), + None, + out_dir, + filename, + show=show, + snapshot=True) + + +def show_seg_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and segmentation mask.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_seg = example['pts_semantic_mask']._data.numpy() + show_seg_result( + points, + gt_seg.copy(), + None, + out_dir, + filename, + np.array(dataset.PALETTE), + dataset.ignore_index, + show=show, + snapshot=True) + + +def show_proj_bbox_img(idx, + dataset, + out_dir, + filename, + show=False, + is_nus_mono=False): + """Visualize 3D bboxes on 2D image by projection.""" + try: + example = dataset.prepare_train_data(idx) + except AttributeError: # for Mono-3D datasets + example = dataset.prepare_train_img(idx) + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] + img_metas = example['img_metas']._data + img = example['img']._data.numpy() + # need to transpose channel to first dim + img = img.transpose(1, 2, 0) + # no 3D gt bboxes, just show img + if gt_bboxes.tensor.shape[0] == 0: + gt_bboxes = None + if isinstance(gt_bboxes, DepthInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + None, + out_dir, + filename, + box_mode='depth', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, LiDARInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['lidar2img'], + out_dir, + filename, + box_mode='lidar', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, CameraInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['cam2img'], + out_dir, + filename, + box_mode='camera', + img_metas=img_metas, + show=show) + else: + # can't project, just show img + warnings.warn( + f'unrecognized gt box type {type(gt_bboxes)}, only show image') + show_multi_modality_result( + img, None, None, None, out_dir, filename, show=show) + + +def main(): + args = parse_args() + + if args.output_dir is not None: + mkdir_or_exist(args.output_dir) + + cfg = 
build_data_cfg(args.config, args.skip_type, args.cfg_options) + try: + dataset = build_dataset( + cfg.data.train, default_args=dict(filter_empty_gt=False)) + except TypeError: # seg dataset doesn't have `filter_empty_gt` key + dataset = build_dataset(cfg.data.train) + data_infos = dataset.data_infos + dataset_type = cfg.dataset_type + + # configure visualization mode + vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' + + for idx, data_info in enumerate(track_iter_progress(data_infos)): + if dataset_type in ['KittiDataset', 'WaymoDataset']: + data_path = data_info['point_cloud']['velodyne_path'] + elif dataset_type in [ + 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', + 'S3DISSegDataset', 'S3DISDataset' + ]: + data_path = data_info['pts_path'] + elif dataset_type in ['NuScenesDataset', 'LyftDataset']: + data_path = data_info['lidar_path'] + elif dataset_type in ['NuScenesMonoDataset']: + data_path = data_info['file_name'] + else: + raise NotImplementedError( + f'unsupported dataset type {dataset_type}') + + file_name = osp.splitext(osp.basename(data_path))[0] + + if vis_task in ['det', 'multi_modality-det']: + # show 3D bboxes on 3D point clouds + show_det_data( + idx, dataset, args.output_dir, file_name, show=args.online) + if vis_task in ['multi_modality-det', 'mono-det']: + # project 3D bboxes to 2D image + show_proj_bbox_img( + idx, + dataset, + args.output_dir, + file_name, + show=args.online, + is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) + elif vis_task in ['seg']: + # show 3D segmentation mask on 3D point clouds + show_seg_data( + idx, dataset, args.output_dir, file_name, show=args.online) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/misc/print_config.py b/adzoo/bevformer/misc/print_config.py new file mode 100755 index 0000000..3100fc3 --- /dev/null +++ b/adzoo/bevformer/misc/print_config.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from mmcv import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + print(f'Config:\n{cfg.pretty_text}') + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/misc/visualize_results.py b/adzoo/bevformer/misc/visualize_results.py new file mode 100755 index 0000000..302adc5 --- /dev/null +++ b/adzoo/bevformer/misc/visualize_results.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
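+# Example invocation (argument names follow parse_args() below; the paths are
+# placeholders):
+#   python adzoo/bevformer/misc/visualize_results.py $CONFIG \
+#       --result results.pkl --show-dir viz/
+# --result must point to a pickled results file (*.pkl / *.pickle).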
+import argparse +import mmcv +from mmcv import Config + +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D visualize the results') + parser.add_argument('config', help='test config file path') + parser.add_argument('--result', help='results file in pickle format') + parser.add_argument( + '--show-dir', help='directory where visualize results will be saved') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + if args.result is not None and \ + not args.result.endswith(('.pkl', '.pickle')): + raise ValueError('The results file must be a pkl file.') + + cfg = Config.fromfile(args.config) + cfg.data.test.test_mode = True + + # build the dataset + dataset = build_dataset(cfg.data.test) + results = mmcv.load(args.result) + + if getattr(dataset, 'show', None) is not None: + # data loading pipeline for showing + eval_pipeline = cfg.get('eval_pipeline', {}) + if eval_pipeline: + dataset.show(results, args.show_dir, pipeline=eval_pipeline) + else: + dataset.show(results, args.show_dir) # use default pipeline + else: + raise NotImplementedError( + 'Show is not implemented for dataset {}!'.format( + type(dataset).__name__)) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/__init__.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/__init__.py new file mode 100644 index 0000000..0ead209 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/__init__.py @@ -0,0 +1 @@ +from .hooks import * diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/__init__.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/__init__.py new file mode 100644 index 0000000..15dff22 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/__init__.py @@ -0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/mmdet_train.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/mmdet_train.py new file mode 100644 index 0000000..1a218f0 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/mmdet_train.py @@ -0,0 +1,203 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from torch.nn import DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, + build_runner, ) +from mmcv.optims import build_optimizer +from mmcv.utils import build_from_cfg + +from mmcv.core import EvalHook + +from mmcv.datasets import (build_dataset, replace_ImageToTensor) +from mmcv.utils import get_root_logger, get_dist_info +import time +import os.path as osp +from mmcv.datasets import build_dataloader +from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook +from adzoo.bevformer.apis.test import custom_multi_gpu_test + +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. ' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # import ipdb + # ipdb.set_trace() + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = DistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = DistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = DataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = DataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + 
work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, test_fn=custom_multi_gpu_test, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/test.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/test.py new file mode 100644 index 0000000..cd507e4 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/test.py @@ -0,0 +1,164 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
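+# NOTE: custom_multi_gpu_test in this file runs distributed inference and gathers
+# the per-rank predictions through a shared scratch directory (collect_results_cpu,
+# which defaults to a '.dist_test' tmpdir). collect_results_gpu below simply
+# forwards to collect_results_cpu and does not return its result, so callers
+# should keep gpu_collect=False. A call from an eval hook looks roughly like
+# (illustrative only):
+#   results = custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False)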
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import mmcv +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.runner import get_dist_info + +from mmdet.core import encode_mask_results + + +import mmcv +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' 
+ # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if mask_results is None: + return bbox_results + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/train.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/train.py new file mode 100644 index 0000000..dcae402 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/apis/train.py @@ -0,0 +1,65 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .mmdet_train import custom_train_detector + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. 
+ """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/__init__.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/__init__.py new file mode 100644 index 0000000..aa04ec1 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/__init__.py @@ -0,0 +1 @@ +from .custom_hooks import TransferWeight \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/custom_hooks.py b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/custom_hooks.py new file mode 100644 index 0000000..ef1e35d --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/bevformer/hooks/custom_hooks.py @@ -0,0 +1,12 @@ +from mmcv.runner.hooks.hook import HOOKS, Hook + + +@HOOKS.register_module() +class TransferWeight(Hook): + + def __init__(self, every_n_inters=1): + self.every_n_inters=every_n_inters + + def after_train_iter(self, runner): + if self.every_n_inner_iters(runner, self.every_n_inters): + runner.eval_model.load_state_dict(runner.model.state_dict()) diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/__init__.py new file mode 100644 index 0000000..64eaac4 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/__init__.py @@ -0,0 +1 @@ +from .modeling import * \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/nuscenes.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/nuscenes.py new file mode 100644 index 0000000..9eed59b --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/nuscenes.py @@ -0,0 +1,360 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. 
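+# NOTE: NuscenesDataset below enumerates every (sample, camera) key frame of the
+# requested split and returns detectron2-style dicts with file_name, image size,
+# camera intrinsics, sensor/ego poses, ego speed and per-instance annotations
+# (2D box, vectorized 3D box, category, attribute, track id, speed).
+# Illustrative construction (the data_root path is a placeholder, not a shipped default):
+#   dataset = NuscenesDataset('nusc_mini_train', data_root='data/nuscenes')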
+#import functools +from collections import OrderedDict + +import numpy as np +import seaborn as sns +from torch.utils.data import Dataset +from tqdm import tqdm + +#from detectron2.data import MetadataCatalog +from mmcv.structures import BoxMode +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.splits import create_splits_scenes + +#from tridet.data import collect_dataset_dicts +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.pose import Pose +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import project_points3d +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color + +# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189 +# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes). +# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes). +# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and +# tracker if required +DATASET_NAME_TO_VERSION = { + "nusc_train": "v1.0-trainval", + "nusc_val": "v1.0-trainval", + "nusc_val-subsample-8": "v1.0-trainval", + "nusc_trainval": "v1.0-trainval", + "nusc_test": "v1.0-test", + "nusc_mini_train": "v1.0-mini", + "nusc_mini_val": "v1.0-mini", +} + +CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT') + +ATTRIBUTE_IDS = { + 'vehicle.moving': 0, + 'vehicle.parked': 1, + 'vehicle.stopped': 2, + 'pedestrian.moving': 0, + 'pedestrian.standing': 1, + 'pedestrian.sitting_lying_down': 2, + 'cycle.with_rider': 0, + 'cycle.without_rider': 1, +} + +CATEGORY_IDS = OrderedDict({ + 'barrier': 0, + 'bicycle': 1, + 'bus': 2, + 'car': 3, + 'construction_vehicle': 4, + 'motorcycle': 5, + 'pedestrian': 6, + 'traffic_cone': 7, + 'trailer': 8, + 'truck': 9, +}) + +COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)] +COLORMAP = OrderedDict({ + 'barrier': COLORS[8], # yellow + 'bicycle': COLORS[0], # blue + 'bus': COLORS[6], # pink + 'car': COLORS[2], # green + 'construction_vehicle': COLORS[7], # gray + 'motorcycle': COLORS[4], # purple + 'pedestrian': COLORS[1], # orange + 'traffic_cone': COLORS[3], # red + 'trailer': COLORS[9], # skyblue + 'truck': COLORS[5], # brown +}) + +MAX_NUM_ATTRIBUTES = 3 + + +def _compute_iou(box1, box2): + """ + Parameters + ---------- + box1, box2: + (x1, y1, x2, y2) + """ + xx1 = max(box1[0], box2[0]) + yy1 = max(box1[1], box2[1]) + xx2 = min(box1[2], box2[2]) + yy2 = min(box1[3], box2[3]) + if xx1 >= xx2 or yy1 >= yy2: + return 0. 
+ inter = (xx2 - xx1) * (yy2 - yy1) + a1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + a2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) + return inter / (a1 + a2 - inter) + + +class NuscenesDataset(Dataset): + def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused): + self.data_root = data_root + assert name in DATASET_NAME_TO_VERSION + version = DATASET_NAME_TO_VERSION[name] + self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True) + + self.datum_names = datum_names + self.min_num_lidar_points = min_num_lidar_points + self.min_box_visibility = min_box_visibility + + self.dataset_item_info = self._build_dataset_item_info(name) + + # Index instance tokens to their IDs + self._instance_token_to_id = self._index_instance_tokens() + + # Construct the mapping from datum_token (image id) to index + print("Generating the mapping from image id to idx...") + self.datumtoken2idx = {} + for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info): + self.datumtoken2idx[datum_token] = idx + print("Done.") + + def _build_dataset_item_info(self, name): + scenes_in_split = self._get_split_scenes(name) + + dataset_items = [] + for _, scene_token in tqdm(scenes_in_split): + scene = self.nusc.get('scene', scene_token) + sample_token = scene['first_sample_token'] + for sample_idx in range(scene['nbr_samples']): + if name.endswith('subsample-8') and sample_idx % 8 > 0: + # Sample-level subsampling. + continue + + sample = self.nusc.get('sample', sample_token) + for datum_name, datum_token in sample['data'].items(): + if datum_name not in self.datum_names: + continue + dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name)) + sample_token = sample['next'] + return dataset_items + + def _get_split_scenes(self, name): + scenes_in_splits = create_splits_scenes() + if name == "nusc_trainval": + scenes = scenes_in_splits["train"] + scenes_in_splits["val"] + elif name == "nusc_val-subsample-8": + scenes = scenes_in_splits["val"] + else: + assert name.startswith('nusc_'), f"Invalid dataset name: {name}" + split = name[5:] + assert split in scenes_in_splits, f"Invalid dataset: {split}" + scenes = scenes_in_splits[split] + + # Mapping from scene name to token. 
+ name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene} + return [(name, name_to_token[name]) for name in scenes] + + def __len__(self): + return len(self.dataset_item_info) + + def _build_id(self, scene_name, sample_idx, datum_name): + sample_id = f"{scene_name}_{sample_idx:03d}" + image_id = f"{sample_id}_{datum_name}" + return image_id, sample_id + + def _index_instance_tokens(self): + """Index instance tokens for uniquely identifying instances across samples""" + instance_token_to_id = {} + for record in self.nusc.sample_annotation: + instance_token = record['instance_token'] + if instance_token not in instance_token_to_id: + next_instance_id = len(instance_token_to_id) + instance_token_to_id[instance_token] = next_instance_id + return instance_token_to_id + + def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS): + annotations = [] + for _ann in annotation_list: + ann = self.nusc.get('sample_annotation', _ann.token) + if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points: + continue + annotation = OrderedDict() + + # -------- + # Category + # -------- + category = category_to_detection_name(ann['category_name']) + if category is None: + continue + annotation['category_id'] = CATEGORY_IDS[category] + + # ------ + # 3D box + # ------ + # NOTE: ann['rotation'], ann['translation'] is in global frame. + pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center) # pose in sensor frame + # DEBUG: + # pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation'])) + # pose_WO_2 = pose_WS * pose_SO + # assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix) + bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh) + annotation['bbox3d'] = bbox3d.vectorize().tolist()[0] + + # -------------------------------------- + # 2D box -- project 8 corners of 3D bbox + # -------------------------------------- + corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K) + l, t = corners[:, 0].min(), corners[:, 1].min() + r, b = corners[:, 0].max(), corners[:, 1].max() + + x1 = max(0, l) + y1 = max(0, t) + x2 = min(image_shape[1], r) + y2 = min(image_shape[0], b) + + iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2]) + if iou < self.min_box_visibility: + continue + + annotation['bbox'] = [x1, y1, x2, y2] + annotation['bbox_mode'] = BoxMode.XYXY_ABS + + # -------- + # Track ID + # -------- + annotation['track_id'] = self._instance_token_to_id[ann['instance_token']] + + # --------- + # Attribute + # --------- + attr_tokens = ann['attribute_tokens'] + assert len(attr_tokens) < 2 # NOTE: Allow only single attrubute. + attribute_id = MAX_NUM_ATTRIBUTES # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute. + if attr_tokens: + attribute = self.nusc.get('attribute', attr_tokens[0])['name'] + attribute_id = ATTRIBUTE_IDS[attribute] + annotation['attribute_id'] = attribute_id + + # ----- + # Speed + # ----- + vel_global = self.nusc.box_velocity(ann['token']) + speed = np.linalg.norm(vel_global) # NOTE: This can be NaN. + # DEBUG: + # speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global + annotation['speed'] = speed + + annotations.append(annotation) + + return annotations + + def _get_ego_velocity(self, current, max_time_diff=1.5): + """Velocity of ego-vehicle in m/s. + """ + has_prev = current['prev'] != '' + has_next = current['next'] != '' + + # Cannot estimate velocity for a single annotation. 
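+        # Otherwise the velocity is a finite difference over the neighboring
+        # sample_data records, v ~= (pos_last - pos_first) / (time_last - time_first):
+        # a central difference when both neighbors exist, one-sided otherwise.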
+ if not has_prev and not has_next: + return np.array([np.nan, np.nan, np.nan]) + + if has_prev: + first = self.nusc.get('sample_data', current['prev']) + else: + first = current + + if has_next: + last = self.nusc.get('sample_data', current['next']) + else: + last = current + + pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation'] + pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation'] + pos_diff = np.float32(pos_last) - np.float32(pos_first) + + time_last = 1e-6 * last['timestamp'] + time_first = 1e-6 * first['timestamp'] + time_diff = time_last - time_first + + if has_next and has_prev: + # If doing centered difference, allow for up to double the max_time_diff. + max_time_diff *= 2 + + if time_diff > max_time_diff: + # If time_diff is too big, don't return an estimate. + return np.array([np.nan, np.nan, np.nan]) + else: + return pos_diff / time_diff + + def __getitem__(self, idx): + datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx] + datum = self.nusc.get('sample_data', datum_token) + assert datum['is_key_frame'] + + filename, _annotations, K = self.nusc.get_sample_data(datum_token) + image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name) + height, width = datum['height'], datum['width'] + d2_dict = OrderedDict( + file_name=filename, + height=height, + width=width, + image_id=image_id, + sample_id=sample_id, + sample_token=sample_token + ) + + # Intrinsics + d2_dict['intrinsics'] = list(K.flatten()) + + # Get pose of the sensor (S) from vehicle (V) frame + _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token']) + pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation'])) + + # Get ego-pose of the vehicle (V) from global/world (W) frame + _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token']) + pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation'])) + pose_WS = pose_WV * pose_VS + + d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)} + d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)} + + d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum)) + + d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS) + + return d2_dict + + def getitem_by_datumtoken(self, datum_token): + # idx = self.datumtoken2idx[datum_token] + # ret = self.__getitem__(idx) + + datum = self.nusc.get('sample_data', datum_token) + sample_token = datum['sample_token'] + filename, _annotations, K = self.nusc.get_sample_data(datum_token) + height, width = datum['height'], datum['width'] + d2_dict = OrderedDict( + file_name=filename, + height=height, + width=width, + image_id=0, + sample_id=0, + sample_token=sample_token + ) + # Intrinsics + d2_dict['intrinsics'] = list(K.flatten()) + # Get pose of the sensor (S) from vehicle (V) frame + _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token']) + pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation'])) + # Get ego-pose of the vehicle (V) from global/world (W) frame + _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token']) + pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation'])) + pose_WS = pose_WV * pose_VS + + d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)} + d2_dict['extrinsics'] = {'wxyz': 
list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)} + + d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum)) + + d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS) + return d2_dict \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/transform_utils.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/transform_utils.py new file mode 100644 index 0000000..623bd6e --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/datasets/transform_utils.py @@ -0,0 +1,136 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright 2021 Toyota Research Institute. All rights reserved. +# Adapted from detectron2: +# https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py +import numpy as np +import torch + +from detectron2.data import transforms as T +from detectron2.structures import Boxes, BoxMode, Instances + +from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D + +__all__ = ["transform_instance_annotations", "annotations_to_instances"] + + +def transform_instance_annotations( + annotation, + transforms, + image_size, +): + """Adapted from: + https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254 + + The changes from original: + - The presence of 2D bounding box (i.e. "bbox" field) is assumed by default in d2; here it's optional. + - Add optional 3D bounding box support. + - If the instance mask annotation is in RLE, then it's decoded into polygons, not bitmask, to save memory. + + =============================================================================================================== + + Apply transforms to box, segmentation and keypoints annotations of a single instance. + + It will use `transforms.apply_box` for the box, and + `transforms.apply_coords` for segmentation polygons & keypoints. + If you need anything more specially designed for each data structure, + you'll need to implement your own version of this function or the transforms. + + Args: + annotation (dict): dict of instance annotations for a single instance. + It will be modified in-place. + transforms (TransformList or list[Transform]): + image_size (tuple): the height, width of the transformed image + keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. + + Returns: + dict: + the same input dict with fields "bbox", "segmentation", "keypoints" + transformed according to `transforms`. + The "bbox_mode" field will be set to XYXY_ABS. + """ + if isinstance(transforms, (tuple, list)): + transforms = T.TransformList(transforms) + # (dennis.park) Here 2D bounding box is optional. + if "bbox" in annotation: + assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not." + # bbox is 1d (per-instance bounding box) + bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) + bbox = transforms.apply_box(np.array([bbox]))[0] + # clip transformed bbox to image size + bbox = bbox.clip(min=0) + bbox = np.minimum(bbox, list(image_size + image_size)[::-1]) + annotation["bbox"] = bbox + annotation["bbox_mode"] = BoxMode.XYXY_ABS + + # Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed. 
+ if "bbox3d" in annotation: + bbox3d = np.array(annotation["bbox3d"]) + annotation['bbox3d'] = transforms.apply_box3d(bbox3d) + + return annotation + + +def _create_empty_instances(image_size): + target = Instances(image_size) + + target.gt_boxes = Boxes([]) + target.gt_classes = torch.tensor([], dtype=torch.int64) + target.gt_boxes3d = Boxes3D.from_vectors([], torch.eye(3, dtype=torch.float32)) + + return target + + +def annotations_to_instances( + annos, + image_size, + intrinsics=None, +): + """ + Create an :class:`Instances` object used by the models, + from instance annotations in the dataset dict. + + Args: + annos (list[dict]): a list of instance annotations in one image, each + element for one instance. + image_size (tuple): height, width + + Returns: + Instances: + It will contain fields "gt_boxes", "gt_classes", + "gt_masks", "gt_keypoints", if they can be obtained from `annos`. + This is the format that builtin models expect. + """ + if len(annos) == 0: + return _create_empty_instances(image_size) + + boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] + target = Instances(image_size) + target.gt_boxes = Boxes(boxes) + + classes = [obj["category_id"] for obj in annos] + classes = torch.tensor(classes, dtype=torch.int64) + target.gt_classes = classes + + if len(annos) and "bbox3d" in annos[0]: + assert intrinsics is not None + target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics) + if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]: + raise ValueError( + f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}." + ) + + # NOTE: add nuscenes attributes here + # NOTE: instances will be filtered later + # NuScenes attributes + if len(annos) and "attribute_id" in annos[0]: + attributes = [obj["attribute_id"] for obj in annos] + target.gt_attributes = torch.tensor(attributes, dtype=torch.int64) + + # Speed (magnitude of velocity) + if len(annos) and "speed" in annos[0]: + speeds = [obj["speed"] for obj in annos] + target.gt_speeds = torch.tensor(speeds, dtype=torch.float32) + + assert len(boxes) == len(classes) == len(attributes) == len(speeds), \ + 'the numbers of annotations should be the same' + return target diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/iou_loss.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/iou_loss.py new file mode 100644 index 0000000..97638ef --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/iou_loss.py @@ -0,0 +1,71 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. 
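+# NOTE: boxes are encoded as per-pixel (left, top, right, bottom) distances, as in
+# FCOS, so areas are (l + r) * (t + b). With I = intersection, U = union and
+# C = area of the smallest enclosing box, the supported losses are:
+#   'iou':         -log(IoU)
+#   'linear_iou':  1 - IoU
+#   'giou':        1 - (IoU - (C - U) / C)
+# e.g. a perfect prediction gives IoU = 1 and a linear_iou / giou loss of 0.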
+# Adapted from AdelaiDet: +# https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py +import torch +from torch import nn + + +class IOULoss(nn.Module): + """ + Intersetion Over Union (IoU) loss which supports three + different IoU computations: + + * IoU + * Linear IoU + * gIoU + """ + def __init__(self, loc_loss_type='iou'): + super(IOULoss, self).__init__() + self.loc_loss_type = loc_loss_type + + def forward(self, pred, target, weight=None): + """ + Args: + pred: Nx4 predicted bounding boxes + target: Nx4 target bounding boxes + weight: N loss weight for each instance + """ + pred_left = pred[:, 0] + pred_top = pred[:, 1] + pred_right = pred[:, 2] + pred_bottom = pred[:, 3] + + target_left = target[:, 0] + target_top = target[:, 1] + target_right = target[:, 2] + target_bottom = target[:, 3] + + target_aera = (target_left + target_right) * \ + (target_top + target_bottom) + pred_aera = (pred_left + pred_right) * \ + (pred_top + pred_bottom) + + w_intersect = torch.min(pred_left, target_left) + \ + torch.min(pred_right, target_right) + h_intersect = torch.min(pred_bottom, target_bottom) + \ + torch.min(pred_top, target_top) + + g_w_intersect = torch.max(pred_left, target_left) + \ + torch.max(pred_right, target_right) + g_h_intersect = torch.max(pred_bottom, target_bottom) + \ + torch.max(pred_top, target_top) + ac_uion = g_w_intersect * g_h_intersect + + area_intersect = w_intersect * h_intersect + area_union = target_aera + pred_aera - area_intersect + + ious = (area_intersect + 1.0) / (area_union + 1.0) + gious = ious - (ac_uion - area_union) / ac_uion + if self.loc_loss_type == 'iou': + losses = -torch.log(ious) + elif self.loc_loss_type == 'linear_iou': + losses = 1 - ious + elif self.loc_loss_type == 'giou': + losses = 1 - gious + else: + raise NotImplementedError + + if weight is not None: + return (losses * weight).sum() + else: + return losses.sum() diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/normalization.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/normalization.py new file mode 100644 index 0000000..bed7c63 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/normalization.py @@ -0,0 +1,40 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +# Adapted from AdelaiDet +# https://github.com/aim-uofa/AdelaiDet/ +import logging + +import torch +from torch import nn + +LOG = logging.getLogger(__name__) + + +class Scale(nn.Module): + def __init__(self, init_value=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.FloatTensor([init_value])) + + def forward(self, input): + return input * self.scale + + +class Offset(nn.Module): + def __init__(self, init_value=0.): + super(Offset, self).__init__() + self.bias = nn.Parameter(torch.FloatTensor([init_value])) + + def forward(self, input): + return input + self.bias + + +class ModuleListDial(nn.ModuleList): + def __init__(self, modules=None): + super(ModuleListDial, self).__init__(modules) + self.cur_position = 0 + + def forward(self, x): + result = self[self.cur_position](x) + self.cur_position += 1 + if self.cur_position >= len(self): + self.cur_position = 0 + return result diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py new file mode 100644 index 0000000..b5448d0 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/layers/smooth_l1_loss.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+# Copyright 2021 Toyota Research Institute. All rights reserved. +# Adapted from fvcore: +# https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py + +import torch + + +def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none") -> torch.Tensor: + """ + Smooth L1 loss defined in the Fast R-CNN paper as: + + | 0.5 * x ** 2 / beta if abs(x) < beta + smoothl1(x) = | + | abs(x) - 0.5 * beta otherwise, + + where x = input - target. + + Smooth L1 loss is related to Huber loss, which is defined as: + + | 0.5 * x ** 2 if abs(x) < beta + huber(x) = | + | beta * (abs(x) - 0.5 * beta) otherwise + + Smooth L1 loss is equal to huber(x) / beta. This leads to the following + differences: + + - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss + converges to a constant 0 loss. + - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss + converges to L2 loss. + - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant + slope of 1. For Huber loss, the slope of the L1 segment is beta. + + Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta + portion replaced with a quadratic function such that at abs(x) = beta, its + slope is 1. The quadratic segment smooths the L1 loss near x = 0. + + Args: + input (Tensor): input tensor of any shape + target (Tensor): target value tensor with the same shape as input + beta (float): L1 to L2 change point. + For beta values < 1e-5, L1 loss is computed. + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + + Returns: + The loss with the reduction option applied. + + Note: + PyTorch's builtin "Smooth L1 loss" implementation does not actually + implement Smooth L1 loss, nor does it implement Huber loss. It implements + the special case of both in which they are equal (beta=1). + See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss. + """ + # (dennis.park) Make it work with mixed precision training. + beta = torch.as_tensor(beta).to(input.dtype) + if beta < 1e-5: + # if beta == 0, then torch.where will result in nan gradients when + # the chain rule is applied due to pytorch implementation details + # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of + # zeros, rather than "no gradient"). To avoid this issue, we define + # small values of beta to be exactly l1 loss. + loss = torch.abs(input - target) + else: + n = torch.abs(input - target) + cond = n < beta + a = 0.5 * n**2 + b = n - 0.5 * beta + a, b = a.to(input.dtype), b.to(input.dtype) + loss = torch.where(cond, a, b) + # loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta) + + if reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + return loss diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/__init__.py new file mode 100644 index 0000000..dd76a61 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/__init__.py @@ -0,0 +1 @@ +from .nuscenes_dd3d import NuscenesDD3D \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/core.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/core.py new file mode 100644 index 0000000..4830248 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/core.py @@ -0,0 +1,217 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. 
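+# NOTE: DD3D is used here as a camera-only head on top of externally supplied FPN
+# features: it computes per-level locations, runs FCOS2DHead (class logits, 2D box
+# regression, centerness) and FCOS3DHead (quaternion, projected center, depth,
+# size, confidence, optional dense depth), and returns the combined 2D + 3D losses
+# during training (inference is not supported in this port).
+# Illustrative construction; in_channels/strides are assumptions, not repo defaults:
+#   head = DD3D(num_classes=10, in_channels=256, strides=[8, 16, 32, 64, 128])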
+import torch +from torch import nn + +#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY +from mmcv.modeling.postprocessing import detector_postprocess as resize_instances +from mmcv.structures import Instances +from mmcv.layers import ShapeSpec +from mmcv.utils import force_fp32 + +from .fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss +from .fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss +#from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate +from .prepare_targets import DD3DTargetPreparer +#from tridet.modeling.feature_extractor import build_feature_extractor +from ..structures.image_list import ImageList +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.tensor2d import compute_features_locations as compute_locations_per_level + + +#@META_ARCH_REGISTRY.register() +class DD3D(nn.Module): + def __init__(self, + num_classes, + in_channels, + strides, + fcos2d_cfg=dict(), + fcos2d_loss_cfg=dict(), + fcos3d_cfg=dict(), + fcos3d_loss_cfg=dict(), + target_assign_cfg=dict(), + box3d_on=True, + feature_locations_offset="none"): + super().__init__() + # NOTE: do not need backbone + # self.backbone = build_feature_extractor(cfg) + # backbone_output_shape = self.backbone.output_shape() + # self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys()) + + self.backbone_output_shape = [ShapeSpec(channels=in_channels, stride=s) for s in strides] + + self.feature_locations_offset = feature_locations_offset + + self.fcos2d_head = FCOS2DHead(num_classes=num_classes, input_shape=self.backbone_output_shape, + **fcos2d_cfg) + self.fcos2d_loss = FCOS2DLoss(num_classes=num_classes, **fcos2d_loss_cfg) + # NOTE: inference later + # self.fcos2d_inference = FCOS2DInference(cfg) + + if box3d_on: + self.fcos3d_head = FCOS3DHead(num_classes=num_classes, input_shape=self.backbone_output_shape, + **fcos3d_cfg) + self.fcos3d_loss = FCOS3DLoss(num_classes=num_classes, **fcos3d_loss_cfg) + # NOTE: inference later + # self.fcos3d_inference = FCOS3DInference(cfg) + self.only_box2d = False + else: + self.only_box2d = True + + self.prepare_targets = DD3DTargetPreparer(num_classes=num_classes, + input_shape=self.backbone_output_shape, + box3d_on=box3d_on, + **target_assign_cfg) + + # NOTE: inference later + # self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS + + # self.do_nms = cfg.DD3D.INFERENCE.DO_NMS + # self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS + # self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH + + # nuScenes inference aggregates detections over all 6 cameras. 
+ # self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE + self.num_classes = num_classes + + # NOTE: do not need normalize + # self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + # self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + # NOTE: + # @property + # def device(self): + # return self.pixel_mean.device + + # def preprocess_image(self, x): + # return (x - self.pixel_mean) / self.pixel_std + + @force_fp32(apply_to=('features')) + def forward(self, features, batched_inputs): + # NOTE: + # images = [x["image"].to(self.device) for x in batched_inputs] + # images = [self.preprocess_image(x) for x in images] + + # NOTE: directly use inv_intrinsics + # if 'intrinsics' in batched_inputs[0]: + # intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs] + # else: + # intrinsics = None + # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics) + if 'inv_intrinsics' in batched_inputs[0]: + inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs] + inv_intrinsics = torch.stack(inv_intrinsics, dim=0) + else: + inv_intrinsics = None + + # NOTE: + # gt_dense_depth = None + # if 'depth' in batched_inputs[0]: + # gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs] + # gt_dense_depth = ImageList.from_tensors( + # gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics + # ) + + # NOTE: directly input feature + # features = self.backbone(images.tensor) + # features = [features[f] for f in self.in_features] + + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs] + else: + gt_instances = None + + locations = self.compute_locations(features) + logits, box2d_reg, centerness, _ = self.fcos2d_head(features) + if not self.only_box2d: + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features) + # NOTE: directly use inv_intrinsics + # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None + + if self.training: + assert gt_instances is not None + feature_shapes = [x.shape[-2:] for x in features] + training_targets = self.prepare_targets(locations, gt_instances, feature_shapes) + # NOTE: + # if gt_dense_depth is not None: + # training_targets.update({"dense_depth": gt_dense_depth}) + + losses = {} + fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets) + losses.update(fcos2d_loss) + + if not self.only_box2d: + fcos3d_loss = self.fcos3d_loss( + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, + fcos2d_info, training_targets + ) + losses.update(fcos3d_loss) + return losses + else: + # TODO: do not support inference now + raise NotImplementedError + + pred_instances, fcos2d_info = self.fcos2d_inference( + logits, box2d_reg, centerness, locations, images.image_sizes + ) + if not self.only_box2d: + # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place. + self.fcos3d_inference( + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, + fcos2d_info + ) + + # 3D score == 2D score x confidence. + score_key = "scores_3d" + else: + score_key = "scores" + + # Transpose to "image-first", i.e. 
(B, L) + pred_instances = list(zip(*pred_instances)) + pred_instances = [Instances.cat(instances) for instances in pred_instances] + + # 2D NMS and pick top-K. + if self.do_nms: + pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key) + + if not self.only_box2d and self.do_bev_nms: + # Bird-eye-view NMS. + dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)} + if 'pose' in batched_inputs[0]: + poses = [x['pose'] for x in batched_inputs] + else: + poses = [x['extrinsics'] for x in batched_inputs] + pred_instances = nuscenes_sample_aggregate( + pred_instances, + dummy_group_idxs, + self.num_classes, + poses, + iou_threshold=self.bev_nms_iou_thresh, + include_boxes3d_global=False + ) + + if self.postprocess_in_inference: + processed_results = [] + for results_per_image, input_per_image, image_size in \ + zip(pred_instances, batched_inputs, images.image_sizes): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = resize_instances(results_per_image, height, width) + processed_results.append({"instances": r}) + else: + processed_results = [{"instances": x} for x in pred_instances] + + return processed_results + + def compute_locations(self, features): + locations = [] + in_strides = [x.stride for x in self.backbone_output_shape] + for level, feature in enumerate(features): + h, w = feature.size()[-2:] + locations_per_level = compute_locations_per_level( + h, w, in_strides[level], feature.dtype, feature.device, offset=self.feature_locations_offset + ) + locations.append(locations_per_level) + return locations + + def forward_train(self, features, batched_inputs): + self.train() + return self.forward(features, batched_inputs) \ No newline at end of file diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py new file mode 100644 index 0000000..5cdaf0f --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/disentangled_box3d_loss.py @@ -0,0 +1,46 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. 
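+# NOTE: "disentangled" means each 3D-box component (quat, proj_ctr, depth, size)
+# is copied from the prediction into a clone of the ground-truth box, the 8 box
+# corners (24 values) of that hybrid box are compared to the ground-truth corners
+# with smooth L1, and the result becomes a separate loss term, so each component
+# is supervised in corner space while the other components stay at ground truth.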
+import logging
+
+import torch
+import torch.nn as nn
+
+from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.smooth_l1_loss import smooth_l1_loss
+
+LOG = logging.getLogger(__name__)
+
+
+class DisentangledBox3DLoss(nn.Module):
+    def __init__(self, smooth_l1_loss_beta, max_loss_per_group):
+        super().__init__()
+        self.smooth_l1_loss_beta = smooth_l1_loss_beta
+        self.max_loss_per_group = max_loss_per_group
+
+    def forward(self, box3d_pred, box3d_targets, locations, weights=None):
+
+        box3d_pred = box3d_pred.to(torch.float32)
+        box3d_targets = box3d_targets.to(torch.float32)
+
+        target_corners = box3d_targets.corners
+
+        disentangled_losses = {}
+        for component_key in ["quat", "proj_ctr", "depth", "size"]:
+            disentangled_boxes = box3d_targets.clone()
+            setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key))
+            pred_corners = disentangled_boxes.to(torch.float32).corners
+
+            loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta)
+
+            # Bound the loss. Tensor.clamp() is not in-place, so keep the result.
+            loss = loss.clamp(max=self.max_loss_per_group)
+
+            if weights is not None:
+                # loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1))
+                loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights)
+            else:
+                loss = loss.reshape(-1, 24).mean()
+
+            disentangled_losses["loss_box3d_" + component_key] = loss
+
+        entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1)
+
+        return disentangled_losses, entangled_l1_dist
diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos2d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos2d.py
new file mode 100644
index 0000000..c9c6c08
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos2d.py
@@ -0,0 +1,388 @@
+# Copyright 2021 Toyota Research Institute. All rights reserved.
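+# NOTE: the 2D branch follows FCOS: classification uses sigmoid focal loss, and
+# compute_ctrness_targets builds the centerness target from the per-pixel box
+# offsets (l, t, r, b) as
+#   sqrt(min(l, r) / max(l, r) * min(t, b) / max(t, b)),
+# which is 1 at the box center, approaches 0 at the border, and is used both as a
+# BCE target for the centerness head and as the weight of the 2D box IoU loss.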
+# Adapted from AdelaiDet: +# https://github.com/aim-uofa/AdelaiDet +import torch +from mmcv.losses import sigmoid_focal_loss +from torch import nn +from torch.nn import functional as F + +from mmcv.layers import batched_nms, get_norm +from mmcv.structures import Instances, Boxes +from torch import distributed as dist +from mmcv.utils import force_fp32 +from mmcv.layers import Conv2d, batched_nms, cat, get_norm + +from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.iou_loss import IOULoss +from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Scale +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.comm import reduce_sum + +INF = 100000000 + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + +def compute_ctrness_targets(reg_targets): + if len(reg_targets) == 0: + return reg_targets.new_zeros(len(reg_targets)) + left_right = reg_targets[:, [0, 2]] + top_bottom = reg_targets[:, [1, 3]] + ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \ + (top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0]) + return torch.sqrt(ctrness) + +class FCOS2DHead(nn.Module): + def __init__(self, + num_classes, + input_shape, + num_cls_convs=4, + num_box_convs=4, + norm='BN', + use_deformable=False, + use_scale=True, + box2d_scale_init_factor=1.0, + version='v2'): + super().__init__() + + self.num_classes = num_classes + self.in_strides = [shape.stride for shape in input_shape] + self.num_levels = len(input_shape) + + self.use_scale = use_scale + self.box2d_scale_init_factor = box2d_scale_init_factor + + self._version = version + + in_channels = [s.channels for s in input_shape] + assert len(set(in_channels)) == 1, "Each level must have the same channel!" + in_channels = in_channels[0] + + if use_deformable: + raise ValueError("Not supported yet.") + + head_configs = {'cls': num_cls_convs, 'box2d': num_box_convs} + + for head_name, num_convs in head_configs.items(): + tower = [] + if self._version == "v1": + for _ in range(num_convs): + conv_func = nn.Conv2d + tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True)) + if norm == "GN": + raise NotImplementedError() + elif norm == "NaiveGN": + raise NotImplementedError() + elif norm == "BN": + tower.append(ModuleListDial([nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)])) + elif norm == "SyncBN": + raise NotImplementedError() + tower.append(nn.ReLU()) + elif self._version == "v2": + for _ in range(num_convs): + if norm in ("BN", "FrozenBN", "SyncBN", "GN"): + # NOTE: need to add norm here! + # Each FPN level has its own batchnorm layer. + # NOTE: do not use dd3d train.py! 
+ # "BN" is converted to "SyncBN" in distributed training (see train.py) + norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)]) + else: + norm_layer = get_norm(norm, in_channels) + tower.append( + Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=norm_layer is None, + norm=norm_layer, + activation=F.relu + ) + ) + else: + raise ValueError(f"Invalid FCOS2D version: {self._version}") + self.add_module(f'{head_name}_tower', nn.Sequential(*tower)) + + self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1) + self.box2d_reg = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1) + self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1) + + if self.use_scale: + if self._version == "v1": + self.scales_reg = nn.ModuleList([ + Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides + ]) + else: + self.scales_box2d_reg = nn.ModuleList([ + Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides + ]) + + self.init_weights() + + def init_weights(self): + + for tower in [self.cls_tower, self.box2d_tower]: + for l in tower.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu') + if l.bias is not None: + torch.nn.init.constant_(l.bias, 0) + + predictors = [self.cls_logits, self.box2d_reg, self.centerness] + + for modules in predictors: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.kaiming_uniform_(l.weight, a=1) + if l.bias is not None: # depth head may not have bias. + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + logits = [] + box2d_reg = [] + centerness = [] + + extra_output = {"cls_tower_out": []} + + for l, feature in enumerate(x): + cls_tower_out = self.cls_tower(feature) + bbox_tower_out = self.box2d_tower(feature) + + # 2D box + logits.append(self.cls_logits(cls_tower_out)) + centerness.append(self.centerness(bbox_tower_out)) + box_reg = self.box2d_reg(bbox_tower_out) + if self.use_scale: + # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels? + if self._version == "v1": + box_reg = self.scales_reg[l](box_reg) + else: + box_reg = self.scales_box2d_reg[l](box_reg) + # Note that we use relu, as in the improved FCOS, instead of exp. 
+ box2d_reg.append(F.relu(box_reg)) + + extra_output['cls_tower_out'].append(cls_tower_out) + + return logits, box2d_reg, centerness, extra_output + + +class FCOS2DLoss(nn.Module): + def __init__(self, + num_classes, + focal_loss_alpha=0.25, + focal_loss_gamma=2.0, + loc_loss_type='giou', + ): + super().__init__() + self.focal_loss_alpha = focal_loss_alpha + self.focal_loss_gamma = focal_loss_gamma + + self.box2d_reg_loss_fn = IOULoss(loc_loss_type) + + self.num_classes = num_classes + + @force_fp32(apply_to=('logits', 'box2d_reg', 'centerness')) + def forward(self, logits, box2d_reg, centerness, targets): + labels = targets['labels'] + box2d_reg_targets = targets['box2d_reg_targets'] + pos_inds = targets["pos_inds"] + + if len(labels) != box2d_reg_targets.shape[0]: + raise ValueError( + f"The size of 'labels' and 'box2d_reg_targets' does not match: a={len(labels)}, b={box2d_reg_targets.shape[0]}" + ) + + # Flatten predictions + logits = cat([x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits]) + box2d_reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in box2d_reg]) + centerness_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in centerness]) + + # ------------------- + # Classification loss + # ------------------- + num_pos_local = pos_inds.numel() + num_gpus = get_world_size() + total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item() + num_pos_avg = max(total_num_pos / num_gpus, 1.0) + + # prepare one_hot + cls_target = torch.zeros_like(logits) + cls_target[pos_inds, labels[pos_inds]] = 1 + + loss_cls = sigmoid_focal_loss( + logits, + cls_target, + alpha=self.focal_loss_alpha, + gamma=self.focal_loss_gamma, + reduction="sum", + ) / num_pos_avg + + # NOTE: The rest of losses only consider foreground pixels. + box2d_reg_pred = box2d_reg_pred[pos_inds] + box2d_reg_targets = box2d_reg_targets[pos_inds] + + centerness_pred = centerness_pred[pos_inds] + + # Compute centerness targets here using 2D regression targets of foreground pixels. + centerness_targets = compute_ctrness_targets(box2d_reg_targets) + + # Denominator for all foreground losses. 
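+        # The IoU loss below is weighted per pixel by its centerness target and
+        # normalized by the centerness-target sum averaged over all GPUs via
+        # reduce_sum, so every rank divides by the same loss_denom.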
+ ctrness_targets_sum = centerness_targets.sum() + loss_denom = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6) + + # NOTE: change the return after reduce_sum + if pos_inds.numel() == 0: + losses = { + "loss_cls": loss_cls, + "loss_box2d_reg": box2d_reg_pred.sum() * 0., + "loss_centerness": centerness_pred.sum() * 0., + } + return losses, {} + + # ---------------------- + # 2D box regression loss + # ---------------------- + loss_box2d_reg = self.box2d_reg_loss_fn(box2d_reg_pred, box2d_reg_targets, centerness_targets) / loss_denom + + # --------------- + # Centerness loss + # --------------- + loss_centerness = F.binary_cross_entropy_with_logits( + centerness_pred, centerness_targets, reduction="sum" + ) / num_pos_avg + + loss_dict = {"loss_cls": loss_cls, "loss_box2d_reg": loss_box2d_reg, "loss_centerness": loss_centerness} + extra_info = {"loss_denom": loss_denom, "centerness_targets": centerness_targets} + + return loss_dict, extra_info + + +class FCOS2DInference(): + def __init__(self, cfg): + self.thresh_with_ctr = cfg.DD3D.FCOS2D.INFERENCE.THRESH_WITH_CTR + self.pre_nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_THRESH + self.pre_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_TOPK + self.post_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.POST_NMS_TOPK + self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH + self.num_classes = cfg.DD3D.NUM_CLASSES + + def __call__(self, logits, box2d_reg, centerness, locations, image_sizes): + + pred_instances = [] # List[List[Instances]], shape = (L, B) + extra_info = [] + for lvl, (logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl) in \ + enumerate(zip(logits, box2d_reg, centerness, locations)): + + instances_per_lvl, extra_info_per_lvl = self.forward_for_single_feature_map( + logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl, image_sizes + ) # List of Instances; one for each image. + + for instances_per_im in instances_per_lvl: + instances_per_im.fpn_levels = locations_lvl.new_ones(len(instances_per_im), dtype=torch.long) * lvl + + pred_instances.append(instances_per_lvl) + extra_info.append(extra_info_per_lvl) + + return pred_instances, extra_info + + def forward_for_single_feature_map(self, logits, box2d_reg, centerness, locations, image_sizes): + N, C, _, __ = logits.shape + + # put in the same format as locations + scores = logits.permute(0, 2, 3, 1).reshape(N, -1, C).sigmoid() + box2d_reg = box2d_reg.permute(0, 2, 3, 1).reshape(N, -1, 4) + centerness = centerness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid() + + # if self.thresh_with_ctr is True, we multiply the classification + # scores with centerness scores before applying the threshold. + if self.thresh_with_ctr: + scores = scores * centerness[:, :, None] + + candidate_mask = scores > self.pre_nms_thresh + + pre_nms_topk = candidate_mask.reshape(N, -1).sum(1) + pre_nms_topk = pre_nms_topk.clamp(max=self.pre_nms_topk) + + if not self.thresh_with_ctr: + scores = scores * centerness[:, :, None] + + results = [] + all_fg_inds_per_im, all_topk_indices, all_class_inds_per_im = [], [], [] + for i in range(N): + scores_per_im = scores[i] + candidate_mask_per_im = candidate_mask[i] + scores_per_im = scores_per_im[candidate_mask_per_im] + + candidate_inds_per_im = candidate_mask_per_im.nonzero(as_tuple=False) + fg_inds_per_im = candidate_inds_per_im[:, 0] + class_inds_per_im = candidate_inds_per_im[:, 1] + + # Cache info here. 
+ all_fg_inds_per_im.append(fg_inds_per_im) + all_class_inds_per_im.append(class_inds_per_im) + + box2d_reg_per_im = box2d_reg[i][fg_inds_per_im] + locations_per_im = locations[fg_inds_per_im] + + pre_nms_topk_per_im = pre_nms_topk[i] + + if candidate_mask_per_im.sum().item() > pre_nms_topk_per_im.item(): + scores_per_im, topk_indices = \ + scores_per_im.topk(pre_nms_topk_per_im, sorted=False) + + class_inds_per_im = class_inds_per_im[topk_indices] + box2d_reg_per_im = box2d_reg_per_im[topk_indices] + locations_per_im = locations_per_im[topk_indices] + else: + topk_indices = None + + all_topk_indices.append(topk_indices) + + detections = torch.stack([ + locations_per_im[:, 0] - box2d_reg_per_im[:, 0], + locations_per_im[:, 1] - box2d_reg_per_im[:, 1], + locations_per_im[:, 0] + box2d_reg_per_im[:, 2], + locations_per_im[:, 1] + box2d_reg_per_im[:, 3], + ], + dim=1) + + instances = Instances(image_sizes[i]) + instances.pred_boxes = Boxes(detections) + instances.scores = torch.sqrt(scores_per_im) + instances.pred_classes = class_inds_per_im + instances.locations = locations_per_im + + results.append(instances) + + extra_info = { + "fg_inds_per_im": all_fg_inds_per_im, + "class_inds_per_im": all_class_inds_per_im, + "topk_indices": all_topk_indices + } + return results, extra_info + + def nms_and_top_k(self, instances_per_im, score_key_for_nms="scores"): + results = [] + for instances in instances_per_im: + if self.nms_thresh > 0: + # Multiclass NMS. + keep = batched_nms( + instances.pred_boxes.tensor, instances.get(score_key_for_nms), instances.pred_classes, + self.nms_thresh + ) + instances = instances[keep] + num_detections = len(instances) + + # Limit to max_per_image detections **over all classes** + if num_detections > self.post_nms_topk > 0: + scores = instances.scores + # image_thresh, _ = torch.kthvalue(scores.cpu(), num_detections - self.post_nms_topk + 1) + image_thresh, _ = torch.kthvalue(scores, num_detections - self.post_nms_topk + 1) + keep = scores >= image_thresh.item() + keep = torch.nonzero(keep).squeeze(1) + instances = instances[keep] + results.append(instances) + return results diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos3d.py new file mode 100644 index 0000000..f0669a6 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/fcos3d.py @@ -0,0 +1,427 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +import torch +import torch.nn.functional as F +from torch import nn + +from mmcv.layers import Conv2d, batched_nms, cat, get_norm +from mmcv.utils import force_fp32 + +from .disentangled_box3d_loss import DisentangledBox3DLoss +from adzoo.bevformer.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Offset, Scale +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import allocentric_to_egocentric, unproject_points2d + +EPS = 1e-7 + + +def predictions_to_boxes3d( + quat, + proj_ctr, + depth, + size, + locations, + inv_intrinsics, + canon_box_sizes, + min_depth, + max_depth, + scale_depth_by_focal_lengths_factor, + scale_depth_by_focal_lengths=True, + quat_is_allocentric=True, + depth_is_distance=False +): + # Normalize to make quat unit norm. + quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS) + # Make sure again it's numerically unit-norm. 
+ quat = quat / quat.norm(dim=1, keepdim=True) + + if scale_depth_by_focal_lengths: + pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1) + depth = depth / (pixel_size * scale_depth_by_focal_lengths_factor) + + if depth_is_distance: + depth = depth / unproject_points2d(locations, inv_intrinsics).norm(dim=1).clamp(min=EPS) + + depth = depth.reshape(-1, 1).clamp(min_depth, max_depth) + + proj_ctr = proj_ctr + locations + + if quat_is_allocentric: + quat = allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics) + + size = (size.tanh() + 1.) * canon_box_sizes # max size = 2 * canon_size + + return Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics) + + +class FCOS3DHead(nn.Module): + def __init__(self, + num_classes, + input_shape, + num_convs=4, + norm='BN', + use_scale=True, + depth_scale_init_factor=0.3, + proj_ctr_scale_init_factor=1.0, + use_per_level_predictors=False, + class_agnostic=False, + use_deformable=False, + mean_depth_per_level=None, + std_depth_per_level=None, + ): + super().__init__() + self.num_classes = num_classes + self.in_strides = [shape.stride for shape in input_shape] + self.num_levels = len(input_shape) + + self.use_scale = use_scale + self.depth_scale_init_factor = depth_scale_init_factor + self.proj_ctr_scale_init_factor = proj_ctr_scale_init_factor + self.use_per_level_predictors = use_per_level_predictors + + self.register_buffer("mean_depth_per_level", torch.Tensor(mean_depth_per_level)) + self.register_buffer("std_depth_per_level", torch.Tensor(std_depth_per_level)) + + in_channels = [s.channels for s in input_shape] + assert len(set(in_channels)) == 1, "Each level must have the same channel!" + in_channels = in_channels[0] + + if use_deformable: + raise ValueError("Not supported yet.") + + box3d_tower = [] + for i in range(num_convs): + if norm in ("BN", "FrozenBN", "SyncBN", "GN"): + # NOTE: need to add norm here! + # Each FPN level has its own batchnorm layer. + # NOTE: do not use dd3d train.py! + # "BN" is converted to "SyncBN" in distributed training (see train.py) + norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)]) + else: + norm_layer = get_norm(norm, in_channels) + box3d_tower.append( + Conv2d( + in_channels, + in_channels, + kernel_size=3, + stride=1, + padding=1, + bias=norm_layer is None, + norm=norm_layer, + activation=F.relu + ) + ) + self.add_module('box3d_tower', nn.Sequential(*box3d_tower)) + + num_classes = self.num_classes if not class_agnostic else 1 + num_levels = self.num_levels if use_per_level_predictors else 1 + + # 3D box branches. 
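+        # Five prediction branches, each a 3x3 conv (shared across levels unless
+        # use_per_level_predictors): quaternion (4), projected-center offset (2),
+        # depth (1), size (3) and 3D confidence (1) channels, each multiplied by
+        # `num_classes` unless the head is class-agnostic.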
+ self.box3d_quat = nn.ModuleList([ + Conv2d(in_channels, 4 * num_classes, kernel_size=3, stride=1, padding=1, bias=True) + for _ in range(num_levels) + ]) + self.box3d_ctr = nn.ModuleList([ + Conv2d(in_channels, 2 * num_classes, kernel_size=3, stride=1, padding=1, bias=True) + for _ in range(num_levels) + ]) + self.box3d_depth = nn.ModuleList([ + Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=(not self.use_scale)) + for _ in range(num_levels) + ]) + self.box3d_size = nn.ModuleList([ + Conv2d(in_channels, 3 * num_classes, kernel_size=3, stride=1, padding=1, bias=True) + for _ in range(num_levels) + ]) + self.box3d_conf = nn.ModuleList([ + Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=True) + for _ in range(num_levels) + ]) + + if self.use_scale: + self.scales_proj_ctr = nn.ModuleList([ + Scale(init_value=stride * self.proj_ctr_scale_init_factor) for stride in self.in_strides + ]) + # (pre-)compute (mean, std) of depth for each level, and determine the init value here. + self.scales_size = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)]) + self.scales_conf = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)]) + + self.scales_depth = nn.ModuleList([ + Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level + ]) + self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level]) + + self._init_weights() + + def _init_weights(self): + + for l in self.box3d_tower.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu') + if l.bias is not None: + torch.nn.init.constant_(l.bias, 0) + + predictors = [self.box3d_quat, self.box3d_ctr, self.box3d_depth, self.box3d_size, self.box3d_conf] + + for modules in predictors: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.kaiming_uniform_(l.weight, a=1) + if l.bias is not None: # depth head may not have bias. + torch.nn.init.constant_(l.bias, 0) + + def forward(self, x): + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf = [], [], [], [], [] + dense_depth = None + for l, features in enumerate(x): + box3d_tower_out = self.box3d_tower(features) + + _l = l if self.use_per_level_predictors else 0 + + # 3D box + quat = self.box3d_quat[_l](box3d_tower_out) + proj_ctr = self.box3d_ctr[_l](box3d_tower_out) + depth = self.box3d_depth[_l](box3d_tower_out) + size3d = self.box3d_size[_l](box3d_tower_out) + conf3d = self.box3d_conf[_l](box3d_tower_out) + + if self.use_scale: + # TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels? 
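+                # Depth passes through the per-level Scale (initialized to that level's
+                # depth std times depth_scale_init_factor) and Offset (initialized to the
+                # level's mean depth), so raw outputs start out centered on each FPN
+                # level's empirical depth statistics.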
+ proj_ctr = self.scales_proj_ctr[l](proj_ctr) + size3d = self.scales_size[l](size3d) + conf3d = self.scales_conf[l](conf3d) + depth = self.offsets_depth[l](self.scales_depth[l](depth)) + + box3d_quat.append(quat) + box3d_ctr.append(proj_ctr) + box3d_depth.append(depth) + box3d_size.append(size3d) + box3d_conf.append(conf3d) + + return box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth + + +class FCOS3DLoss(nn.Module): + def __init__(self, + num_classes, + min_depth=0.1, + max_depth=80.0, + box3d_loss_weight=2.0, + conf3d_loss_weight=1.0, + conf_3d_temperature=1.0, + smooth_l1_loss_beta=0.05, + max_loss_per_group=20, + predict_allocentric_rot=True, + scale_depth_by_focal_lengths=True, + scale_depth_by_focal_lengths_factor=500.0, + class_agnostic=False, + predict_distance=False, + canon_box_sizes=None): + super().__init__() + self.canon_box_sizes = canon_box_sizes + self.min_depth = min_depth + self.max_depth = max_depth + self.predict_allocentric_rot = predict_allocentric_rot + self.scale_depth_by_focal_lengths = scale_depth_by_focal_lengths + self.scale_depth_by_focal_lengths_factor = scale_depth_by_focal_lengths_factor + self.predict_distance = predict_distance + + self.box3d_reg_loss_fn = DisentangledBox3DLoss(smooth_l1_loss_beta, max_loss_per_group) + self.box3d_loss_weight = box3d_loss_weight + self.conf3d_loss_weight = conf3d_loss_weight + self.conf_3d_temperature = conf_3d_temperature + + self.num_classes = num_classes + self.class_agnostic = class_agnostic + + @force_fp32(apply_to=('box3d_quat', 'box3d_ctr', 'box3d_depth', 'box3d_size','box3d_conf', 'inv_intrinsics')) + def forward( + self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, fcos2d_info, + targets + ): + labels = targets['labels'] + box3d_targets = targets['box3d_targets'] + pos_inds = targets["pos_inds"] + + if pos_inds.numel() == 0: + losses = { + "loss_box3d_quat": torch.stack([x.sum() * 0. for x in box3d_quat]).sum(), + "loss_box3d_proj_ctr": torch.stack([x.sum() * 0. for x in box3d_ctr]).sum(), + "loss_box3d_depth": torch.stack([x.sum() * 0. for x in box3d_depth]).sum(), + "loss_box3d_size": torch.stack([x.sum() * 0. for x in box3d_size]).sum(), + "loss_conf3d": torch.stack([x.sum() * 0. 
for x in box3d_conf]).sum() + } + return losses + + if len(labels) != len(box3d_targets): + raise ValueError( + f"The size of 'labels' and 'box3d_targets' does not match: a={len(labels)}, b={len(box3d_targets)}" + ) + + num_classes = self.num_classes if not self.class_agnostic else 1 + + box3d_quat_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4, num_classes) for x in box3d_quat]) + box3d_ctr_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 2, num_classes) for x in box3d_ctr]) + box3d_depth_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_depth]) + box3d_size_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 3, num_classes) for x in box3d_size]) + box3d_conf_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_conf]) + + # ---------------------- + # 3D box disentangled loss + # ---------------------- + box3d_targets = box3d_targets[pos_inds] + + box3d_quat_pred = box3d_quat_pred[pos_inds] + box3d_ctr_pred = box3d_ctr_pred[pos_inds] + box3d_depth_pred = box3d_depth_pred[pos_inds] + box3d_size_pred = box3d_size_pred[pos_inds] + box3d_conf_pred = box3d_conf_pred[pos_inds] + + if self.class_agnostic: + box3d_quat_pred = box3d_quat_pred.squeeze(-1) + box3d_ctr_pred = box3d_ctr_pred.squeeze(-1) + box3d_depth_pred = box3d_depth_pred.squeeze(-1) + box3d_size_pred = box3d_size_pred.squeeze(-1) + box3d_conf_pred = box3d_conf_pred.squeeze(-1) + else: + I = labels[pos_inds][..., None, None] + box3d_quat_pred = torch.gather(box3d_quat_pred, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1) + box3d_ctr_pred = torch.gather(box3d_ctr_pred, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1) + box3d_depth_pred = torch.gather(box3d_depth_pred, dim=1, index=I.squeeze(-1)).squeeze(-1) + box3d_size_pred = torch.gather(box3d_size_pred, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1) + box3d_conf_pred = torch.gather(box3d_conf_pred, dim=1, index=I.squeeze(-1)).squeeze(-1) + + canon_box_sizes = box3d_quat_pred.new_tensor(self.canon_box_sizes)[labels[pos_inds]] + + locations = targets["locations"][pos_inds] + im_inds = targets["im_inds"][pos_inds] + inv_intrinsics = inv_intrinsics[im_inds] + + box3d_pred = predictions_to_boxes3d( + box3d_quat_pred, + box3d_ctr_pred, + box3d_depth_pred, + box3d_size_pred, + locations, + inv_intrinsics, + canon_box_sizes, + self.min_depth, + self.max_depth, + scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor, + scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths, + quat_is_allocentric=self.predict_allocentric_rot, + depth_is_distance=self.predict_distance + ) + + centerness_targets = fcos2d_info["centerness_targets"] + loss_denom = fcos2d_info["loss_denom"] + losses_box3d, box3d_l1_error = self.box3d_reg_loss_fn(box3d_pred, box3d_targets, locations, centerness_targets) + + losses_box3d = {k: self.box3d_loss_weight * v / loss_denom for k, v in losses_box3d.items()} + + conf_3d_targets = torch.exp(-1. 
/ self.conf_3d_temperature * box3d_l1_error) + loss_conf3d = F.binary_cross_entropy_with_logits(box3d_conf_pred, conf_3d_targets, reduction='none') + loss_conf3d = self.conf3d_loss_weight * (loss_conf3d * centerness_targets).sum() / loss_denom + + losses = {"loss_conf3d": loss_conf3d, **losses_box3d} + + return losses + + +class FCOS3DInference(): + def __init__(self, cfg): + self.canon_box_sizes = cfg.DD3D.FCOS3D.CANONICAL_BOX3D_SIZES + self.min_depth = cfg.DD3D.FCOS3D.MIN_DEPTH + self.max_depth = cfg.DD3D.FCOS3D.MAX_DEPTH + self.predict_allocentric_rot = cfg.DD3D.FCOS3D.PREDICT_ALLOCENTRIC_ROT + self.scale_depth_by_focal_lengths = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS + self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR + self.predict_distance = cfg.DD3D.FCOS3D.PREDICT_DISTANCE + + self.num_classes = cfg.DD3D.NUM_CLASSES + self.class_agnostic = cfg.DD3D.FCOS3D.CLASS_AGNOSTIC_BOX3D + + def __call__( + self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info + ): + # pred_instances: # List[List[Instances]], shape = (L, B) + for lvl, (box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl) in \ + enumerate(zip(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf)): + + # In-place modification: update per-level pred_instances. + self.forward_for_single_feature_map( + box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl, inv_intrinsics, + pred_instances[lvl], fcos2d_info[lvl] + ) # List of Instances; one for each image. + + def forward_for_single_feature_map( + self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info + ): + N = box3d_quat.shape[0] + + num_classes = self.num_classes if not self.class_agnostic else 1 + + box3d_quat = box3d_quat.permute(0, 2, 3, 1).reshape(N, -1, 4, num_classes) + box3d_ctr = box3d_ctr.permute(0, 2, 3, 1).reshape(N, -1, 2, num_classes) + box3d_depth = box3d_depth.permute(0, 2, 3, 1).reshape(N, -1, num_classes) + box3d_size = box3d_size.permute(0, 2, 3, 1).reshape(N, -1, 3, num_classes) + box3d_conf = box3d_conf.permute(0, 2, 3, 1).reshape(N, -1, num_classes).sigmoid() + + for i in range(N): + fg_inds_per_im = fcos2d_info['fg_inds_per_im'][i] + class_inds_per_im = fcos2d_info['class_inds_per_im'][i] + topk_indices = fcos2d_info['topk_indices'][i] + + box3d_quat_per_im = box3d_quat[i][fg_inds_per_im] + box3d_ctr_per_im = box3d_ctr[i][fg_inds_per_im] + box3d_depth_per_im = box3d_depth[i][fg_inds_per_im] + box3d_size_per_im = box3d_size[i][fg_inds_per_im] + box3d_conf_per_im = box3d_conf[i][fg_inds_per_im] + + if self.class_agnostic: + box3d_quat_per_im = box3d_quat_per_im.squeeze(-1) + box3d_ctr_per_im = box3d_ctr_per_im.squeeze(-1) + box3d_depth_per_im = box3d_depth_per_im.squeeze(-1) + box3d_size_per_im = box3d_size_per_im.squeeze(-1) + box3d_conf_per_im = box3d_conf_per_im.squeeze(-1) + else: + I = class_inds_per_im[..., None, None] + box3d_quat_per_im = torch.gather(box3d_quat_per_im, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1) + box3d_ctr_per_im = torch.gather(box3d_ctr_per_im, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1) + box3d_depth_per_im = torch.gather(box3d_depth_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1) + box3d_size_per_im = torch.gather(box3d_size_per_im, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1) + box3d_conf_per_im = torch.gather(box3d_conf_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1) + + if topk_indices is not None: + 
box3d_quat_per_im = box3d_quat_per_im[topk_indices] + box3d_ctr_per_im = box3d_ctr_per_im[topk_indices] + box3d_depth_per_im = box3d_depth_per_im[topk_indices] + box3d_size_per_im = box3d_size_per_im[topk_indices] + box3d_conf_per_im = box3d_conf_per_im[topk_indices] + + # scores_per_im = pred_instances[i].scores.square() + # NOTE: Before refactoring, the squared score was used. Is raw 2D score better? + scores_per_im = pred_instances[i].scores + scores_3d_per_im = scores_per_im * box3d_conf_per_im + + canon_box_sizes = box3d_quat.new_tensor(self.canon_box_sizes)[pred_instances[i].pred_classes] + inv_K = inv_intrinsics[i][None, ...].expand(len(box3d_quat_per_im), 3, 3) + locations = pred_instances[i].locations + pred_boxes3d = predictions_to_boxes3d( + box3d_quat_per_im, + box3d_ctr_per_im, + box3d_depth_per_im, + box3d_size_per_im, + locations, + inv_K, + canon_box_sizes, + self.min_depth, + self.max_depth, + scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor, + scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths, + quat_is_allocentric=self.predict_allocentric_rot, + depth_is_distance=self.predict_distance + ) + + # In-place modification: add fields to instances. + pred_instances[i].pred_boxes3d = pred_boxes3d + pred_instances[i].scores_3d = scores_3d_per_im diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py new file mode 100644 index 0000000..04a78d7 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/nuscenes_dd3d.py @@ -0,0 +1,525 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +import torch +import torch.nn.functional as F +from mmcv.losses.fvcore_smooth_l1_loss import smooth_l1_loss +from torch import nn + +from mmcv.structures import Instances +from mmcv.models.builder import HEADS +from mmcv.utils import force_fp32 +from torch import distributed as dist +from mmcv.modeling.postprocessing import detector_postprocess as resize_instances +from mmcv.layers import cat, Conv2d +from adzoo.bevformer.mmdet3d_plugin.dd3d.datasets.nuscenes import MAX_NUM_ATTRIBUTES +from .core import DD3D +from .prepare_targets import DD3DTargetPreparer +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.image_list import ImageList +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.comm import reduce_sum + +INF = 100000000. 
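+
+# A minimal sketch of the cross-GPU normalization pattern used by the losses in this
+# file: a per-rank foreground statistic is summed with `reduce_sum` and averaged over
+# the world size, clamped away from zero so ranks with empty batches do not divide by
+# zero. The helper name `_example_num_pos_avg` is illustrative only and is not used
+# elsewhere.
+def _example_num_pos_avg(pos_inds: torch.Tensor) -> float:
+    num_pos_local = pos_inds.numel()                                  # foreground locations on this rank
+    total_num_pos = reduce_sum(pos_inds.new_tensor([float(num_pos_local)])).item()  # summed over ranks
+    return max(total_num_pos / get_world_size(), 1.0)                 # safe denominator for sum-reduced losses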
+ +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + +class NuscenesDD3DTargetPreparer(DD3DTargetPreparer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + assert self.dd3d_enabled, f"{type(self).__name__} requires dd3d_enabled = True" + + def __call__(self, locations, gt_instances, feature_shapes): + num_loc_list = [len(loc) for loc in locations] + + # compute locations to size ranges + loc_to_size_range = [] + for l, loc_per_level in enumerate(locations): + loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l]) + loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)) + + loc_to_size_range = torch.cat(loc_to_size_range, dim=0) + locations = torch.cat(locations, dim=0) + + training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list) + + training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))] + training_targets["im_inds"] = [ + locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances)) + ] + + box2d = training_targets.pop("box2d", None) + + # transpose im first training_targets to level first ones + training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"} + + training_targets["fpn_levels"] = [ + loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"]) + ] + + # Flatten targets: (L x B x H x W, TARGET_SIZE) + labels = cat([x.reshape(-1) for x in training_targets["labels"]]) + box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]]) + + target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]]) + locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]]) + im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]]) + fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]]) + + pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1) + + targets = { + "labels": labels, + "box2d_reg_targets": box2d_reg_targets, + "locations": locations, + "target_inds": target_inds, + "im_inds": im_inds, + "fpn_levels": fpn_levels, + "pos_inds": pos_inds + } + + if self.dd3d_enabled: + box3d_targets = Boxes3D.cat(training_targets["box3d"]) + targets.update({"box3d_targets": box3d_targets}) + + if box2d is not None: + # Original format is B x L x (H x W, 4) + # Need to be in L x (B, 4, H, W). 
+ batched_box2d = [] + for lvl, per_lvl_box2d in enumerate(zip(*box2d)): + # B x (H x W, 4) + h, w = feature_shapes[lvl] + batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0) + batched_box2d.append(batched_box2d_lvl) + targets.update({"batched_box2d": batched_box2d}) + + # Nuscenes targets -- attribute / speed + attributes = cat([x.reshape(-1) for x in training_targets["attributes"]]) + speeds = cat([x.reshape(-1) for x in training_targets["speeds"]]) + + targets.update({'attributes': attributes, 'speeds': speeds}) + + return targets + + def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list): + labels = [] + box2d_reg = [] + + if self.dd3d_enabled: + box3d = [] + + target_inds = [] + xs, ys = locations[:, 0], locations[:, 1] + + # NuScenes targets -- attribute / speed + attributes, speeds = [], [] + + num_targets = 0 + for im_i in range(len(targets)): + targets_per_im = targets[im_i] + bboxes = targets_per_im.gt_boxes.tensor + labels_per_im = targets_per_im.gt_classes + + # no gt + if bboxes.numel() == 0: + labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes) + # reg_targets.append(locations.new_zeros((locations.size(0), 4))) + box2d_reg.append(locations.new_zeros((locations.size(0), 4))) + target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1) + + if self.dd3d_enabled: + box3d.append( + Boxes3D( + locations.new_zeros(locations.size(0), 4), + locations.new_zeros(locations.size(0), 2), + locations.new_zeros(locations.size(0), 1), + locations.new_zeros(locations.size(0), 3), + locations.new_zeros(locations.size(0), 3, 3), + ).to(torch.float32) + ) + # NOTE: attributes and speeds. + attributes.append(labels_per_im.new_zeros(locations.size(0))) + speeds.append(labels_per_im.new_zeros(locations.size(0))) + continue + + area = targets_per_im.gt_boxes.area() + + l = xs[:, None] - bboxes[:, 0][None] + t = ys[:, None] - bboxes[:, 1][None] + r = bboxes[:, 2][None] - xs[:, None] + b = bboxes[:, 3][None] - ys[:, None] + # reg_targets_per_im = torch.stack([l, t, r, b], dim=2) + box2d_reg_per_im = torch.stack([l, t, r, b], dim=2) + + if self.center_sample: + is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys) + else: + is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0 + + max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0] + # limit the regression range for each location + is_cared_in_the_level = \ + (max_reg_targets_per_im >= size_ranges[:, [0]]) & \ + (max_reg_targets_per_im <= size_ranges[:, [1]]) + + locations_to_gt_area = area[None].repeat(len(locations), 1) + locations_to_gt_area[is_in_boxes == 0] = INF + locations_to_gt_area[is_cared_in_the_level == 0] = INF + + # if there are still more than one objects for a location, + # we choose the one with minimal area + locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1) + + box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds] + target_inds_per_im = locations_to_gt_inds + num_targets + num_targets += len(targets_per_im) + + labels_per_im = labels_per_im[locations_to_gt_inds] + labels_per_im[locations_to_min_area == INF] = self.num_classes + + labels.append(labels_per_im) + box2d_reg.append(box2d_reg_per_im) + target_inds.append(target_inds_per_im) + + if self.dd3d_enabled: + # 3D box targets + box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds] + box3d.append(box3d_per_im) + + # NuScenes targets -- attribute / speed + attributes_per_im = 
targets_per_im.gt_attributes[locations_to_gt_inds]
+            speeds_per_im = targets_per_im.gt_speeds[locations_to_gt_inds]
+            attributes.append(attributes_per_im)
+            speeds.append(speeds_per_im)
+
+        ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
+        if self.dd3d_enabled:
+            ret.update({"box3d": box3d})
+
+        # NuScenes targets -- attribute / speed
+        ret.update({"attributes": attributes, "speeds": speeds})
+
+        return ret
+
+
+class NuscenesLoss(nn.Module):
+    def __init__(self, attr_loss_weight=0.2, speed_loss_weight=0.2):
+        super().__init__()
+        self.attr_loss_weight = attr_loss_weight
+        self.speed_loss_weight = speed_loss_weight
+
+    @force_fp32(apply_to=('attr_logits', 'speeds'))
+    def forward(self, attr_logits, speeds, fcos2d_info, targets):
+        # Flatten predictions
+        attr_logits = cat([x.permute(0, 2, 3, 1).reshape(-1, MAX_NUM_ATTRIBUTES) for x in attr_logits])
+        speeds = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in speeds])
+
+        pos_inds = targets['pos_inds']
+
+        losses = {}
+
+        # 1. Attributes
+        attr_logits = attr_logits[pos_inds]
+        target_attr = targets['attributes'][pos_inds]
+        valid_attr_mask = target_attr != MAX_NUM_ATTRIBUTES  # No attrs associated with class, or just attr missing.
+
+        if pos_inds.numel() == 0:
+            attr_weights = attr_logits.new_tensor(0.0)  # torch.tensor(0.0).cuda()
+        else:
+            attr_weights = fcos2d_info['centerness_targets'][valid_attr_mask]
+            # Denominator for all foreground losses -- re-computed for features with valid attributes.
+            # attr_loss_denom = max(reduce_sum(attr_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
+            # NOTE: compute attr_weights_sum, and then feed it to reduce_sum() works, but not above.
+            attr_weights_sum = attr_weights.sum()
+            attr_loss_denom = max(reduce_sum(attr_weights_sum).item() / get_world_size(), 1e-6)
+
+        if valid_attr_mask.sum() == 0:
+            losses.update({"loss_attr": attr_logits.sum() * 0.})
+        else:
+            attr_logits = attr_logits[valid_attr_mask]
+            target_attr = target_attr[valid_attr_mask]
+
+            # Per-sample cross entropy, so each term can be weighted by its centerness below.
+            xent = F.cross_entropy(attr_logits, target_attr, reduction="none")
+            loss_attr = (xent * attr_weights).sum() / attr_loss_denom
+
+            losses.update({"loss_attr": self.attr_loss_weight * loss_attr})
+
+        # 2. Speed
+        speeds = speeds[pos_inds]
+        target_speeds = targets['speeds'][pos_inds]
+        # NOTE: some GT speeds are NaN.
+        valid_gt_mask = torch.logical_not(torch.isnan(target_speeds))
+
+        if pos_inds.numel() == 0:
+            speed_weights = speeds.new_tensor(0.0)  # torch.tensor(0.0).cuda()
+        else:
+            speed_weights = fcos2d_info['centerness_targets'][valid_gt_mask]
+            # Denominator for all foreground losses -- re-computed for features with valid speeds.
+            # speed_loss_denom = max(reduce_sum(speed_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
+            speed_weights_sum = speed_weights.sum()
+            speed_loss_denom = max(reduce_sum(speed_weights_sum).item() / get_world_size(), 1e-6)
+
+        # NOTE: move after reduce_sum
+        if pos_inds.numel() == 0:
+            losses = {"loss_attr": attr_logits.sum() * 0., "loss_speed": speeds.sum() * 0.}
+            # NOTE: This is probably unreachable, because training filters out images with empty annotations.
+            # NOTE: If not, attr_weights can be unavailable in the reduce_sum() call below.
+ return losses + + if valid_gt_mask.sum() == 0: + losses.update({"loss_speed": speeds.sum() * 0.}) + # return losses + else: + speeds = speeds[valid_gt_mask] + target_speeds = target_speeds[valid_gt_mask] + + l1_error = smooth_l1_loss(speeds, target_speeds, beta=0.05) + loss_speed = (l1_error * speed_weights).sum() / speed_loss_denom + losses.update({"loss_speed": self.speed_loss_weight * loss_speed}) + + return losses + + +class NuscenesInference(): + def __init__(self, cfg): + pass + + def __call__(self, attr_logits, speeds, pred_instances, fcos2d_info): + """Add 'pred_attribute', 'pred_speed' to Instances in 'pred_instances'.""" + N = attr_logits[0].shape[0] + for lvl, (attr_logits_lvl, speed_lvl, info_lvl, instances_lvl) in \ + enumerate(zip(attr_logits, speeds, fcos2d_info, pred_instances)): + + attr_logits_lvl = attr_logits_lvl.permute(0, 2, 3, 1).reshape(N, -1, MAX_NUM_ATTRIBUTES) + speed_lvl = speed_lvl.permute(0, 2, 3, 1).reshape(N, -1) + for i in range(N): + fg_inds_per_im = info_lvl['fg_inds_per_im'][i] + topk_indices = info_lvl['topk_indices'][i] + + attr_logits_per_im = attr_logits_lvl[i][fg_inds_per_im] + speed_per_im = speed_lvl[i][fg_inds_per_im] + + if topk_indices is not None: + attr_logits_per_im = attr_logits_per_im[topk_indices] + speed_per_im = speed_per_im[topk_indices] + + if len(attr_logits_per_im) == 0: + instances_lvl[i].pred_attributes = instances_lvl[i].pred_classes.new_tensor([]) + instances_lvl[i].pred_speeds = instances_lvl[i].scores.new_tensor([]) + else: + instances_lvl[i].pred_attributes = attr_logits_per_im.argmax(dim=1) + instances_lvl[i].pred_speeds = speed_per_im + + +@HEADS.register_module() +class NuscenesDD3D(DD3D): + def __init__(self, + num_classes, + in_channels, + strides, + fcos2d_cfg=dict(), + fcos2d_loss_cfg=dict(), + fcos3d_cfg=dict(), + fcos3d_loss_cfg=dict(), + target_assign_cfg=dict(), + nusc_loss_weight=dict(), + box3d_on=True, + feature_locations_offset="none"): + super().__init__(num_classes, + in_channels, + strides, + fcos2d_cfg=fcos2d_cfg, + fcos2d_loss_cfg=fcos2d_loss_cfg, + fcos3d_cfg=fcos3d_cfg, + fcos3d_loss_cfg=fcos3d_loss_cfg, + target_assign_cfg=target_assign_cfg, + box3d_on=box3d_on, + feature_locations_offset=feature_locations_offset) + + # backbone_output_shape = self.backbone_output_shape + # in_channels = backbone_output_shape[0].channels + + # -------------------------------------------------------------------------- + # NuScenes predictions -- attribute / speed, computed from cls_tower output. + # -------------------------------------------------------------------------- + self.attr_logits = Conv2d(in_channels, MAX_NUM_ATTRIBUTES, kernel_size=3, stride=1, padding=1, bias=True) + self.speed = Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True, activation=F.relu) + + # init weights + for modules in [self.attr_logits, self.speed]: + for l in modules.modules(): + if isinstance(l, nn.Conv2d): + torch.nn.init.kaiming_uniform_(l.weight, a=1) + if l.bias is not None: # depth head may not have bias. 
+ torch.nn.init.constant_(l.bias, 0) + + # Re-define target preparer + del self.prepare_targets + self.prepare_targets = NuscenesDD3DTargetPreparer(num_classes=num_classes, + input_shape=self.backbone_output_shape, + box3d_on=box3d_on, + **target_assign_cfg) + + self.nuscenes_loss = NuscenesLoss(**nusc_loss_weight) + # NOTE: inference later + # self.nuscenes_inference = NuscenesInference(cfg) + + # self.num_images_per_sample = cfg.MODEL.FCOS3D.NUSC_NUM_IMAGES_PER_SAMPLE + # NOTE: inference later + # self.num_images_per_sample = cfg.DD3D.NUSC.INFERENCE.NUM_IMAGES_PER_SAMPLE + + # assert self.num_images_per_sample == 6 + # assert cfg.DATALOADER.TEST.NUM_IMAGES_PER_GROUP == 6 + + # NOTE: NuScenes evaluator allows max. 500 detections per sample. + # self.max_num_dets_per_sample = cfg.DD3D.NUSC.INFERENCE.MAX_NUM_DETS_PER_SAMPLE + + @force_fp32(apply_to=('features')) + def forward(self, features, batched_inputs): + # NOTE: + # images = [x["image"].to(self.device) for x in batched_inputs] + # images = [self.preprocess_image(x) for x in images] + + # NOTE: directly use inv_intrinsics + # if 'intrinsics' in batched_inputs[0]: + # intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs] + # else: + # intrinsics = None + # images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics) + if 'inv_intrinsics' in batched_inputs[0]: + inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs] + inv_intrinsics = torch.stack(inv_intrinsics, dim=0) + else: + inv_intrinsics = None + + # NOTE: + # gt_dense_depth = None + # if 'depth' in batched_inputs[0]: + # gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs] + # gt_dense_depth = ImageList.from_tensors( + # gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics + # ) + + # NOTE: directly input feature + # features = self.backbone(images.tensor) + # features = [features[f] for f in self.in_features] + + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs] + else: + gt_instances = None + + locations = self.compute_locations(features) + logits, box2d_reg, centerness, fcos2d_extra_output = self.fcos2d_head(features) + if not self.only_box2d: + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features) + # NOTE: directly use inv_intrinsics + # inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None + + # -------------------------------------------------------------------------- + # NuScenes predictions -- attribute / speed, computed from cls_tower output. 
+ # -------------------------------------------------------------------------- + attr_logits, speeds = [], [] + for x in fcos2d_extra_output['cls_tower_out']: + attr_logits.append(self.attr_logits(x)) + speeds.append(self.speed(x)) + + if self.training: + assert gt_instances is not None + feature_shapes = [x.shape[-2:] for x in features] + training_targets = self.prepare_targets(locations, gt_instances, feature_shapes) + # NOTE: + # if gt_dense_depth is not None: + # training_targets.update({"dense_depth": gt_dense_depth}) + + losses = {} + fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets) + losses.update(fcos2d_loss) + + if not self.only_box2d: + fcos3d_loss = self.fcos3d_loss( + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, + fcos2d_info, training_targets + ) + losses.update(fcos3d_loss) + + # Nuscenes loss -- attribute / speed + nuscenes_loss = self.nuscenes_loss(attr_logits, speeds, fcos2d_info, training_targets) + losses.update(nuscenes_loss) + return losses + else: + # TODO: do not support inference now + raise NotImplementedError + pred_instances, fcos2d_info = self.fcos2d_inference( + logits, box2d_reg, centerness, locations, images.image_sizes + ) + if not self.only_box2d: + # This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances'. + self.fcos3d_inference( + box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, + fcos2d_info + ) + score_key = "scores_3d" + else: + score_key = "scores" + + # This adds 'pred_attributes', 'pred_speed' to Instances in 'pred_instances'. + self.nuscenes_inference(attr_logits, speeds, pred_instances, fcos2d_info) + + # Transpose to "image-first", i.e. (B, L) + pred_instances = list(zip(*pred_instances)) + pred_instances = [Instances.cat(instances) for instances in pred_instances] + + # 2D NMS and pick top-K. + if self.do_nms: + pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key) + + if not self.only_box2d and self.do_bev_nms: + # Bird-eye-view NMS. + dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)} + if 'pose' in batched_inputs[0]: + poses = [x['pose'] for x in batched_inputs] + else: + poses = [x['extrinsics'] for x in batched_inputs] + pred_instances = nuscenes_sample_aggregate( + pred_instances, + dummy_group_idxs, + self.num_classes, + poses, + iou_threshold=self.bev_nms_iou_thresh, + include_boxes3d_global=False + ) + + if self.postprocess_in_inference: + processed_results = [] + for results_per_image, input_per_image, image_size in \ + zip(pred_instances, batched_inputs, images.image_sizes): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = resize_instances(results_per_image, height, width) + processed_results.append({"instances": r}) + + # ---------------------------------------------------------- + # NuScenes specific: cross-image (i.e. sample-level) BEV NMS. 
+ # ---------------------------------------------------------- + sample_tokens = [x['sample_token'] for x in batched_inputs] + group_idxs = get_group_idxs(sample_tokens, self.num_images_per_sample) + + instances = [x['instances'] for x in processed_results] + global_poses = [x['pose'] for x in batched_inputs] + + filtered_instances = nuscenes_sample_aggregate( + instances, + group_idxs, + self.num_classes, + global_poses, + self.bev_nms_iou_thresh, + max_num_dets_per_sample=self.max_num_dets_per_sample + ) + processed_results = [{"instances": x} for x in filtered_instances] + else: + processed_results = [{"instances": x} for x in pred_instances] + + return processed_results diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/prepare_targets.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/prepare_targets.py new file mode 100644 index 0000000..91f76b5 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/modeling/prepare_targets.py @@ -0,0 +1,242 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +import torch + +from mmcv.layers import cat + +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D + +INF = 100000000. + + +class DD3DTargetPreparer(): + def __init__(self, + num_classes, + input_shape, + box3d_on=True, + center_sample=True, + pos_radius=1.5, + sizes_of_interest=None): + self.num_classes = num_classes + self.center_sample = center_sample + self.strides = [shape.stride for shape in input_shape] + self.radius = pos_radius + self.dd3d_enabled = box3d_on + + # generate sizes of interest + # NOTE: + # soi = [] + # prev_size = -1 + # for s in sizes_of_interest: + # soi.append([prev_size, s]) + # prev_size = s + # soi.append([prev_size, INF]) + self.sizes_of_interest = sizes_of_interest + + def __call__(self, locations, gt_instances, feature_shapes): + num_loc_list = [len(loc) for loc in locations] + + # compute locations to size ranges + loc_to_size_range = [] + for l, loc_per_level in enumerate(locations): + loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l]) + loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1)) + + loc_to_size_range = torch.cat(loc_to_size_range, dim=0) + locations = torch.cat(locations, dim=0) + + training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list) + + training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))] + training_targets["im_inds"] = [ + locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances)) + ] + + box2d = training_targets.pop("box2d", None) + + # transpose im first training_targets to level first ones + training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"} + + training_targets["fpn_levels"] = [ + loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"]) + ] + + # Flatten targets: (L x B x H x W, TARGET_SIZE) + labels = cat([x.reshape(-1) for x in training_targets["labels"]]) + box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]]) + + target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]]) + locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]]) + im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]]) + fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]]) + + pos_inds = torch.nonzero(labels != 
self.num_classes).squeeze(1) + + targets = { + "labels": labels, + "box2d_reg_targets": box2d_reg_targets, + "locations": locations, + "target_inds": target_inds, + "im_inds": im_inds, + "fpn_levels": fpn_levels, + "pos_inds": pos_inds + } + + if self.dd3d_enabled: + box3d_targets = Boxes3D.cat(training_targets["box3d"]) + targets.update({"box3d_targets": box3d_targets}) + + if box2d is not None: + # Original format is B x L x (H x W, 4) + # Need to be in L x (B, 4, H, W). + batched_box2d = [] + for lvl, per_lvl_box2d in enumerate(zip(*box2d)): + # B x (H x W, 4) + h, w = feature_shapes[lvl] + batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0) + batched_box2d.append(batched_box2d_lvl) + targets.update({"batched_box2d": batched_box2d}) + + return targets + + def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list): + labels = [] + box2d_reg = [] + + if self.dd3d_enabled: + box3d = [] + + target_inds = [] + xs, ys = locations[:, 0], locations[:, 1] + + num_targets = 0 + for im_i in range(len(targets)): + targets_per_im = targets[im_i] + bboxes = targets_per_im.gt_boxes.tensor + labels_per_im = targets_per_im.gt_classes + + # no gt + if bboxes.numel() == 0: + labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes) + # reg_targets.append(locations.new_zeros((locations.size(0), 4))) + box2d_reg.append(locations.new_zeros((locations.size(0), 4))) + target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1) + + if self.dd3d_enabled: + box3d.append( + Boxes3D( + locations.new_zeros(locations.size(0), 4), + locations.new_zeros(locations.size(0), 2), + locations.new_zeros(locations.size(0), 1), + locations.new_zeros(locations.size(0), 3), + locations.new_zeros(locations.size(0), 3, 3), + ).to(torch.float32) + ) + continue + + area = targets_per_im.gt_boxes.area() + + l = xs[:, None] - bboxes[:, 0][None] + t = ys[:, None] - bboxes[:, 1][None] + r = bboxes[:, 2][None] - xs[:, None] + b = bboxes[:, 3][None] - ys[:, None] + # reg_targets_per_im = torch.stack([l, t, r, b], dim=2) + box2d_reg_per_im = torch.stack([l, t, r, b], dim=2) + + if self.center_sample: + is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys) + else: + is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0 + + max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0] + # limit the regression range for each location + is_cared_in_the_level = \ + (max_reg_targets_per_im >= size_ranges[:, [0]]) & \ + (max_reg_targets_per_im <= size_ranges[:, [1]]) + + locations_to_gt_area = area[None].repeat(len(locations), 1) + locations_to_gt_area[is_in_boxes == 0] = INF + locations_to_gt_area[is_cared_in_the_level == 0] = INF + + # if there are still more than one objects for a location, + # we choose the one with minimal area + locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1) + + box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds] + target_inds_per_im = locations_to_gt_inds + num_targets + num_targets += len(targets_per_im) + + labels_per_im = labels_per_im[locations_to_gt_inds] + labels_per_im[locations_to_min_area == INF] = self.num_classes + + labels.append(labels_per_im) + box2d_reg.append(box2d_reg_per_im) + target_inds.append(target_inds_per_im) + + if self.dd3d_enabled: + # 3D box targets + box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds] + box3d.append(box3d_per_im) + + ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds} + if self.dd3d_enabled: + 
ret.update({"box3d": box3d}) + + return ret + + def get_sample_region(self, boxes, num_loc_list, loc_xs, loc_ys): + center_x = boxes[..., [0, 2]].sum(dim=-1) * 0.5 + center_y = boxes[..., [1, 3]].sum(dim=-1) * 0.5 + + num_gts = boxes.shape[0] + K = len(loc_xs) + boxes = boxes[None].expand(K, num_gts, 4) + center_x = center_x[None].expand(K, num_gts) + center_y = center_y[None].expand(K, num_gts) + center_gt = boxes.new_zeros(boxes.shape) + # no gt + if center_x.numel() == 0 or center_x[..., 0].sum() == 0: + return loc_xs.new_zeros(loc_xs.shape, dtype=torch.uint8) + beg = 0 + for level, num_loc in enumerate(num_loc_list): + end = beg + num_loc + stride = self.strides[level] * self.radius + xmin = center_x[beg:end] - stride + ymin = center_y[beg:end] - stride + xmax = center_x[beg:end] + stride + ymax = center_y[beg:end] + stride + # limit sample region in gt + center_gt[beg:end, :, 0] = torch.where(xmin > boxes[beg:end, :, 0], xmin, boxes[beg:end, :, 0]) + center_gt[beg:end, :, 1] = torch.where(ymin > boxes[beg:end, :, 1], ymin, boxes[beg:end, :, 1]) + center_gt[beg:end, :, 2] = torch.where(xmax > boxes[beg:end, :, 2], boxes[beg:end, :, 2], xmax) + center_gt[beg:end, :, 3] = torch.where(ymax > boxes[beg:end, :, 3], boxes[beg:end, :, 3], ymax) + beg = end + left = loc_xs[:, None] - center_gt[..., 0] + right = center_gt[..., 2] - loc_xs[:, None] + top = loc_ys[:, None] - center_gt[..., 1] + bottom = center_gt[..., 3] - loc_ys[:, None] + center_bbox = torch.stack((left, top, right, bottom), -1) + inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 + return inside_gt_bbox_mask + + def _transpose(self, training_targets, num_loc_list): + ''' + This function is used to transpose image first training targets to level first ones + :return: level first training targets + ''' + if isinstance(training_targets[0], Boxes3D): + for im_i in range(len(training_targets)): + # training_targets[im_i] = torch.split(training_targets[im_i], num_loc_list, dim=0) + training_targets[im_i] = training_targets[im_i].split(num_loc_list, dim=0) + + targets_level_first = [] + for targets_per_level in zip(*training_targets): + targets_level_first.append(Boxes3D.cat(targets_per_level, dim=0)) + return targets_level_first + + for im_i in range(len(training_targets)): + training_targets[im_i] = torch.split(training_targets[im_i], num_loc_list, dim=0) + + targets_level_first = [] + for targets_per_level in zip(*training_targets): + targets_level_first.append(torch.cat(targets_per_level, dim=0)) + return targets_level_first diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/__init__.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/__init__.py new file mode 100644 index 0000000..3857649 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +from .image_list import ImageList diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/boxes3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/boxes3d.py new file mode 100644 index 0000000..0823602 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/boxes3d.py @@ -0,0 +1,321 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. 
+import numpy as np +import torch +from pyquaternion import Quaternion +from torch.cuda import amp + +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import unproject_points2d +import adzoo.bevformer.mmdet3d_plugin.dd3d.structures.transform3d as t3d +# yapf: disable +BOX3D_CORNER_MAPPING = [ + [1, 1, 1, 1, -1, -1, -1, -1], + [1, -1, -1, 1, 1, -1, -1, 1], + [1, 1, -1, -1, 1, 1, -1, -1] +] +# yapf: enable + +def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor: + """ + Convert rotations given as quaternions to rotation matrices. + + Args: + quaternions: quaternions with real part first, + as tensor of shape (..., 4). + + Returns: + Rotation matrices as tensor of shape (..., 3, 3). + """ + r, i, j, k = torch.unbind(quaternions, -1) + two_s = 2.0 / (quaternions * quaternions).sum(-1) + + o = torch.stack( + ( + 1 - two_s * (j * j + k * k), + two_s * (i * j - k * r), + two_s * (i * k + j * r), + two_s * (i * j + k * r), + 1 - two_s * (i * i + k * k), + two_s * (j * k - i * r), + two_s * (i * k - j * r), + two_s * (j * k + i * r), + 1 - two_s * (i * i + j * j), + ), + -1, + ) + return o.reshape(quaternions.shape[:-1] + (3, 3)) + +def _to_tensor(x, dim): + if isinstance(x, torch.Tensor): + x = x.to(torch.float32) + elif isinstance(x, np.ndarray) or isinstance(x, list) or isinstance(x, tuple): + x = torch.tensor(x, dtype=torch.float32) + elif isinstance(x, Quaternion): + x = torch.tensor(x.elements, dtype=torch.float32) + else: + raise ValueError(f"Unsupported type: {type(x).__name__}") + + if x.ndim == 1: + x = x.reshape(-1, dim) + elif x.ndim > 2: + raise ValueError(f"Invalid shape of input: {x.shape.__str__()}") + return x + + +class GenericBoxes3D(): + def __init__(self, quat, tvec, size): + self.quat = _to_tensor(quat, dim=4) + self._tvec = _to_tensor(tvec, dim=3) + self.size = _to_tensor(size, dim=3) + + @property + def tvec(self): + return self._tvec + + @property + @amp.autocast(enabled=False) + def corners(self): + allow_tf32 = torch.backends.cuda.matmul.allow_tf32 + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + + translation = t3d.Translate(self.tvec, device=self.device) + + R = quaternion_to_matrix(self.quat) + rotation = t3d.Rotate(R=R.transpose(1, 2), device=self.device) # Need to transpose to make it work. + + tfm = rotation.compose(translation) + + _corners = 0.5 * self.quat.new_tensor(BOX3D_CORNER_MAPPING).T + # corners_in_obj_frame = self.size.unsqueeze(1) * _corners.unsqueeze(0) + lwh = self.size[:, [1, 0, 2]] # wlh -> lwh + corners_in_obj_frame = lwh.unsqueeze(1) * _corners.unsqueeze(0) + + corners3d = tfm.transform_points(corners_in_obj_frame) + torch.backends.cuda.matmul.allow_tf32 = allow_tf32 + torch.backends.cudnn.allow_tf32 = allow_tf32 + return corners3d + + @classmethod + def from_vectors(cls, vecs, device="cpu"): + """ + Parameters + ---------- + vecs: Iterable[np.ndarray] + Iterable of 10D pose representation. + + intrinsics: np.ndarray + (3, 3) intrinsics matrix. 
+ """ + quats, tvecs, sizes = [], [], [] + for vec in vecs: + quat = vec[:4] + tvec = vec[4:7] + size = vec[7:] + + quats.append(quat) + tvecs.append(tvec) + sizes.append(size) + + quats = torch.as_tensor(quats, dtype=torch.float32, device=device) + tvecs = torch.as_tensor(tvecs, dtype=torch.float32, device=device) + sizes = torch.as_tensor(sizes, device=device) + + return cls(quats, tvecs, sizes) + + @classmethod + def cat(cls, boxes_list, dim=0): + + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0), torch.empty(0), torch.empty(0)) + assert all([isinstance(box, GenericBoxes3D) for box in boxes_list]) + + # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input + quat = torch.cat([b.quat for b in boxes_list], dim=dim) + tvec = torch.cat([b.tvec for b in boxes_list], dim=dim) + size = torch.cat([b.size for b in boxes_list], dim=dim) + + cat_boxes = cls(quat, tvec, size) + return cat_boxes + + def split(self, split_sizes, dim=0): + assert sum(split_sizes) == len(self) + quat_list = torch.split(self.quat, split_sizes, dim=dim) + tvec_list = torch.split(self.tvec, split_sizes, dim=dim) + size_list = torch.split(self.size, split_sizes, dim=dim) + + return [GenericBoxes3D(*x) for x in zip(quat_list, tvec_list, size_list)] + + def __getitem__(self, item): + """ + """ + if isinstance(item, int): + return GenericBoxes3D(self.quat[item].view(1, -1), self.tvec[item].view(1, -1), self.size[item].view(1, -1)) + + quat = self.quat[item] + tvec = self.tvec[item] + size = self.size[item] + + assert quat.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert tvec.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert size.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + + return GenericBoxes3D(quat, tvec, size) + + def __len__(self): + assert len(self.quat) == len(self.tvec) == len(self.size) + return self.quat.shape[0] + + def clone(self): + """ + """ + return GenericBoxes3D(self.quat.clone(), self.tvec.clone(), self.size.clone()) + + def vectorize(self): + xyz = self.tvec + return torch.cat([self.quat, xyz, self.size], dim=1) + + @property + def device(self): + return self.quat.device + + def to(self, *args, **kwargs): + quat = self.quat.to(*args, **kwargs) + tvec = self.tvec.to(*args, **kwargs) + size = self.size.to(*args, **kwargs) + return GenericBoxes3D(quat, tvec, size) + + +class Boxes3D(GenericBoxes3D): + """Vision-based 3D box container. + + The tvec is computed from projected center, depth, and intrinsics. + """ + def __init__(self, quat, proj_ctr, depth, size, inv_intrinsics): + self.quat = quat + self.proj_ctr = proj_ctr + self.depth = depth + self.size = size + self.inv_intrinsics = inv_intrinsics + + @property + def tvec(self): + ray = unproject_points2d(self.proj_ctr, self.inv_intrinsics) + xyz = ray * self.depth + return xyz + + @classmethod + def from_vectors(cls, vecs, intrinsics, device="cpu"): + """ + Parameters + ---------- + vecs: Iterable[np.ndarray] + Iterable of 10D pose representation. + + intrinsics: np.ndarray + (3, 3) intrinsics matrix. 
+ """ + if len(vecs) == 0: + quats = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 4) + proj_ctrs = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 2) + depths = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 1) + sizes = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 3) + inv_intrinsics = torch.as_tensor([], dtype=torch.float32, device=device).view(-1, 3, 3) + return cls(quats, proj_ctrs, depths, sizes, inv_intrinsics) + + quats, proj_ctrs, depths, sizes = [], [], [], [] + for vec in vecs: + quat = vec[:4] + + proj_ctr = intrinsics.dot(vec[4:7]) + proj_ctr = proj_ctr[:2] / proj_ctr[-1] + + depth = vec[6:7] + + size = vec[7:] + + quats.append(quat) + proj_ctrs.append(proj_ctr) + depths.append(depth) + sizes.append(size) + + quats = torch.as_tensor(np.array(quats), dtype=torch.float32, device=device) + proj_ctrs = torch.as_tensor(np.array(proj_ctrs), dtype=torch.float32, device=device) + depths = torch.as_tensor(np.array(depths), dtype=torch.float32, device=device) + sizes = torch.as_tensor(np.array(sizes), dtype=torch.float32, device=device) + + inv_intrinsics = np.linalg.inv(intrinsics) + inv_intrinsics = torch.as_tensor(inv_intrinsics[None, ...], device=device).expand(len(vecs), 3, 3) + + return cls(quats, proj_ctrs, depths, sizes, inv_intrinsics) + + @classmethod + def cat(cls, boxes_list, dim=0): + + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0), torch.empty(0), torch.empty(0), torch.empty(0), torch.empty(0)) + assert all([isinstance(box, Boxes3D) for box in boxes_list]) + + # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input + quat = torch.cat([b.quat for b in boxes_list], dim=dim) + proj_ctr = torch.cat([b.proj_ctr for b in boxes_list], dim=dim) + depth = torch.cat([b.depth for b in boxes_list], dim=dim) + size = torch.cat([b.size for b in boxes_list], dim=dim) + inv_intrinsics = torch.cat([b.inv_intrinsics for b in boxes_list], dim=dim) + + cat_boxes = cls(quat, proj_ctr, depth, size, inv_intrinsics) + return cat_boxes + + def split(self, split_sizes, dim=0): + assert sum(split_sizes) == len(self) + quat_list = torch.split(self.quat, split_sizes, dim=dim) + proj_ctr_list = torch.split(self.proj_ctr, split_sizes, dim=dim) + depth_list = torch.split(self.depth, split_sizes, dim=dim) + size_list = torch.split(self.size, split_sizes, dim=dim) + inv_K_list = torch.split(self.inv_intrinsics, split_sizes, dim=dim) + + return [Boxes3D(*x) for x in zip(quat_list, proj_ctr_list, depth_list, size_list, inv_K_list)] + + def __getitem__(self, item): + """ + """ + if isinstance(item, int): + return Boxes3D( + self.quat[item].view(1, -1), self.proj_ctr[item].view(1, -1), self.depth[item].view(1, -1), + self.size[item].view(1, -1), self.inv_intrinsics[item].view(1, 3, 3) + ) + + quat = self.quat[item] + ctr = self.proj_ctr[item] + depth = self.depth[item] + size = self.size[item] + inv_K = self.inv_intrinsics[item] + + assert quat.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert ctr.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert depth.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert size.dim() == 2, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert inv_K.dim() == 3, "Indexing on Boxes3D with {} failed to return a matrix!".format(item) + assert inv_K.shape[1:] == (3, 3), "Indexing on 
Boxes3D with {} failed to return a matrix!".format(item) + + return Boxes3D(quat, ctr, depth, size, inv_K) + + def __len__(self): + assert len(self.quat) == len(self.proj_ctr) == len(self.depth) == len(self.size) == len(self.inv_intrinsics) + return self.quat.shape[0] + + def clone(self): + """ + """ + return Boxes3D( + self.quat.clone(), self.proj_ctr.clone(), self.depth.clone(), self.size.clone(), self.inv_intrinsics.clone() + ) + + def to(self, *args, **kwargs): + quat = self.quat.to(*args, **kwargs) + proj_ctr = self.proj_ctr.to(*args, **kwargs) + depth = self.depth.to(*args, **kwargs) + size = self.size.to(*args, **kwargs) + inv_K = self.inv_intrinsics.to(*args, **kwargs) + return Boxes3D(quat, proj_ctr, depth, size, inv_K) diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/image_list.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/image_list.py new file mode 100644 index 0000000..f27b3c0 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/image_list.py @@ -0,0 +1,157 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright 2021 Toyota Research Institute. All rights reserved. +from __future__ import division + +from typing import Any, List, Sequence, Tuple + +import torch +from torch import device +from torch.nn import functional as F + +TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) + +def _as_tensor(x: Tuple[int, int]) -> torch.Tensor: + """ + An equivalent of `torch.as_tensor`, but works under tracing if input + is a list of tensor. `torch.as_tensor` will record a constant in tracing, + but this function will use `torch.stack` instead. + """ + if torch.jit.is_scripting(): + return torch.as_tensor(x) + if isinstance(x, (list, tuple)) and all([isinstance(t, torch.Tensor) for t in x]): + return torch.stack(x) + return torch.as_tensor(x) + + +class ImageList(object): + """ + Adapted from detectron2: + https://github.com/facebookresearch/detectron2/blob/master/detectron2/structures/image_list.py) + + Key differences: + - add optional intrinsics + - add optional image path (useful for debugging) + ================================================================================================================== + + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + + Attributes: + image_sizes (list[tuple[int, int]]): each tuple is (h, w) + """ + def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]], intrinsics=None, image_paths=None): + """ + Arguments: + tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 + image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can + be smaller than (H, W) due to padding. + """ + self.tensor = tensor + self.image_sizes = image_sizes + self._intrinsics = intrinsics + self._image_paths = image_paths + + @property + def intrinsics(self): + if torch.allclose(self._intrinsics[0], torch.eye(3, device=self._intrinsics.device)): + # TODO: torch.inverse(images.intrinsics) often return identity, when it shouldn't. Is it pytorch bug? + raise ValueError("Intrinsics is Identity.") + return self._intrinsics + + @property + def image_paths(self): + return self._image_paths + + def __len__(self) -> int: + return len(self.image_sizes) + + def __getitem__(self, idx) -> torch.Tensor: + """ + Access the individual image in its original size. 
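+ For example, `image_list[0]` returns the first image cropped back to its
+ original, pre-padding size `image_sizes[0]`.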
+ + Args: + idx: int or slice + + Returns: + Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 + """ + size = self.image_sizes[idx] + return self.tensor[idx, ..., :size[0], :size[1]] + + @torch.jit.unused + def to(self, *args: Any, **kwargs: Any) -> "ImageList": + cast_tensor = self.tensor.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes, intrinsics=self.intrinsics) + + @property + def device(self) -> device: + return self.tensor.device + + @staticmethod + def from_tensors( + tensors: List[torch.Tensor], + size_divisibility: int = 0, + pad_value: float = 0.0, + intrinsics=None, + image_paths=None + ) -> "ImageList": + """ + Args: + tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or + (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded + to the same shape with `pad_value`. + size_divisibility (int): If `size_divisibility > 0`, add padding to ensure + the common height and width is divisible by `size_divisibility`. + This depends on the model and many models need a divisibility of 32. + pad_value (float): value to pad + + Returns: + an `ImageList`. + """ + assert len(tensors) > 0 + assert isinstance(tensors, (tuple, list)) + for t in tensors: + assert isinstance(t, torch.Tensor), type(t) + assert t.shape[:-2] == tensors[0].shape[:-2], t.shape + + image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors] + image_sizes_tensor = [_as_tensor(x) for x in image_sizes] + max_size = torch.stack(image_sizes_tensor).max(0).values + + if size_divisibility > 1: + stride = size_divisibility + # the last two dims are H,W, both subject to divisibility requirement + max_size = torch.div(max_size + (stride - 1), stride, rounding_mode='floor') * stride + + # handle weirdness of scripting and tracing ... + if torch.jit.is_scripting(): + max_size: List[int] = max_size.to(dtype=torch.long).tolist() + else: + # https://github.com/pytorch/pytorch/issues/42448 + if TORCH_VERSION >= (1, 7) and torch.jit.is_tracing(): + image_sizes = image_sizes_tensor + + if len(tensors) == 1: + # This seems slightly (2%) faster. + # TODO: check whether it's faster for multiple images as well + image_size = image_sizes[0] + padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]] + batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0) + else: + # max_size can be a tensor in tracing mode, therefore convert to list + batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) + batched_imgs = tensors[0].new_full(batch_shape, pad_value) + for img, pad_img in zip(tensors, batched_imgs): + pad_img[..., :img.shape[-2], :img.shape[-1]].copy_(img) + + if intrinsics is not None: + assert isinstance(intrinsics, (tuple, list)) + assert len(intrinsics) == len(tensors) + intrinsics = torch.stack(intrinsics, dim=0) + + if image_paths is not None: + assert len(image_paths) == len(tensors) + + return ImageList(batched_imgs.contiguous(), image_sizes, intrinsics, image_paths) diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/pose.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/pose.py new file mode 100644 index 0000000..2746f92 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/pose.py @@ -0,0 +1,164 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. 
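+#
+# Minimal usage sketch (illustrative names and values): composing two poses and
+# transforming an (N, 3) point cloud.
+#
+#   cam_from_body = Pose(wxyz=np.float32([1., 0., 0., 0.]), tvec=np.float32([0., 0., 0.3]))
+#   body_from_world = Pose.from_matrix(np.eye(4, dtype=np.float32))
+#   cam_from_world = cam_from_body * body_from_world
+#   X_cam = cam_from_world * np.random.rand(10, 3)  # -> (10, 3) points in the camera frame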
+import numpy as np +from pyquaternion import Quaternion + + +class Pose: + """SE(3) rigid transform class that allows compounding of 6-DOF poses + and provides common transformations that are commonly seen in geometric problems. + """ + def __init__(self, wxyz=np.float32([1., 0., 0., 0.]), tvec=np.float32([0., 0., 0.])): + """Initialize a Pose with Quaternion and 3D Position + + Parameters + ---------- + wxyz: np.float32 or Quaternion (default: np.float32([1,0,0,0])) + Quaternion/Rotation (wxyz) + + tvec: np.float32 (default: np.float32([0,0,0])) + Translation (xyz) + """ + assert isinstance(wxyz, (np.ndarray, Quaternion)) + assert isinstance(tvec, np.ndarray) + + if isinstance(wxyz, np.ndarray): + assert np.abs(1.0 - np.linalg.norm(wxyz)) < 1.0e-3 + + self.quat = Quaternion(wxyz) + self.tvec = tvec + + def __repr__(self): + formatter = {'float_kind': lambda x: '%.2f' % x} + tvec_str = np.array2string(self.tvec, formatter=formatter) + return 'wxyz: {}, tvec: ({})'.format(self.quat, tvec_str) + + def copy(self): + """Return a copy of this pose object. + + Returns + ---------- + result: Pose + Copied pose object. + """ + return self.__class__(Quaternion(self.quat), self.tvec.copy()) + + def __mul__(self, other): + """Left-multiply Pose with another Pose or 3D-Points. + + Parameters + ---------- + other: Pose or np.ndarray + 1. Pose: Identical to oplus operation. + (i.e. self_pose * other_pose) + 2. ndarray: transform [N x 3] point set + (i.e. X' = self_pose * X) + + Returns + ---------- + result: Pose or np.ndarray + Transformed pose or point cloud + """ + if isinstance(other, Pose): + assert isinstance(other, self.__class__) + t = self.quat.rotate(other.tvec) + self.tvec + q = self.quat * other.quat + return self.__class__(q, t) + elif isinstance(other, np.ndarray): + assert other.shape[-1] == 3, 'Point cloud is not 3-dimensional' + X = np.hstack([other, np.ones((len(other), 1))]).T + return (np.dot(self.matrix, X).T)[:, :3] + else: + return NotImplemented + + def __rmul__(self, other): + raise NotImplementedError('Right multiply not implemented yet!') + + def inverse(self): + """Returns a new Pose that corresponds to the + inverse of this one. + + Returns + ---------- + result: Pose + Inverted pose + """ + qinv = self.quat.inverse + return self.__class__(qinv, qinv.rotate(-self.tvec)) + + @property + def matrix(self): + """Returns a 4x4 homogeneous matrix of the form [R t; 0 1] + + Returns + ---------- + result: np.ndarray + 4x4 homogeneous matrix + """ + result = self.quat.transformation_matrix + result[:3, 3] = self.tvec + return result + + @property + def rotation_matrix(self): + """Returns the 3x3 rotation matrix (R) + + Returns + ---------- + result: np.ndarray + 3x3 rotation matrix + """ + result = self.quat.transformation_matrix + return result[:3, :3] + + @property + def rotation(self): + """Return the rotation component of the pose as a Quaternion object. + + Returns + ---------- + self.quat: Quaternion + Rotation component of the Pose object. + """ + return self.quat + + @property + def translation(self): + """Return the translation component of the pose as a np.ndarray. + + Returns + ---------- + self.tvec: np.ndarray + Translation component of the Pose object. 
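+
+ For example:
+
+ >>> Pose(tvec=np.float32([1., 2., 3.])).translation
+ array([1., 2., 3.], dtype=float32)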
+ """ + return self.tvec + + @classmethod + def from_matrix(cls, transformation_matrix): + """Initialize pose from 4x4 transformation matrix + + Parameters + ---------- + transformation_matrix: np.ndarray + 4x4 containing rotation/translation + + Returns + ------- + Pose + """ + return cls(wxyz=Quaternion(matrix=transformation_matrix[:3, :3]), tvec=np.float32(transformation_matrix[:3, 3])) + + @classmethod + def from_rotation_translation(cls, rotation_matrix, tvec): + """Initialize pose from rotation matrix and translation vector. + + Parameters + ---------- + rotation_matrix : np.ndarray + 3x3 rotation matrix + tvec : np.ndarray + length-3 translation vector + """ + return cls(wxyz=Quaternion(matrix=rotation_matrix), tvec=np.float64(tvec)) + + def __eq__(self, other): + return self.quat == other.quat and (self.tvec == other.tvec).all() diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/transform3d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/transform3d.py new file mode 100644 index 0000000..36133d0 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/structures/transform3d.py @@ -0,0 +1,896 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import math +import warnings +from typing import List, Optional, Union + +import torch + +Device = Union[str, torch.device] + + +def make_device(device: Device) -> torch.device: + """ + Makes an actual torch.device object from the device specified as + either a string or torch.device object. If the device is `cuda` without + a specific index, the index of the current device is assigned. + + Args: + device: Device (as str or torch.device) + + Returns: + A matching torch.device object + """ + device = torch.device(device) if isinstance(device, str) else device + if device.type == "cuda" and device.index is None: # pyre-ignore[16] + # If cuda but with no index, then the current cuda device is indicated. + # In that case, we fix to that device + device = torch.device(f"cuda:{torch.cuda.current_device()}") + return device + + +def get_device(x, device: Optional[Device] = None) -> torch.device: + """ + Gets the device of the specified variable x if it is a tensor, or + falls back to a default CPU device otherwise. Allows overriding by + providing an explicit device. + + Args: + x: a torch.Tensor to get the device from or another type + device: Device (as str or torch.device) to fall back to + + Returns: + A matching torch.device object + """ + + # User overrides device + if device is not None: + return make_device(device) + + # Set device based on input tensor + if torch.is_tensor(x): + return x.device + + # Default device is cpu + return torch.device("cpu") + + +def _safe_det_3x3(t: torch.Tensor): + """ + Fast determinant calculation for a batch of 3x3 matrices. + + Note, result of this function might not be the same as `torch.det()`. + The differences might be in the last significant digit. + + Args: + t: Tensor of shape (N, 3, 3). + + Returns: + Tensor of shape (N) with determinants. 
+ """ + + det = ( + t[..., 0, 0] * (t[..., 1, 1] * t[..., 2, 2] - t[..., 1, 2] * t[..., 2, 1]) + - t[..., 0, 1] * (t[..., 1, 0] * t[..., 2, 2] - t[..., 2, 0] * t[..., 1, 2]) + + t[..., 0, 2] * (t[..., 1, 0] * t[..., 2, 1] - t[..., 2, 0] * t[..., 1, 1]) + ) + + return det + +def _axis_angle_rotation(axis: str, angle: torch.Tensor) -> torch.Tensor: + """ + Return the rotation matrices for one of the rotations about an axis + of which Euler angles describe, for each value of the angle given. + + Args: + axis: Axis label "X" or "Y or "Z". + angle: any shape tensor of Euler angles in radians + + Returns: + Rotation matrices as tensor of shape (..., 3, 3). + """ + + cos = torch.cos(angle) + sin = torch.sin(angle) + one = torch.ones_like(angle) + zero = torch.zeros_like(angle) + + if axis == "X": + R_flat = (one, zero, zero, zero, cos, -sin, zero, sin, cos) + elif axis == "Y": + R_flat = (cos, zero, sin, zero, one, zero, -sin, zero, cos) + elif axis == "Z": + R_flat = (cos, -sin, zero, sin, cos, zero, zero, zero, one) + else: + raise ValueError("letter must be either X, Y or Z.") + + return torch.stack(R_flat, -1).reshape(angle.shape + (3, 3)) + +class Transform3d: + """ + A Transform3d object encapsulates a batch of N 3D transformations, and knows + how to transform points and normal vectors. Suppose that t is a Transform3d; + then we can do the following: + + .. code-block:: python + + N = len(t) + points = torch.randn(N, P, 3) + normals = torch.randn(N, P, 3) + points_transformed = t.transform_points(points) # => (N, P, 3) + normals_transformed = t.transform_normals(normals) # => (N, P, 3) + + + BROADCASTING + Transform3d objects supports broadcasting. Suppose that t1 and tN are + Transform3d objects with len(t1) == 1 and len(tN) == N respectively. Then we + can broadcast transforms like this: + + .. code-block:: python + + t1.transform_points(torch.randn(P, 3)) # => (P, 3) + t1.transform_points(torch.randn(1, P, 3)) # => (1, P, 3) + t1.transform_points(torch.randn(M, P, 3)) # => (M, P, 3) + tN.transform_points(torch.randn(P, 3)) # => (N, P, 3) + tN.transform_points(torch.randn(1, P, 3)) # => (N, P, 3) + + + COMBINING TRANSFORMS + Transform3d objects can be combined in two ways: composing and stacking. + Composing is function composition. Given Transform3d objects t1, t2, t3, + the following all compute the same thing: + + .. code-block:: python + + y1 = t3.transform_points(t2.transform_points(t1.transform_points(x))) + y2 = t1.compose(t2).compose(t3).transform_points(x) + y3 = t1.compose(t2, t3).transform_points(x) + + + Composing transforms should broadcast. + + .. code-block:: python + + if len(t1) == 1 and len(t2) == N, then len(t1.compose(t2)) == N. + + We can also stack a sequence of Transform3d objects, which represents + composition along the batch dimension; then the following should compute the + same thing. + + .. code-block:: python + + N, M = len(tN), len(tM) + xN = torch.randn(N, P, 3) + xM = torch.randn(M, P, 3) + y1 = torch.cat([tN.transform_points(xN), tM.transform_points(xM)], dim=0) + y2 = tN.stack(tM).transform_points(torch.cat([xN, xM], dim=0)) + + BUILDING TRANSFORMS + We provide convenience methods for easily building Transform3d objects + as compositions of basic transforms. + + .. 
code-block:: python + + # Scale by 0.5, then translate by (1, 2, 3) + t1 = Transform3d().scale(0.5).translate(1, 2, 3) + + # Scale each axis by a different amount, then translate, then scale + t2 = Transform3d().scale(1, 3, 3).translate(2, 3, 1).scale(2.0) + + t3 = t1.compose(t2) + tN = t1.stack(t3, t3) + + + BACKPROP THROUGH TRANSFORMS + When building transforms, we can also parameterize them by Torch tensors; + in this case we can backprop through the construction and application of + Transform objects, so they could be learned via gradient descent or + predicted by a neural network. + + .. code-block:: python + + s1_params = torch.randn(N, requires_grad=True) + t_params = torch.randn(N, 3, requires_grad=True) + s2_params = torch.randn(N, 3, requires_grad=True) + + t = Transform3d().scale(s1_params).translate(t_params).scale(s2_params) + x = torch.randn(N, 3) + y = t.transform_points(x) + loss = compute_loss(y) + loss.backward() + + with torch.no_grad(): + s1_params -= lr * s1_params.grad + t_params -= lr * t_params.grad + s2_params -= lr * s2_params.grad + + CONVENTIONS + We adopt a right-hand coordinate system, meaning that rotation about an axis + with a positive angle results in a counter clockwise rotation. + + This class assumes that transformations are applied on inputs which + are row vectors. The internal representation of the Nx4x4 transformation + matrix is of the form: + + .. code-block:: python + + M = [ + [Rxx, Ryx, Rzx, 0], + [Rxy, Ryy, Rzy, 0], + [Rxz, Ryz, Rzz, 0], + [Tx, Ty, Tz, 1], + ] + + To apply the transformation to points which are row vectors, the M matrix + can be pre multiplied by the points: + + .. code-block:: python + + points = [[0, 1, 2]] # (1 x 3) xyz coordinates of a point + transformed_points = points * M + + """ + + def __init__( + self, + dtype: torch.dtype = torch.float32, + device: Device = "cpu", + matrix: Optional[torch.Tensor] = None, + ) -> None: + """ + Args: + dtype: The data type of the transformation matrix. + to be used if `matrix = None`. + device: The device for storing the implemented transformation. + If `matrix != None`, uses the device of input `matrix`. + matrix: A tensor of shape (4, 4) or of shape (minibatch, 4, 4) + representing the 4x4 3D transformation matrix. + If `None`, initializes with identity using + the specified `device` and `dtype`. + """ + + if matrix is None: + self._matrix = torch.eye(4, dtype=dtype, device=device).view(1, 4, 4) + else: + if matrix.ndim not in (2, 3): + raise ValueError('"matrix" has to be a 2- or a 3-dimensional tensor.') + if matrix.shape[-2] != 4 or matrix.shape[-1] != 4: + raise ValueError( + '"matrix" has to be a tensor of shape (minibatch, 4, 4)' + ) + # set dtype and device from matrix + dtype = matrix.dtype + device = matrix.device + self._matrix = matrix.view(-1, 4, 4) + + self._transforms = [] # store transforms to compose + self._lu = None + self.device = make_device(device) + self.dtype = dtype + + def __len__(self) -> int: + return self.get_matrix().shape[0] + + def __getitem__( + self, index: Union[int, List[int], slice, torch.Tensor] + ) -> "Transform3d": + """ + Args: + index: Specifying the index of the transform to retrieve. + Can be an int, slice, list of ints, boolean, long tensor. + Supports negative indices. + + Returns: + Transform3d object with selected transforms. The tensors are not cloned. 
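+
+ Example (illustrative; indexing into a batch of 5 identity transforms):
+
+ >>> t = Transform3d(matrix=torch.eye(4).repeat(5, 1, 1))
+ >>> len(t[2:4])
+ 2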
+ """ + if isinstance(index, int): + index = [index] + return self.__class__(matrix=self.get_matrix()[index]) + + def compose(self, *others: "Transform3d") -> "Transform3d": + """ + Return a new Transform3d representing the composition of self with the + given other transforms, which will be stored as an internal list. + + Args: + *others: Any number of Transform3d objects + + Returns: + A new Transform3d with the stored transforms + """ + out = Transform3d(dtype=self.dtype, device=self.device) + out._matrix = self._matrix.clone() + for other in others: + if not isinstance(other, Transform3d): + msg = "Only possible to compose Transform3d objects; got %s" + raise ValueError(msg % type(other)) + out._transforms = self._transforms + list(others) + return out + + def get_matrix(self) -> torch.Tensor: + """ + Return a matrix which is the result of composing this transform + with others stored in self.transforms. Where necessary transforms + are broadcast against each other. + For example, if self.transforms contains transforms t1, t2, and t3, and + given a set of points x, the following should be true: + + .. code-block:: python + + y1 = t1.compose(t2, t3).transform(x) + y2 = t3.transform(t2.transform(t1.transform(x))) + y1.get_matrix() == y2.get_matrix() + + Returns: + A transformation matrix representing the composed inputs. + """ + composed_matrix = self._matrix.clone() + if len(self._transforms) > 0: + for other in self._transforms: + other_matrix = other.get_matrix() + composed_matrix = _broadcast_bmm(composed_matrix, other_matrix) + return composed_matrix + + def _get_matrix_inverse(self) -> torch.Tensor: + """ + Return the inverse of self._matrix. + """ + return torch.inverse(self._matrix) + + def inverse(self, invert_composed: bool = False) -> "Transform3d": + """ + Returns a new Transform3d object that represents an inverse of the + current transformation. + + Args: + invert_composed: + - True: First compose the list of stored transformations + and then apply inverse to the result. This is + potentially slower for classes of transformations + with inverses that can be computed efficiently + (e.g. rotations and translations). + - False: Invert the individual stored transformations + independently without composing them. + + Returns: + A new Transform3d object containing the inverse of the original + transformation. + """ + + tinv = Transform3d(dtype=self.dtype, device=self.device) + + if invert_composed: + # first compose then invert + tinv._matrix = torch.inverse(self.get_matrix()) + else: + # self._get_matrix_inverse() implements efficient inverse + # of self._matrix + i_matrix = self._get_matrix_inverse() + + # 2 cases: + if len(self._transforms) > 0: + # a) Either we have a non-empty list of transforms: + # Here we take self._matrix and append its inverse at the + # end of the reverted _transforms list. After composing + # the transformations with get_matrix(), this correctly + # right-multiplies by the inverse of self._matrix + # at the end of the composition. + tinv._transforms = [t.inverse() for t in reversed(self._transforms)] + last = Transform3d(dtype=self.dtype, device=self.device) + last._matrix = i_matrix + tinv._transforms.append(last) + else: + # b) Or there are no stored transformations + # we just set inverted matrix + tinv._matrix = i_matrix + + return tinv + + def stack(self, *others: "Transform3d") -> "Transform3d": + """ + Return a new batched Transform3d representing the batch elements from + self and all the given other transforms all batched together. 
+ + Args: + *others: Any number of Transform3d objects + + Returns: + A new Transform3d. + """ + transforms = [self] + list(others) + matrix = torch.cat([t.get_matrix() for t in transforms], dim=0) + out = Transform3d(dtype=self.dtype, device=self.device) + out._matrix = matrix + return out + + def transform_points(self, points, eps: Optional[float] = None) -> torch.Tensor: + """ + Use this transform to transform a set of 3D points. Assumes row major + ordering of the input points. + + Args: + points: Tensor of shape (P, 3) or (N, P, 3) + eps: If eps!=None, the argument is used to clamp the + last coordinate before performing the final division. + The clamping corresponds to: + last_coord := (last_coord.sign() + (last_coord==0)) * + torch.clamp(last_coord.abs(), eps), + i.e. the last coordinates that are exactly 0 will + be clamped to +eps. + + Returns: + points_out: points of shape (N, P, 3) or (P, 3) depending + on the dimensions of the transform + """ + points_batch = points.clone() + if points_batch.dim() == 2: + points_batch = points_batch[None] # (P, 3) -> (1, P, 3) + if points_batch.dim() != 3: + msg = "Expected points to have dim = 2 or dim = 3: got shape %r" + raise ValueError(msg % repr(points.shape)) + + N, P, _3 = points_batch.shape + ones = torch.ones(N, P, 1, dtype=points.dtype, device=points.device) + points_batch = torch.cat([points_batch, ones], dim=2) + + composed_matrix = self.get_matrix() + points_out = _broadcast_bmm(points_batch, composed_matrix) + denom = points_out[..., 3:] # denominator + if eps is not None: + denom_sign = denom.sign() + (denom == 0.0).type_as(denom) + denom = denom_sign * torch.clamp(denom.abs(), eps) + points_out = points_out[..., :3] / denom + + # When transform is (1, 4, 4) and points is (P, 3) return + # points_out of shape (P, 3) + if points_out.shape[0] == 1 and points.dim() == 2: + points_out = points_out.reshape(points.shape) + + return points_out + + def transform_normals(self, normals) -> torch.Tensor: + """ + Use this transform to transform a set of normal vectors. + + Args: + normals: Tensor of shape (P, 3) or (N, P, 3) + + Returns: + normals_out: Tensor of shape (P, 3) or (N, P, 3) depending + on the dimensions of the transform + """ + if normals.dim() not in [2, 3]: + msg = "Expected normals to have dim = 2 or dim = 3: got shape %r" + raise ValueError(msg % (normals.shape,)) + composed_matrix = self.get_matrix() + + # TODO: inverse is bad! Solve a linear system instead + mat = composed_matrix[:, :3, :3] + normals_out = _broadcast_bmm(normals, mat.transpose(1, 2).inverse()) + + # This doesn't pass unit tests. 
TODO investigate further + # if self._lu is None: + # self._lu = self._matrix[:, :3, :3].transpose(1, 2).lu() + # normals_out = normals.lu_solve(*self._lu) + + # When transform is (1, 4, 4) and normals is (P, 3) return + # normals_out of shape (P, 3) + if normals_out.shape[0] == 1 and normals.dim() == 2: + normals_out = normals_out.reshape(normals.shape) + + return normals_out + + def translate(self, *args, **kwargs) -> "Transform3d": + return self.compose( + Translate(device=self.device, dtype=self.dtype, *args, **kwargs) + ) + + def scale(self, *args, **kwargs) -> "Transform3d": + return self.compose( + Scale(device=self.device, dtype=self.dtype, *args, **kwargs) + ) + + def rotate(self, *args, **kwargs) -> "Transform3d": + return self.compose( + Rotate(device=self.device, dtype=self.dtype, *args, **kwargs) + ) + + def rotate_axis_angle(self, *args, **kwargs) -> "Transform3d": + return self.compose( + RotateAxisAngle(device=self.device, dtype=self.dtype, *args, **kwargs) + ) + + def clone(self) -> "Transform3d": + """ + Deep copy of Transforms object. All internal tensors are cloned + individually. + + Returns: + new Transforms object. + """ + other = Transform3d(dtype=self.dtype, device=self.device) + if self._lu is not None: + other._lu = [elem.clone() for elem in self._lu] + other._matrix = self._matrix.clone() + other._transforms = [t.clone() for t in self._transforms] + return other + + def to( + self, + device: Device, + copy: bool = False, + dtype: Optional[torch.dtype] = None, + ) -> "Transform3d": + """ + Match functionality of torch.Tensor.to() + If copy = True or the self Tensor is on a different device, the + returned tensor is a copy of self with the desired torch.device. + If copy = False and the self Tensor already has the correct torch.device, + then self is returned. + + Args: + device: Device (as str or torch.device) for the new tensor. + copy: Boolean indicator whether or not to clone self. Default False. + dtype: If not None, casts the internal tensor variables + to a given torch.dtype. + + Returns: + Transform3d object. + """ + device_ = make_device(device) + dtype_ = self.dtype if dtype is None else dtype + skip_to = self.device == device_ and self.dtype == dtype_ + + if not copy and skip_to: + return self + + other = self.clone() + + if skip_to: + return other + + other.device = device_ + other.dtype = dtype_ + other._matrix = other._matrix.to(device=device_, dtype=dtype_) + other._transforms = [ + t.to(device_, copy=copy, dtype=dtype_) for t in other._transforms + ] + return other + + def cpu(self) -> "Transform3d": + return self.to("cpu") + + def cuda(self) -> "Transform3d": + return self.to("cuda") + + +class Translate(Transform3d): + def __init__( + self, + x, + y=None, + z=None, + dtype: torch.dtype = torch.float32, + device: Optional[Device] = None, + ) -> None: + """ + Create a new Transform3d representing 3D translations. + + Option I: Translate(xyz, dtype=torch.float32, device='cpu') + xyz should be a tensor of shape (N, 3) + + Option II: Translate(x, y, z, dtype=torch.float32, device='cpu') + Here x, y, and z will be broadcast against each other and + concatenated to form the translation. 
Each can be: + - A python scalar + - A torch scalar + - A 1D torch tensor + """ + xyz = _handle_input(x, y, z, dtype, device, "Translate") + super().__init__(device=xyz.device, dtype=dtype) + N = xyz.shape[0] + + mat = torch.eye(4, dtype=dtype, device=self.device) + mat = mat.view(1, 4, 4).repeat(N, 1, 1) + mat[:, 3, :3] = xyz + self._matrix = mat + + def _get_matrix_inverse(self) -> torch.Tensor: + """ + Return the inverse of self._matrix. + """ + inv_mask = self._matrix.new_ones([1, 4, 4]) + inv_mask[0, 3, :3] = -1.0 + i_matrix = self._matrix * inv_mask + return i_matrix + + +class Scale(Transform3d): + def __init__( + self, + x, + y=None, + z=None, + dtype: torch.dtype = torch.float32, + device: Optional[Device] = None, + ) -> None: + """ + A Transform3d representing a scaling operation, with different scale + factors along each coordinate axis. + + Option I: Scale(s, dtype=torch.float32, device='cpu') + s can be one of + - Python scalar or torch scalar: Single uniform scale + - 1D torch tensor of shape (N,): A batch of uniform scale + - 2D torch tensor of shape (N, 3): Scale differently along each axis + + Option II: Scale(x, y, z, dtype=torch.float32, device='cpu') + Each of x, y, and z can be one of + - python scalar + - torch scalar + - 1D torch tensor + """ + xyz = _handle_input(x, y, z, dtype, device, "scale", allow_singleton=True) + super().__init__(device=xyz.device, dtype=dtype) + N = xyz.shape[0] + + # TODO: Can we do this all in one go somehow? + mat = torch.eye(4, dtype=dtype, device=self.device) + mat = mat.view(1, 4, 4).repeat(N, 1, 1) + mat[:, 0, 0] = xyz[:, 0] + mat[:, 1, 1] = xyz[:, 1] + mat[:, 2, 2] = xyz[:, 2] + self._matrix = mat + + def _get_matrix_inverse(self) -> torch.Tensor: + """ + Return the inverse of self._matrix. + """ + xyz = torch.stack([self._matrix[:, i, i] for i in range(4)], dim=1) + ixyz = 1.0 / xyz + imat = torch.diag_embed(ixyz, dim1=1, dim2=2) + return imat + + +class Rotate(Transform3d): + def __init__( + self, + R: torch.Tensor, + dtype: torch.dtype = torch.float32, + device: Optional[Device] = None, + orthogonal_tol: float = 1e-5, + ) -> None: + """ + Create a new Transform3d representing 3D rotation using a rotation + matrix as the input. + + Args: + R: a tensor of shape (3, 3) or (N, 3, 3) + orthogonal_tol: tolerance for the test of the orthogonality of R + + """ + device_ = get_device(R, device) + super().__init__(device=device_, dtype=dtype) + if R.dim() == 2: + R = R[None] + if R.shape[-2:] != (3, 3): + msg = "R must have shape (3, 3) or (N, 3, 3); got %s" + raise ValueError(msg % repr(R.shape)) + R = R.to(device=device_, dtype=dtype) + _check_valid_rotation_matrix(R, tol=orthogonal_tol) + N = R.shape[0] + mat = torch.eye(4, dtype=dtype, device=device_) + mat = mat.view(1, 4, 4).repeat(N, 1, 1) + mat[:, :3, :3] = R + self._matrix = mat + + def _get_matrix_inverse(self) -> torch.Tensor: + """ + Return the inverse of self._matrix. + """ + return self._matrix.permute(0, 2, 1).contiguous() + + +class RotateAxisAngle(Rotate): + def __init__( + self, + angle, + axis: str = "X", + degrees: bool = True, + dtype: torch.dtype = torch.float32, + device: Optional[Device] = None, + ) -> None: + """ + Create a new Transform3d representing 3D rotation about an axis + by an angle. + + Assuming a right-hand coordinate system, positive rotation angles result + in a counter clockwise rotation. 
+ + Args: + angle: + - A torch tensor of shape (N,) + - A python scalar + - A torch scalar + axis: + string: one of ["X", "Y", "Z"] indicating the axis about which + to rotate. + NOTE: All batch elements are rotated about the same axis. + """ + axis = axis.upper() + if axis not in ["X", "Y", "Z"]: + msg = "Expected axis to be one of ['X', 'Y', 'Z']; got %s" + raise ValueError(msg % axis) + angle = _handle_angle_input(angle, dtype, device, "RotateAxisAngle") + angle = (angle / 180.0 * math.pi) if degrees else angle + # We assume the points on which this transformation will be applied + # are row vectors. The rotation matrix returned from _axis_angle_rotation + # is for transforming column vectors. Therefore we transpose this matrix. + # R will always be of shape (N, 3, 3) + R = _axis_angle_rotation(axis, angle).transpose(1, 2) + super().__init__(device=angle.device, R=R, dtype=dtype) + + +def _handle_coord(c, dtype: torch.dtype, device: torch.device) -> torch.Tensor: + """ + Helper function for _handle_input. + + Args: + c: Python scalar, torch scalar, or 1D torch tensor + + Returns: + c_vec: 1D torch tensor + """ + if not torch.is_tensor(c): + c = torch.tensor(c, dtype=dtype, device=device) + if c.dim() == 0: + c = c.view(1) + if c.device != device or c.dtype != dtype: + c = c.to(device=device, dtype=dtype) + return c + + +def _handle_input( + x, + y, + z, + dtype: torch.dtype, + device: Optional[Device], + name: str, + allow_singleton: bool = False, +) -> torch.Tensor: + """ + Helper function to handle parsing logic for building transforms. The output + is always a tensor of shape (N, 3), but there are several types of allowed + input. + + Case I: Single Matrix + In this case x is a tensor of shape (N, 3), and y and z are None. Here just + return x. + + Case II: Vectors and Scalars + In this case each of x, y, and z can be one of the following + - Python scalar + - Torch scalar + - Torch tensor of shape (N, 1) or (1, 1) + In this case x, y and z are broadcast to tensors of shape (N, 1) + and concatenated to a tensor of shape (N, 3) + + Case III: Singleton (only if allow_singleton=True) + In this case y and z are None, and x can be one of the following: + - Python scalar + - Torch scalar + - Torch tensor of shape (N, 1) or (1, 1) + Here x will be duplicated 3 times, and we return a tensor of shape (N, 3) + + Returns: + xyz: Tensor of shape (N, 3) + """ + device_ = get_device(x, device) + # If x is actually a tensor of shape (N, 3) then just return it + if torch.is_tensor(x) and x.dim() == 2: + if x.shape[1] != 3: + msg = "Expected tensor of shape (N, 3); got %r (in %s)" + raise ValueError(msg % (x.shape, name)) + if y is not None or z is not None: + msg = "Expected y and z to be None (in %s)" % name + raise ValueError(msg) + return x.to(device=device_, dtype=dtype) + + if allow_singleton and y is None and z is None: + y = x + z = x + + # Convert all to 1D tensors + xyz = [_handle_coord(c, dtype, device_) for c in [x, y, z]] + + # Broadcast and concatenate + sizes = [c.shape[0] for c in xyz] + N = max(sizes) + for c in xyz: + if c.shape[0] != 1 and c.shape[0] != N: + msg = "Got non-broadcastable sizes %r (in %s)" % (sizes, name) + raise ValueError(msg) + xyz = [c.expand(N) for c in xyz] + xyz = torch.stack(xyz, dim=1) + return xyz + + +def _handle_angle_input( + x, dtype: torch.dtype, device: Optional[Device], name: str +) -> torch.Tensor: + """ + Helper function for building a rotation function using angles. + The output is always of shape (N,). 
+ + The input can be one of: + - Torch tensor of shape (N,) + - Python scalar + - Torch scalar + """ + device_ = get_device(x, device) + if torch.is_tensor(x) and x.dim() > 1: + msg = "Expected tensor of shape (N,); got %r (in %s)" + raise ValueError(msg % (x.shape, name)) + else: + return _handle_coord(x, dtype, device_) + + +def _broadcast_bmm(a, b) -> torch.Tensor: + """ + Batch multiply two matrices and broadcast if necessary. + + Args: + a: torch tensor of shape (P, K) or (M, P, K) + b: torch tensor of shape (N, K, K) + + Returns: + a and b broadcast multiplied. The output batch dimension is max(N, M). + + To broadcast transforms across a batch dimension if M != N then + expect that either M = 1 or N = 1. The tensor with batch dimension 1 is + expanded to have shape N or M. + """ + if a.dim() == 2: + a = a[None] + if len(a) != len(b): + if not ((len(a) == 1) or (len(b) == 1)): + msg = "Expected batch dim for bmm to be equal or 1; got %r, %r" + raise ValueError(msg % (a.shape, b.shape)) + if len(a) == 1: + a = a.expand(len(b), -1, -1) + if len(b) == 1: + b = b.expand(len(a), -1, -1) + return a.bmm(b) + + +@torch.no_grad() +def _check_valid_rotation_matrix(R, tol: float = 1e-7) -> None: + """ + Determine if R is a valid rotation matrix by checking it satisfies the + following conditions: + + ``RR^T = I and det(R) = 1`` + + Args: + R: an (N, 3, 3) matrix + + Returns: + None + + Emits a warning if R is an invalid rotation matrix. + """ + N = R.shape[0] + eye = torch.eye(3, dtype=R.dtype, device=R.device) + eye = eye.view(1, 3, 3).expand(N, -1, -1) + orthogonal = torch.allclose(R.bmm(R.transpose(1, 2)), eye, atol=tol) + det_R = _safe_det_3x3(R) + no_distortion = torch.allclose(det_R, torch.ones_like(det_R)) + if not (orthogonal and no_distortion): + msg = "R is not a valid rotation matrix" + warnings.warn(msg) + return diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/comm.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/comm.py new file mode 100644 index 0000000..77f3bdb --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/comm.py @@ -0,0 +1,105 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +import logging +from functools import wraps + +import torch.distributed as dist + +LOG = logging.getLogger(__name__) + +_NESTED_BROADCAST_FROM_MASTER = False + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + +def is_distributed(): + return get_world_size() > 1 + + +def broadcast_from_master(fn): + """If distributed, only the master executes the function and broadcast the results to other workers. + + Usage: + @broadcast_from_master + def foo(a, b): ... + """ + @wraps(fn) + def wrapper(*args, **kwargs): # pylint: disable=unused-argument + global _NESTED_BROADCAST_FROM_MASTER + + if not is_distributed(): + return fn(*args, **kwargs) + + if _NESTED_BROADCAST_FROM_MASTER: + assert d2_comm.is_main_process() + LOG.warning(f"_NESTED_BROADCAST_FROM_MASTER = True, {fn.__name__}") + return fn(*args, **kwargs) + + if d2_comm.is_main_process(): + _NESTED_BROADCAST_FROM_MASTER = True + ret = [fn(*args, **kwargs), ] + _NESTED_BROADCAST_FROM_MASTER = False + else: + ret = [None, ] + if dist.is_initialized(): + dist.broadcast_object_list(ret) + ret = ret[0] + + assert ret is not None + return ret + + return wrapper + + +def master_only(fn): + """If distributed, only the master executes the function. + + Usage: + @master_only + def foo(a, b): ... 
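+
+ Note: this decorator, like `broadcast_from_master` above and `gather_dict`
+ below, relies on a detectron2-style communication helper being available in
+ this module under the name `d2_comm` (providing `is_main_process`,
+ `synchronize` and `gather`).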
+ """ + @wraps(fn) + def wrapped_fn(*args, **kwargs): + if d2_comm.is_main_process(): + ret = fn(*args, **kwargs) + d2_comm.synchronize() + if d2_comm.is_main_process(): + return ret + + return wrapped_fn + + +def gather_dict(dikt): + """Gather python dictionaries from all workers to the rank=0 worker. + + Assumption: the keys of `dikt` are disjoint across all workers. + + If rank = 0, then returned aggregated dict. + If rank > 0, then return `None`. + """ + dict_lst = d2_comm.gather(dikt, dst=0) + if d2_comm.is_main_process(): + gathered_dict = {} + for dic in dict_lst: + for k in dic.keys(): + assert k not in gathered_dict, f"Dictionary key overlaps: {k}" + gathered_dict.update(dic) + return gathered_dict + else: + return None + + +def reduce_sum(tensor): + """ + Adapted from AdelaiDet: + https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py + """ + if not is_distributed(): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor, op=dist.ReduceOp.SUM) + return tensor diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/geometry.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/geometry.py new file mode 100644 index 0000000..d8f546b --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/geometry.py @@ -0,0 +1,204 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +import logging + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F + +LOG = logging.getLogger(__name__) + +PI = 3.14159265358979323846 +EPS = 1e-7 + +def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor: + """ + Returns torch.sqrt(torch.max(0, x)) + but with a zero subgradient where x is 0. + """ + ret = torch.zeros_like(x) + positive_mask = x > 0 + ret[positive_mask] = torch.sqrt(x[positive_mask]) + return ret + +def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor: + """ + Convert rotations given as rotation matrices to quaternions. + + Args: + matrix: Rotation matrices as tensor of shape (..., 3, 3). + + Returns: + quaternions with real part first, as tensor of shape (..., 4). + """ + if matrix.size(-1) != 3 or matrix.size(-2) != 3: + raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.") + + batch_dim = matrix.shape[:-2] + m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind( + matrix.reshape(batch_dim + (9,)), dim=-1 + ) + + q_abs = _sqrt_positive_part( + torch.stack( + [ + 1.0 + m00 + m11 + m22, + 1.0 + m00 - m11 - m22, + 1.0 - m00 + m11 - m22, + 1.0 - m00 - m11 + m22, + ], + dim=-1, + ) + ) + + # we produce the desired quaternion multiplied by each of r, i, j, k + quat_by_rijk = torch.stack( + [ + torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1), + torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1), + torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1), + torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1), + ], + dim=-2, + ) + + # We floor here at 0.1 but the exact level is not important; if q_abs is small, + # the candidate won't be picked. 
+ flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device) + quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr)) + + # if not for numerical problems, quat_candidates[i] should be same (up to a sign), + # forall i; we pick the best-conditioned one (with the largest denominator) + + return quat_candidates[ + F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, : # pyre-ignore[16] + ].reshape(batch_dim + (4,)) + +def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor: + """ + Convert rotations given as quaternions to rotation matrices. + + Args: + quaternions: quaternions with real part first, + as tensor of shape (..., 4). + + Returns: + Rotation matrices as tensor of shape (..., 3, 3). + """ + r, i, j, k = torch.unbind(quaternions, -1) + two_s = 2.0 / (quaternions * quaternions).sum(-1) + + o = torch.stack( + ( + 1 - two_s * (j * j + k * k), + two_s * (i * j - k * r), + two_s * (i * k + j * r), + two_s * (i * j + k * r), + 1 - two_s * (i * i + k * k), + two_s * (j * k - i * r), + two_s * (i * k - j * r), + two_s * (j * k + i * r), + 1 - two_s * (i * i + j * j), + ), + -1, + ) + return o.reshape(quaternions.shape[:-1] + (3, 3)) + +def allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics): + """ + Parameters + ---------- + quat: Tensor + (N, 4). Batch of (allocentric) quaternions. + + proj_ctr: Tensor + (N, 2). Projected centers. xy coordninates. + + inv_intrinsics: [type] + (N, 3, 3). Inverted intrinsics. + """ + R_obj_to_local = quaternion_to_matrix(quat) + + # ray == z-axis in local orientaion + ray = unproject_points2d(proj_ctr, inv_intrinsics) + z = ray / ray.norm(dim=1, keepdim=True) + + # gram-schmit process: local_y = global_y - global_y \dot local_z + y = z.new_tensor([[0., 1., 0.]]) - z[:, 1:2] * z + y = y / y.norm(dim=1, keepdim=True) + x = torch.cross(y, z, dim=1) + + # local -> global + R_local_to_global = torch.stack([x, y, z], dim=-1) + + # obj -> global + R_obj_to_global = torch.bmm(R_local_to_global, R_obj_to_local) + + egocentric_quat = matrix_to_quaternion(R_obj_to_global) + + # Make sure it's unit norm. + quat_norm = egocentric_quat.norm(dim=1, keepdim=True) + if not torch.allclose(quat_norm, torch.as_tensor(1.), atol=1e-3): + LOG.warning( + f"Some of the input quaternions are not unit norm: min={quat_norm.min()}, max={quat_norm.max()}; therefore normalizing." + ) + egocentric_quat = egocentric_quat / quat_norm.clamp(min=EPS) + + return egocentric_quat + + +def homogenize_points(xy): + """ + Parameters + ---------- + xy: Tensor + xy coordinates. shape=(N, ..., 2) + E.g., (N, 2) or (N, K, 2) or (N, H, W, 2) + + Returns + ------- + Tensor: + 1. is appended to the last dimension. shape=(N, ..., 3) + E.g, (N, 3) or (N, K, 3) or (N, H, W, 3). + """ + # NOTE: this seems to work for arbitrary number of dimensions of input + pad = torch.nn.ConstantPad1d(padding=(0, 1), value=1.) + return pad(xy) + + +def project_points3d(Xw, K): + _, C = Xw.shape + assert C == 3 + uv, _ = cv2.projectPoints( + Xw, np.zeros((3, 1), dtype=np.float32), np.zeros(3, dtype=np.float32), K, np.zeros(5, dtype=np.float32) + ) + return uv.reshape(-1, 2) + + +def unproject_points2d(points2d, inv_K, scale=1.0): + """ + Parameters + ---------- + points2d: Tensor + xy coordinates. shape=(N, ..., 2) + E.g., (N, 2) or (N, K, 2) or (N, H, W, 2) + + inv_K: Tensor + Inverted intrinsics; shape=(N, 3, 3) + + scale: float, default: 1.0 + Scaling factor. + + Returns + ------- + Tensor: + Unprojected 3D point. 
shape=(N, ..., 3) + E.g., (N, 3) or (N, K, 3) or (N, H, W, 3) + """ + points2d = homogenize_points(points2d) + siz = points2d.size() + points2d = points2d.view(-1, 3).unsqueeze(-1) # (N, 3, 1) + unprojected = torch.matmul(inv_K, points2d) # (N, 3, 3) x (N, 3, 1) -> (N, 3, 1) + unprojected = unprojected.view(siz) + + return unprojected * scale diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tasks.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tasks.py new file mode 100644 index 0000000..997fbb3 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tasks.py @@ -0,0 +1,97 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +from collections import OrderedDict + +# from detectron2.config import configurable + + +class Task(): + def __init__(self, name, is_detection_task, is_dense_prediction_task): + self.name = name + self.is_detection_task = is_detection_task + self.is_dense_prediction_task = is_dense_prediction_task + + +# yapf: disable +TASKS = [ + Task( + name="box2d", + is_detection_task=True, + is_dense_prediction_task=False, + ), + Task( + name="box3d", + is_detection_task=True, + is_dense_prediction_task=False, + ), + Task( + name="depth", + is_detection_task=False, + is_dense_prediction_task=True, + ) +] +# yapf: enable + +NAME_TO_TASK = OrderedDict([(task.name, task) for task in TASKS]) + + +class TaskManager(): + #@configurable + def __init__(self, box2d_on=False, box3d_on=False, depth_on=False): + """ + configurable is experimental. + """ + self._box2d_on = self._mask2d_on = self._box3d_on = self._semseg2d_on = self._depth_on = False + tasks = [] + if box2d_on: + tasks.append(NAME_TO_TASK['box2d']) + self._box2d_on = True + if box3d_on: + tasks.append(NAME_TO_TASK['box3d']) + self._box3d_on = True + if depth_on: + tasks.append(NAME_TO_TASK['depth']) + self._depth_on = True + + if not tasks: + raise ValueError("No task specified.") + + self._tasks = tasks + + @property + def tasks(self): + return self._tasks + + '''@classmethod + def from_config(cls, cfg): + # yapf: disable + return OrderedDict( + box2d_on = cfg.MODEL.BOX2D_ON, + box3d_on = cfg.MODEL.BOX3D_ON, + depth_on = cfg.MODEL.DEPTH_ON, + ) + # yapf: enable''' + + # Indicators that tells if each task is enabled. + @property + def box2d_on(self): + return self._box2d_on + + @property + def box3d_on(self): + return self._box3d_on + + @property + def depth_on(self): + return self._depth_on + + @property + def has_dense_prediction_task(self): + return any([task.is_dense_prediction_task for task in self.tasks]) + + @property + def has_detection_task(self): + return any([task.is_detection_task for task in self.tasks]) + + @property + def task_names(self): + return [task.name for task in self.tasks] diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tensor2d.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tensor2d.py new file mode 100644 index 0000000..2922567 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/tensor2d.py @@ -0,0 +1,47 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +import torch +import torch.nn.functional as F + + +def compute_features_locations(h, w, stride, dtype=torch.float32, device='cpu', offset="none"): + """Adapted from AdelaiDet: + https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py + + Key differnece: offset is configurable. 
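+
+ Example (a 2x2 feature map with stride 4 and the default offset="none"):
+
+ >>> compute_features_locations(2, 2, stride=4)
+ tensor([[0., 0.],
+ [4., 0.],
+ [0., 4.],
+ [4., 4.]])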
+ """ + shifts_x = torch.arange(0, w * stride, step=stride, dtype=dtype, device=device) + shifts_y = torch.arange(0, h * stride, step=stride, dtype=dtype, device=device) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + # (dennis.park) + # locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2 + locations = torch.stack((shift_x, shift_y), dim=1) + if offset == "half": + locations += stride // 2 + else: + assert offset == "none" + + return locations + + +def aligned_bilinear(tensor, factor, offset="none"): + """Adapted from AdelaiDet: + https://github.com/aim-uofa/AdelaiDet/blob/master/adet/utils/comm.py + """ + assert tensor.dim() == 4 + assert factor >= 1 + assert int(factor) == factor + + if factor == 1: + return tensor + + h, w = tensor.size()[2:] + tensor = F.pad(tensor, pad=(0, 1, 0, 1), mode="replicate") + oh = factor * h + 1 + ow = factor * w + 1 + tensor = F.interpolate(tensor, size=(oh, ow), mode='bilinear', align_corners=True) + if offset == "half": + tensor = F.pad(tensor, pad=(factor // 2, 0, factor // 2, 0), mode="replicate") + + return tensor[:, :, :oh - 1, :ow - 1] diff --git a/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/visualization.py b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/visualization.py new file mode 100644 index 0000000..71e78b1 --- /dev/null +++ b/adzoo/bevformer/mmdet3d_plugin/dd3d/utils/visualization.py @@ -0,0 +1,147 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# Copyright 2021 Toyota Research Institute. All rights reserved. +import colorsys +import os + +import cv2 +import matplotlib.colors as mplc +import numpy as np +from PIL import Image, ImageDraw + + +def fill_color_polygon(image, polygon, color, alpha=0.5): + """Color interior of polygon with alpha-blending. This function modified input in place. + """ + _mask = Image.new('L', (image.shape[1], image.shape[0]), 0) + ImageDraw.Draw(_mask).polygon(polygon, outline=1, fill=1) + mask = np.array(_mask, np.bool) + for c in range(3): + channel = image[:, :, c] + channel[mask] = channel[mask] * (1. - alpha) + color[c] * alpha + + +def change_color_brightness(color, brightness_factor): + """ + Copied from detectron2.utils.visualizer.py + ------------------------------------------- + + Depending on the brightness_factor, gives a lighter or darker color i.e. a color with + less or more saturation than the original color. + + Args: + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of + 0 will correspond to no change, a factor in [-1.0, 0) range will result in + a darker color and a factor in (0, 1.0] range will result in a lighter color. + + Returns: + modified_color (tuple[double]): a tuple containing the RGB values of the + modified color. Each value in the tuple is in the [0.0, 1.0] range. 
+ """ + assert brightness_factor >= -1.0 and brightness_factor <= 1.0 + color = mplc.to_rgb(color) + polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) + modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) + modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness + modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness + modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) + return modified_color + + +def draw_text(ax, text, position, *, font_size, color="g", horizontal_alignment="center", rotation=0): + """ + Copied from Visualizer.draw_text() + ----------------------------------- + + Args: + text (str): class label + position (tuple): a tuple of the x and y coordinates to place text on image. + font_size (int, optional): font of the text. If not provided, a font size + proportional to the image width is calculated and used. + color: color of the text. Refer to `matplotlib.colors` for full list + of formats that are accepted. + horizontal_alignment (str): see `matplotlib.text.Text` + rotation: rotation angle in degrees CCW + + Returns: + output (VisImage): image object with text drawn. + """ + # since the text background is dark, we don't want the text to be dark + color = np.maximum(list(mplc.to_rgb(color)), 0.2) + color[np.argmax(color)] = max(0.8, np.max(color)) + + x, y = position + ax.text( + x, + y, + text, + size=font_size, + family="sans-serif", + bbox={ + "facecolor": "black", + "alpha": 0.8, + "pad": 0.7, + "edgecolor": "none" + }, + verticalalignment="top", + horizontalalignment=horizontal_alignment, + color=color, + zorder=10, + rotation=rotation, + ) + return ax + + +def float_to_uint8_color(float_clr): + assert all([c >= 0. for c in float_clr]) + assert all([c <= 1. for c in float_clr]) + return [int(c * 255.) for c in float_clr] + + +def mosaic(items, scale=1.0, pad=3, grid_width=None): + """Creates a mosaic from list of images. + + Parameters + ---------- + items: list of np.ndarray + List of images to mosaic. + + scale: float, default=1.0 + Scale factor applied to images. scale > 1.0 enlarges images. + + pad: int, default=3 + Padding size of the images before mosaic + + grid_width: int, default=None + Mosaic width or grid width of the mosaic + + Returns + ------- + image: np.array of shape (H, W, 3) + Image mosaic + """ + # Determine tile width and height + N = len(items) + assert N > 0, 'No items to mosaic!' + grid_width = grid_width if grid_width else np.ceil(np.sqrt(N)).astype(int) + grid_height = np.ceil(N * 1. 
/ grid_width).astype(int)
+ input_size = items[0].shape[:2]
+ target_shape = (int(input_size[1] * scale), int(input_size[0] * scale))
+ mosaic_items = []
+ for j in range(grid_width * grid_height):
+ if j < N:
+ # Resize every image to the common target shape
+ # (the size of items[0], scaled by `scale`)
+ im = cv2.resize(items[j], dsize=target_shape)
+ mosaic_items.append(im)
+ else:
+ mosaic_items.append(np.zeros_like(mosaic_items[-1]))
+
+ # Stack W tiles horizontally first, then vertically
+ im_pad = lambda im: cv2.copyMakeBorder(im, pad, pad, pad, pad, cv2.BORDER_CONSTANT, 0)
+ mosaic_items = [im_pad(im) for im in mosaic_items]
+ hstack = [np.hstack(mosaic_items[j:j + grid_width]) for j in range(0, len(mosaic_items), grid_width)]
+ mosaic_viz = np.vstack(hstack) if len(hstack) > 1 \
+ else hstack[0]
+ return mosaic_viz
diff --git a/adzoo/bevformer/mmdet3d_plugin/models/hooks/__init__.py b/adzoo/bevformer/mmdet3d_plugin/models/hooks/__init__.py
new file mode 100644
index 0000000..93b13c9
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/models/hooks/__init__.py
@@ -0,0 +1 @@
+from .hooks import GradChecker
\ No newline at end of file
diff --git a/adzoo/bevformer/mmdet3d_plugin/models/hooks/hooks.py b/adzoo/bevformer/mmdet3d_plugin/models/hooks/hooks.py
new file mode 100644
index 0000000..56ff7fd
--- /dev/null
+++ b/adzoo/bevformer/mmdet3d_plugin/models/hooks/hooks.py
@@ -0,0 +1,13 @@
+from mmcv.runner.hooks.hook import HOOKS, Hook
+from projects.mmdet3d_plugin.models.utils import run_time
+
+
+@HOOKS.register_module()
+class GradChecker(Hook):
+
+ def after_train_iter(self, runner):
+ for key, val in runner.model.named_parameters():
+ if val.grad is None and val.requires_grad:
+ print('WARNING: {key}\'s parameters are not used!'.format(key=key))
+
+
diff --git a/adzoo/bevformer/model_converters/convert_votenet_checkpoints.py b/adzoo/bevformer/model_converters/convert_votenet_checkpoints.py
new file mode 100755
index 0000000..33792b0
--- /dev/null
+++ b/adzoo/bevformer/model_converters/convert_votenet_checkpoints.py
@@ -0,0 +1,152 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import tempfile
+import torch
+from mmcv import Config
+from mmcv.runner import load_state_dict
+
+from mmdet3d.models import build_detector
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='MMDet3D upgrade model version (before v0.6.0) of VoteNet')
+ parser.add_argument('checkpoint', help='checkpoint file')
+ parser.add_argument('--out', help='path of the output checkpoint file')
+ args = parser.parse_args()
+ return args
+
+
+def parse_config(config_strings):
+ """Parse config from strings.
+
+ Args:
+ config_strings (string): strings of model config.
+ + Returns: + Config: model config + """ + temp_file = tempfile.NamedTemporaryFile() + config_path = f'{temp_file.name}.py' + with open(config_path, 'w') as f: + f.write(config_strings) + + config = Config.fromfile(config_path) + + # Update backbone config + if 'pool_mod' in config.model.backbone: + config.model.backbone.pop('pool_mod') + + if 'sa_cfg' not in config.model.backbone: + config.model.backbone['sa_cfg'] = dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True) + + if 'type' not in config.model.bbox_head.vote_aggregation_cfg: + config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule' + + # Update bbox_head config + if 'pred_layer_cfg' not in config.model.bbox_head: + config.model.bbox_head['pred_layer_cfg'] = dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True) + + if 'feat_channels' in config.model.bbox_head: + config.model.bbox_head.pop('feat_channels') + + if 'vote_moudule_cfg' in config.model.bbox_head: + config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop( + 'vote_moudule_cfg') + + if config.model.bbox_head.vote_aggregation_cfg.use_xyz: + config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3 + + temp_file.close() + + return config + + +def main(): + """Convert keys in checkpoints for VoteNet. + + There can be some breaking changes during the development of mmdetection3d, + and this tool is used for upgrading checkpoints trained with old versions + (before v0.6.0) to the latest one. + """ + args = parse_args() + checkpoint = torch.load(args.checkpoint) + cfg = parse_config(checkpoint['meta']['config']) + # Build the model and load checkpoint + model = build_detector( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + orig_ckpt = checkpoint['state_dict'] + converted_ckpt = orig_ckpt.copy() + + if cfg['dataset_type'] == 'ScanNetDataset': + NUM_CLASSES = 18 + elif cfg['dataset_type'] == 'SUNRGBDDataset': + NUM_CLASSES = 10 + else: + raise NotImplementedError + + RENAME_PREFIX = { + 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0', + 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1' + } + + DEL_KEYS = [ + 'bbox_head.conv_pred.0.bn.num_batches_tracked', + 'bbox_head.conv_pred.1.bn.num_batches_tracked' + ] + + EXTRACT_KEYS = { + 'bbox_head.conv_pred.conv_cls.weight': + ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_cls.bias': + ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_reg.weight': + ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), + 'bbox_head.conv_pred.conv_reg.bias': + ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) + } + + # Delete some useless keys + for key in DEL_KEYS: + converted_ckpt.pop(key) + + # Rename keys with specific prefix + RENAME_KEYS = dict() + for old_key in converted_ckpt.keys(): + for rename_prefix in RENAME_PREFIX.keys(): + if rename_prefix in old_key: + new_key = old_key.replace(rename_prefix, + RENAME_PREFIX[rename_prefix]) + RENAME_KEYS[new_key] = old_key + for new_key, old_key in RENAME_KEYS.items(): + converted_ckpt[new_key] = converted_ckpt.pop(old_key) + + # Extract weights and rename the keys + for new_key, (old_key, indices) in EXTRACT_KEYS.items(): + cur_layers = orig_ckpt[old_key] + converted_layers = [] + for (start, end) in indices: + if end != -1: + converted_layers.append(cur_layers[start:end]) + else: + converted_layers.append(cur_layers[start:]) + 
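+        # The legacy checkpoints fuse the classification and regression heads into a
+        # single conv_out tensor; the (start, end) pairs in EXTRACT_KEYS carve it back
+        # apart: channels [0:2] plus the trailing NUM_CLASSES channels feed conv_cls,
+        # while the middle block [2:-NUM_CLASSES] feeds conv_reg. An end index of -1
+        # is a sentinel meaning "slice to the end of the tensor".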
converted_layers = torch.cat(converted_layers, 0) + converted_ckpt[new_key] = converted_layers + if old_key in converted_ckpt.keys(): + converted_ckpt.pop(old_key) + + # Check the converted checkpoint by loading to the model + load_state_dict(model, converted_ckpt, strict=True) + checkpoint['state_dict'] = converted_ckpt + torch.save(checkpoint, args.out) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/model_converters/publish_model.py b/adzoo/bevformer/model_converters/publish_model.py new file mode 100755 index 0000000..318fd46 --- /dev/null +++ b/adzoo/bevformer/model_converters/publish_model.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import subprocess +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + parser.add_argument('out_file', help='output checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. + torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) + subprocess.Popen(['mv', out_file, final_file]) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.out_file) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/model_converters/regnet2mmdet.py b/adzoo/bevformer/model_converters/regnet2mmdet.py new file mode 100755 index 0000000..9dee3c8 --- /dev/null +++ b/adzoo/bevformer/model_converters/regnet2mmdet.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
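+#
+# Usage sketch (paths are illustrative, nothing here ships with the repo):
+#   python adzoo/bevformer/model_converters/regnet2mmdet.py \
+#       path/to/pycls_regnet.pyth path/to/regnet_mmdet.pth
+# The source checkpoint is a pycls file whose weights live under 'model_state';
+# the converted file holds a single 'state_dict' entry with mmdet-style names.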
+import argparse +import torch +from collections import OrderedDict + + +def convert_stem(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('stem.conv', 'conv1') + new_key = new_key.replace('stem.bn', 'bn1') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_head(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('head.fc', 'fc') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_reslayer(model_key, model_weight, state_dict, converted_names): + split_keys = model_key.split('.') + layer, block, module = split_keys[:3] + block_id = int(block[1:]) + layer_name = f'layer{int(layer[1:])}' + block_name = f'{block_id - 1}' + + if block_id == 1 and module == 'bn': + new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}' + elif block_id == 1 and module == 'proj': + new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}' + elif module == 'f': + if split_keys[3] == 'a_bn': + module_name = 'bn1' + elif split_keys[3] == 'b_bn': + module_name = 'bn2' + elif split_keys[3] == 'c_bn': + module_name = 'bn3' + elif split_keys[3] == 'a': + module_name = 'conv1' + elif split_keys[3] == 'b': + module_name = 'conv2' + elif split_keys[3] == 'c': + module_name = 'conv3' + new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}' + else: + raise ValueError(f'Unsupported conversion of key {model_key}') + print(f'Convert {model_key} to {new_key}') + state_dict[new_key] = model_weight + converted_names.add(model_key) + + +def convert(src, dst): + """Convert keys in pycls pretrained RegNet models to mmdet style.""" + # load caffe model + regnet_model = torch.load(src) + blobs = regnet_model['model_state'] + # convert to pytorch style + state_dict = OrderedDict() + converted_names = set() + for key, weight in blobs.items(): + if 'stem' in key: + convert_stem(key, weight, state_dict, converted_names) + elif 'head' in key: + convert_head(key, weight, state_dict, converted_names) + elif key.startswith('s'): + convert_reslayer(key, weight, state_dict, converted_names) + + # check if all layers are converted + for key in blobs: + if key not in converted_names: + print(f'not converted: {key}') + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src detectron model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/test.py b/adzoo/bevformer/test.py new file mode 100755 index 0000000..ca3a035 --- /dev/null +++ b/adzoo/bevformer/test.py @@ -0,0 +1,259 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
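+# Example launch (illustrative): the script asserts on non-distributed runs,
+# so use a distributed launcher even for a single GPU, e.g.
+#   torchrun --nproc_per_node=1 adzoo/bevformer/test.py <config> <checkpoint> \
+#       --launcher pytorch --eval bbox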
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import argparse +import os +import torch +import warnings +from mmcv.utils import get_dist_info, init_dist, wrap_fp16_model, set_random_seed, Config, DictAction, load_checkpoint +from mmcv.models import build_model, fuse_conv_bn +from torch.nn import DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel + +from mmcv.datasets import build_dataset, build_dataloader, replace_ImageToTensor +import time +import os.path as osp +from adzoo.bevformer.apis.test import custom_multi_gpu_test + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', help='output result file in pickle format') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--format-only', + action='store_true', + help='Format the output results without perform evaluation. It is' + 'useful when you want to format the result to a specific format and ' + 'submit it to the test server') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where results will be saved') + parser.add_argument( + '--gpu-collect', + action='store_true', + help='whether to use gpu to collect results.') + parser.add_argument( + '--tmpdir', + help='tmp directory used for collecting results from multiple ' + 'workers, available when gpu-collect is not specified') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--eval-options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both specified, ' + '--options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +def main(): + args = parse_args() + + assert args.out or args.eval or args.format_only or args.show \ + or args.show_dir, \ + ('Please specify at least one operation (save/eval/format/show the ' + 'results / save the results) with the argument "--out", "--eval"' + ', "--format-only", "--show" or "--show-dir"') + + if args.eval and args.format_only: + raise ValueError('--eval and --format_only cannot be both specified') + + if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): + raise ValueError('The output file must be a pkl file.') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # # import modules from plguin/xx, registry will be updated + # if hasattr(cfg, 'plugin'): + # if cfg.plugin: + # import importlib + # if hasattr(cfg, 'plugin_dir'): + # plugin_dir = cfg.plugin_dir + # _module_dir = os.path.dirname(plugin_dir) + # _module_dir = _module_dir.split('/') + # _module_path = _module_dir[0] + + # for m in _module_dir[1:]: + # _module_path = _module_path + '.' + m + # print(_module_path) + # plg_lib = importlib.import_module(_module_path) + # else: + # # import dir is the dirpath for the config file + # _module_dir = os.path.dirname(args.config) + # _module_dir = _module_dir.split('/') + # _module_path = _module_dir[0] + # for m in _module_dir[1:]: + # _module_path = _module_path + '.' 
+ m + # print(_module_path) + # plg_lib = importlib.import_module(_module_path) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # set tf32 + if cfg.get('close_tf32', False): + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + + cfg.model.pretrained = None + # in case the test dataset is concatenated + samples_per_gpu = 1 + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # set random seeds + if args.seed is not None: + set_random_seed(args.seed, deterministic=args.deterministic) + + # build the dataloader + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + # old versions did not save class info in checkpoints, this walkaround is + # for backward compatibility + if 'CLASSES' in checkpoint.get('meta', {}): + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + # palette for visualization in segmentation tasks + if 'PALETTE' in checkpoint.get('meta', {}): + model.PALETTE = checkpoint['meta']['PALETTE'] + elif hasattr(dataset, 'PALETTE'): + # segmentation dataset has `PALETTE` attribute + model.PALETTE = dataset.PALETTE + + if not distributed: + assert False + # model = MMDataParallel(model, device_ids=[0]) + # outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) + else: + model = DistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + rank, _ = get_dist_info() + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + assert False + #mmcv.dump(outputs['bbox_results'], args.out) + kwargs = {} if args.eval_options is None else args.eval_options + kwargs['jsonfile_prefix'] = osp.join('test', args.config.split( + '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) + if args.format_only: + dataset.format_results(outputs, **kwargs) + + if args.eval: + eval_kwargs = cfg.get('evaluation', {}).copy() + # hard-code way to remove EvalHook args + for key in [ + 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', + 'rule' + ]: + eval_kwargs.pop(key, None) + 
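+            # only the remaining keys (plus metric/jsonfile_prefix below) are meant
+            # for dataset.evaluate(); the popped ones configure the EvalHook schedule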
eval_kwargs.update(dict(metric=args.eval, **kwargs)) + + print(dataset.evaluate(outputs, **eval_kwargs)) + + +if __name__ == '__main__': + main() diff --git a/adzoo/bevformer/train.py b/adzoo/bevformer/train.py new file mode 100755 index 0000000..ce20ce4 --- /dev/null +++ b/adzoo/bevformer/train.py @@ -0,0 +1,237 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from __future__ import division + +import argparse +import copy +import mmcv +import os +import time +import torch +import warnings +from mmcv import Config, DictAction +from mmcv.utils import get_dist_info, init_dist +from os import path as osp + + +from mmcv.datasets import build_dataset +from mmcv.models import build_model +from mmcv.utils import collect_env, get_root_logger +from mmcv.utils import set_random_seed + +from mmcv.utils import TORCH_VERSION, digit_version +from adzoo.bevformer.mmdet3d_plugin.bevformer.apis.train import custom_train_model + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--load-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local-rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both specified, ' + '--options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # set cudnn_benchmark + #if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # set tf32 + # if cfg.get('close_tf32', False): + # torch.backends.cuda.matmul.allow_tf32 = False + # torch.backends.cudnn.allow_tf32 = False + + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + # if args.resume_from is not None: + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + if args.load_from is not None and osp.isfile(args.load_from): + cfg.load_from = args.load_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': + cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. 
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + # specify logger name, if we still use 'mmdet', the output info will be + # filtered and won't be saved in the log_file + # TODO: ugly workaround to judge whether we are training det or seg model + if cfg.model.type in ['EncoderDecoder3D']: + logger_name = 'mmseg' + else: + logger_name = 'mmdet' + logger = get_root_logger( + log_file=log_file, log_level=cfg.log_level, name=logger_name) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + logger.info(f'Model:\n{model}') + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main() diff --git a/adzoo/uniad/analysis_tools/__init__.py b/adzoo/uniad/analysis_tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adzoo/uniad/analysis_tools/analyze_logs.py b/adzoo/uniad/analysis_tools/analyze_logs.py new file mode 100755 index 0000000..806175f --- /dev/null +++ 
b/adzoo/uniad/analysis_tools/analyze_logs.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import json +import numpy as np +import seaborn as sns +from collections import defaultdict +from matplotlib import pyplot as plt + + +def cal_train_time(log_dicts, args): + for i, log_dict in enumerate(log_dicts): + print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') + all_times = [] + for epoch in log_dict.keys(): + if args.include_outliers: + all_times.append(log_dict[epoch]['time']) + else: + all_times.append(log_dict[epoch]['time'][1:]) + all_times = np.array(all_times) + epoch_ave_time = all_times.mean(-1) + slowest_epoch = epoch_ave_time.argmax() + fastest_epoch = epoch_ave_time.argmin() + std_over_epoch = epoch_ave_time.std() + print(f'slowest epoch {slowest_epoch + 1}, ' + f'average time is {epoch_ave_time[slowest_epoch]:.4f}') + print(f'fastest epoch {fastest_epoch + 1}, ' + f'average time is {epoch_ave_time[fastest_epoch]:.4f}') + print(f'time std over epochs is {std_over_epoch:.4f}') + print(f'average iter time: {np.mean(all_times):.4f} s/iter') + print() + + +def plot_curve(log_dicts, args): + if args.backend is not None: + plt.switch_backend(args.backend) + sns.set_style(args.style) + # if legend is None, use {filename}_{key} as legend + legend = args.legend + if legend is None: + legend = [] + for json_log in args.json_logs: + for metric in args.keys: + legend.append(f'{json_log}_{metric}') + assert len(legend) == (len(args.json_logs) * len(args.keys)) + metrics = args.keys + + num_metrics = len(metrics) + for i, log_dict in enumerate(log_dicts): + epochs = list(log_dict.keys()) + for j, metric in enumerate(metrics): + print(f'plot curve of {args.json_logs[i]}, metric is {metric}') + if metric not in log_dict[epochs[args.interval - 1]]: + raise KeyError( + f'{args.json_logs[i]} does not contain metric {metric}') + + if args.mode == 'eval': + if min(epochs) == args.interval: + x0 = args.interval + else: + # if current training is resumed from previous checkpoint + # we lost information in early epochs + # `xs` should start according to `min(epochs)` + if min(epochs) % args.interval == 0: + x0 = min(epochs) + else: + # find the first epoch that do eval + x0 = min(epochs) + args.interval - \ + min(epochs) % args.interval + xs = np.arange(x0, max(epochs) + 1, args.interval) + ys = [] + for epoch in epochs[args.interval - 1::args.interval]: + ys += log_dict[epoch][metric] + + # if training is aborted before eval of the last epoch + # `xs` and `ys` will have different length and cause an error + # check if `ys[-1]` is empty here + if not log_dict[epoch][metric]: + xs = xs[:-1] + + ax = plt.gca() + ax.set_xticks(xs) + plt.xlabel('epoch') + plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') + else: + xs = [] + ys = [] + num_iters_per_epoch = \ + log_dict[epochs[args.interval-1]]['iter'][-1] + for epoch in epochs[args.interval - 1::args.interval]: + iters = log_dict[epoch]['iter'] + if log_dict[epoch]['mode'][-1] == 'val': + iters = iters[:-1] + xs.append( + np.array(iters) + (epoch - 1) * num_iters_per_epoch) + ys.append(np.array(log_dict[epoch][metric][:len(iters)])) + xs = np.concatenate(xs) + ys = np.concatenate(ys) + plt.xlabel('iter') + plt.plot( + xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) + plt.legend() + if args.title is not None: + plt.title(args.title) + if args.out is None: + plt.show() + else: + print(f'save curve to: {args.out}') + plt.savefig(args.out) + plt.cla() + + +def 
add_plot_parser(subparsers): + parser_plt = subparsers.add_parser( + 'plot_curve', help='parser for plotting curves') + parser_plt.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_plt.add_argument( + '--keys', + type=str, + nargs='+', + default=['mAP_0.25'], + help='the metric that you want to plot') + parser_plt.add_argument('--title', type=str, help='title of figure') + parser_plt.add_argument( + '--legend', + type=str, + nargs='+', + default=None, + help='legend of each plot') + parser_plt.add_argument( + '--backend', type=str, default=None, help='backend of plt') + parser_plt.add_argument( + '--style', type=str, default='dark', help='style of plt') + parser_plt.add_argument('--out', type=str, default=None) + parser_plt.add_argument('--mode', type=str, default='train') + parser_plt.add_argument('--interval', type=int, default=1) + + +def add_time_parser(subparsers): + parser_time = subparsers.add_parser( + 'cal_train_time', + help='parser for computing the average time per training iteration') + parser_time.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_time.add_argument( + '--include-outliers', + action='store_true', + help='include the first value of every epoch when computing ' + 'the average time') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Analyze Json Log') + # currently only support plot curve and calculate average train time + subparsers = parser.add_subparsers(dest='task', help='task parser') + add_plot_parser(subparsers) + add_time_parser(subparsers) + args = parser.parse_args() + return args + + +def load_json_logs(json_logs): + # load and convert json_logs to log_dict, key is epoch, value is a sub dict + # keys of sub dict is different metrics, e.g. memory, bbox_mAP + # value of sub dict is a list of corresponding values of all iterations + log_dicts = [dict() for _ in json_logs] + for json_log, log_dict in zip(json_logs, log_dicts): + with open(json_log, 'r') as log_file: + for line in log_file: + log = json.loads(line.strip()) + # skip lines without `epoch` field + if 'epoch' not in log: + continue + epoch = log.pop('epoch') + if epoch not in log_dict: + log_dict[epoch] = defaultdict(list) + for k, v in log.items(): + log_dict[epoch][k].append(v) + return log_dicts + + +def main(): + args = parse_args() + + json_logs = args.json_logs + for json_log in json_logs: + assert json_log.endswith('.json') + + log_dicts = load_json_logs(json_logs) + + eval(args.task)(log_dicts, args) + + +if __name__ == '__main__': + main() diff --git a/adzoo/uniad/analysis_tools/benchmark.py b/adzoo/uniad/analysis_tools/benchmark.py new file mode 100755 index 0000000..6ed3976 --- /dev/null +++ b/adzoo/uniad/analysis_tools/benchmark.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
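+#
+# Rough usage (illustrative):
+#   python adzoo/uniad/analysis_tools/benchmark.py <config> --checkpoint <ckpt>
+# Caveat: --samples and --log-interval are declared without type=int, so values
+# given on the command line arrive as strings; the integer defaults are what the
+# arithmetic below safely relies on.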
+import argparse +import time +import torch +from mmcv import Config +from mmcv.parallel import MMDataParallel +from mmcv.runner import load_checkpoint, wrap_fp16_model +import sys +sys.path.append('.') +from mmcv.datasets.builder import build_dataloader +from mmcv.datasets import build_dataset +from mmcv.models import build_detector +#from tools.misc.fuse_conv_bn import fuse_module + + +def parse_args(): + parser = argparse.ArgumentParser(description='MMDet benchmark a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('--checkpoint', default=None, help='checkpoint file') + parser.add_argument('--samples', default=2000, help='samples to benchmark') + parser.add_argument( + '--log-interval', default=10, help='interval of logging') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + cfg.model.pretrained = None + cfg.data.test.test_mode = True + + # build the dataloader + # TODO: support multiple images per gpu (only minor changes are needed) + print(cfg.data.test) + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=False, + shuffle=False) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + if args.checkpoint is not None: + load_checkpoint(model, args.checkpoint, map_location='cpu') + #if args.fuse_conv_bn: + # model = fuse_module(model) + + model = MMDataParallel(model, device_ids=[0]) + + model.eval() + + # the first several iterations may be very slow so skip them + num_warmup = 5 + pure_inf_time = 0 + + # benchmark with several samples and take the average + for i, data in enumerate(data_loader): + torch.cuda.synchronize() + start_time = time.perf_counter() + with torch.no_grad(): + model(return_loss=False, rescale=True, **data) + + torch.cuda.synchronize() + elapsed = time.perf_counter() - start_time + + if i >= num_warmup: + pure_inf_time += elapsed + if (i + 1) % args.log_interval == 0: + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Done image [{i + 1:<3}/ {args.samples}], ' + f'fps: {fps:.1f} img / s') + + if (i + 1) == args.samples: + pure_inf_time += elapsed + fps = (i + 1 - num_warmup) / pure_inf_time + print(f'Overall fps: {fps:.1f} img / s') + break + + +if __name__ == '__main__': + main() diff --git a/adzoo/uniad/analysis_tools/visualize/render/base_render.py b/adzoo/uniad/analysis_tools/visualize/render/base_render.py new file mode 100644 index 0000000..65dbbeb --- /dev/null +++ b/adzoo/uniad/analysis_tools/visualize/render/base_render.py @@ -0,0 +1,32 @@ +import matplotlib.pyplot as plt +from pyquaternion import Quaternion + + +class BaseRender: + """ + BaseRender class + """ + + def __init__( + self, + figsize=(10, 10)): + self.figsize = figsize + self.fig, self.axes = None, None + + def reset_canvas(self, dx=1, dy=1, tight_layout=False): + plt.close() + plt.gca().set_axis_off() + plt.axis('off') + self.fig, self.axes = plt.subplots(dx, dy, figsize=self.figsize) + if tight_layout: + plt.tight_layout() + + def 
close_canvas(self): + plt.close() + + def save_fig(self, filename): + plt.subplots_adjust(top=1, bottom=0, right=1, left=0, + hspace=0, wspace=0) + plt.margins(0, 0) + print(f'saving to {filename}') + plt.savefig(filename) diff --git a/adzoo/uniad/analysis_tools/visualize/render/bev_render.py b/adzoo/uniad/analysis_tools/visualize/render/bev_render.py new file mode 100644 index 0000000..fcc6ffa --- /dev/null +++ b/adzoo/uniad/analysis_tools/visualize/render/bev_render.py @@ -0,0 +1,264 @@ +import cv2 +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +from pyquaternion import Quaternion +from nuscenes.prediction import PredictHelper, convert_local_coords_to_global +from tools.analysis_tools.visualize.render.base_render import BaseRender +from tools.analysis_tools.visualize.utils import color_mapping, AgentPredictionData + + +class BEVRender(BaseRender): + """ + Render class for BEV + """ + + def __init__(self, + figsize=(20, 20), + margin: float = 50, + view: np.ndarray = np.eye(4), + show_gt_boxes=False): + super(BEVRender, self).__init__(figsize) + self.margin = margin + self.view = view + self.show_gt_boxes = show_gt_boxes + + def set_plot_cfg(self): + self.axes.set_xlim([-self.margin, self.margin]) + self.axes.set_ylim([-self.margin, self.margin]) + self.axes.set_aspect('equal') + self.axes.grid(False) + + def render_sample_data(self, canvas, sample_token): + pass + + def render_anno_data( + self, + sample_token, + nusc, + predict_helper): + sample_record = nusc.get('sample', sample_token) + assert 'LIDAR_TOP' in sample_record['data'].keys( + ), 'Error: No LIDAR_TOP in data, unable to render.' + lidar_record = sample_record['data']['LIDAR_TOP'] + data_path, boxes, _ = nusc.get_sample_data( + lidar_record, selected_anntokens=sample_record['anns']) + for box in boxes: + instance_token = nusc.get('sample_annotation', box.token)[ + 'instance_token'] + future_xy_local = predict_helper.get_future_for_agent( + instance_token, sample_token, seconds=6, in_agent_frame=True) + if future_xy_local.shape[0] > 0: + trans = box.center + rot = Quaternion(matrix=box.rotation_matrix) + future_xy = convert_local_coords_to_global( + future_xy_local, trans, rot) + future_xy = np.concatenate( + [trans[None, :2], future_xy], axis=0) + c = np.array([0, 0.8, 0]) + box.render(self.axes, view=self.view, colors=(c, c, c)) + self._render_traj(future_xy, line_color=c, dot_color=(0, 0, 0)) + self.axes.set_xlim([-self.margin, self.margin]) + self.axes.set_ylim([-self.margin, self.margin]) + + def show_lidar_data( + self, + sample_token, + nusc): + sample_record = nusc.get('sample', sample_token) + assert 'LIDAR_TOP' in sample_record['data'].keys( + ), 'Error: No LIDAR_TOP in data, unable to render.' 
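+        # NOTE: LidarPointCloud (nuscenes.utils.data_classes) and obtain_map_info
+        # (used by render_hd_map further down) are referenced in this module but not
+        # imported at the top; these code paths need those imports to run.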
+ lidar_record = sample_record['data']['LIDAR_TOP'] + data_path, boxes, _ = nusc.get_sample_data( + lidar_record, selected_anntokens=sample_record['anns']) + LidarPointCloud.from_file(data_path).render_height( + self.axes, view=self.view) + self.axes.set_xlim([-self.margin, self.margin]) + self.axes.set_ylim([-self.margin, self.margin]) + self.axes.axis('off') + self.axes.set_aspect('equal') + + def render_pred_box_data(self, agent_prediction_list): + for pred_agent in agent_prediction_list: + c = np.array([0, 1, 0]) + if hasattr(pred_agent, 'pred_track_id') and pred_agent.pred_track_id is not None: # this is true + tr_id = pred_agent.pred_track_id + c = color_mapping[tr_id % len(color_mapping)] + pred_agent.nusc_box.render( + axis=self.axes, view=self.view, colors=(c, c, c)) + if pred_agent.is_sdc: + c = np.array([1, 0, 0]) + pred_agent.nusc_box.render( + axis=self.axes, view=self.view, colors=(c, c, c)) + + def render_pred_traj(self, agent_prediction_list, top_k=3): + for pred_agent in agent_prediction_list: + if pred_agent.is_sdc: + continue + sorted_ind = np.argsort(pred_agent.pred_traj_score)[ + ::-1] # from high to low + num_modes = len(sorted_ind) + sorted_traj = pred_agent.pred_traj[sorted_ind, :, :2] + sorted_score = pred_agent.pred_traj_score[sorted_ind] + # norm_score = np.sum(np.exp(sorted_score)) + norm_score = np.exp(sorted_score[0]) + + sorted_traj = np.concatenate( + [np.zeros((num_modes, 1, 2)), sorted_traj], axis=1) + trans = pred_agent.pred_center + rot = Quaternion(axis=np.array([0, 0.0, 1.0]), angle=np.pi/2) + vehicle_id_list = [0, 1, 2, 3, 4, 6, 7] + if pred_agent.pred_label in vehicle_id_list: + dot_size = 150 + else: + dot_size = 25 + # print(sorted_score) + for i in range(top_k-1, -1, -1): + viz_traj = sorted_traj[i, :, :2] + viz_traj = convert_local_coords_to_global(viz_traj, trans, rot) + traj_score = np.exp(sorted_score[i])/norm_score + # traj_score = [1.0, 0.01, 0.01, 0.01, 0.01, 0.01][i] + self._render_traj(viz_traj, traj_score=traj_score, + colormap='winter', dot_size=dot_size) + + def render_pred_map_data(self, predicted_map_seg): + # rendered_map = map_color_dict + # divider, crossing, contour + map_color_dict = np.array( + [(204, 128, 0), (102, 255, 102), (102, 255, 102)]) + rendered_map = map_color_dict[predicted_map_seg.argmax( + -1).reshape(-1)].reshape(200, 200, -1) + bg_mask = predicted_map_seg.sum(-1) == 0 + rendered_map[bg_mask, :] = 255 + self.axes.imshow(rendered_map, alpha=0.6, + interpolation='nearest', extent=(-51.2, 51.2, -51.2, 51.2)) + + def render_occ_map_data(self, agent_list): + rendered_map = np.ones((200, 200, 3)) + rendered_map_hsv = matplotlib.colors.rgb_to_hsv(rendered_map) + occ_prob_map = np.zeros((200, 200)) + for i in range(len(agent_list)): + pred_agent = agent_list[i] + if pred_agent.pred_occ_map is None: + continue + if hasattr(pred_agent, 'pred_track_id') and pred_agent.pred_track_id is not None: # this is true + tr_id = pred_agent.pred_track_id + c = color_mapping[tr_id % len(color_mapping)] + pred_occ_map = pred_agent.pred_occ_map.max(0) + update_mask = pred_occ_map > occ_prob_map + occ_prob_map[update_mask] = pred_occ_map[update_mask] + pred_occ_map *= update_mask + hsv_c = matplotlib.colors.rgb_to_hsv(c) + rendered_map_hsv[pred_occ_map > 0.1] = ( + np.ones((200, 200, 1)) * hsv_c)[pred_occ_map > 0.1] + max_prob = pred_occ_map.max() + renorm_pred_occ_map = (pred_occ_map - max_prob) * 0.7 + 1 + sat_map = (renorm_pred_occ_map * hsv_c[1]) + rendered_map_hsv[pred_occ_map > 0.1, + 1] = sat_map[pred_occ_map > 0.1] + rendered_map 
= matplotlib.colors.hsv_to_rgb(rendered_map_hsv) + self.axes.imshow(rendered_map, alpha=0.8, + interpolation='nearest', extent=(-50, 50, -50, 50)) + + def render_occ_map_data_time(self, agent_list, t): + rendered_map = np.ones((200, 200, 3)) + rendered_map_hsv = matplotlib.colors.rgb_to_hsv(rendered_map) + occ_prob_map = np.zeros((200, 200)) + for i in range(len(agent_list)): + pred_agent = agent_list[i] + if pred_agent.pred_occ_map is None: + continue + if hasattr(pred_agent, 'pred_track_id') and pred_agent.pred_track_id is not None: # this is true + tr_id = pred_agent.pred_track_id + c = color_mapping[tr_id % len(color_mapping)] + pred_occ_map = pred_agent.pred_occ_map[t] + update_mask = pred_occ_map > occ_prob_map + occ_prob_map[update_mask] = pred_occ_map[update_mask] + pred_occ_map *= update_mask + hsv_c = matplotlib.colors.rgb_to_hsv(c) + rendered_map_hsv[pred_occ_map > 0.1] = ( + np.ones((200, 200, 1)) * hsv_c)[pred_occ_map > 0.1] + max_prob = pred_occ_map.max() + renorm_pred_occ_map = (pred_occ_map - max_prob) * 0.7 + 1 + sat_map = (renorm_pred_occ_map * hsv_c[1]) + rendered_map_hsv[pred_occ_map > 0.1, + 1] = sat_map[pred_occ_map > 0.1] + rendered_map = matplotlib.colors.hsv_to_rgb(rendered_map_hsv) + self.axes.imshow(rendered_map, alpha=0.8, + interpolation='nearest', extent=(-50, 50, -50, 50)) + + def render_planning_data(self, predicted_planning, show_command=False): + planning_traj = predicted_planning.pred_traj + planning_traj = np.concatenate( + [np.zeros((1, 2)), planning_traj], axis=0) + self._render_traj(planning_traj, colormap='autumn', dot_size=50) + if show_command: + self._render_command(predicted_planning.command) + + def render_planning_attn_mask(self, predicted_planning): + planning_attn_mask = predicted_planning.attn_mask + planning_attn_mask = planning_attn_mask/planning_attn_mask.max() + cmap_name = 'plasma' + self.axes.imshow(planning_attn_mask, alpha=0.8, interpolation='nearest', extent=( + -51.2, 51.2, -51.2, 51.2), vmax=0.2, cmap=matplotlib.colormaps[cmap_name]) + + def render_hd_map(self, nusc, nusc_maps, sample_token): + sample_record = nusc.get('sample', sample_token) + sd_rec = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + info = { + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'scene_token': sample_record['scene_token'] + } + + layer_names = ['road_divider', 'road_segment', 'lane_divider', + 'lane', 'road_divider', 'traffic_light', 'ped_crossing'] + map_mask = obtain_map_info(nusc, + nusc_maps, + info, + patch_size=(102.4, 102.4), + canvas_size=(1024, 1024), + layer_names=layer_names) + map_mask = np.flip(map_mask, axis=1) + map_mask = np.rot90(map_mask, k=-1, axes=(1, 2)) + map_mask = map_mask[:, ::-1] > 0 + map_show = np.ones((1024, 1024, 3)) + map_show[map_mask[0], :] = np.array([1.00, 0.50, 0.31]) + map_show[map_mask[1], :] = np.array([159./255., 0.0, 1.0]) + self.axes.imshow(map_show, alpha=0.2, interpolation='nearest', + extent=(-51.2, 51.2, -51.2, 51.2)) + + def _render_traj(self, future_traj, traj_score=1, colormap='winter', points_per_step=20, line_color=None, dot_color=None, dot_size=25): + total_steps = (len(future_traj)-1) * points_per_step + 1 + dot_colors = matplotlib.colormaps[colormap]( + np.linspace(0, 1, total_steps))[:, 
:3] + dot_colors = dot_colors*traj_score + \ + (1-traj_score)*np.ones_like(dot_colors) + total_xy = np.zeros((total_steps, 2)) + for i in range(total_steps-1): + unit_vec = future_traj[i//points_per_step + + 1] - future_traj[i//points_per_step] + total_xy[i] = (i/points_per_step - i//points_per_step) * \ + unit_vec + future_traj[i//points_per_step] + total_xy[-1] = future_traj[-1] + self.axes.scatter( + total_xy[:, 0], total_xy[:, 1], c=dot_colors, s=dot_size) + + def _render_command(self, command): + command_dict = ['TURN RIGHT', 'TURN LEFT', 'KEEP FORWARD'] + self.axes.text(-48, -45, command_dict[int(command)], fontsize=45) + + def render_sdc_car(self): + sdc_car_png = cv2.imread('sources/sdc_car.png') + sdc_car_png = cv2.cvtColor(sdc_car_png, cv2.COLOR_BGR2RGB) + self.axes.imshow(sdc_car_png, extent=(-1, 1, -2, 2)) + + def render_legend(self): + legend = cv2.imread('sources/legend.png') + legend = cv2.cvtColor(legend, cv2.COLOR_BGR2RGB) + self.axes.imshow(legend, extent=(23, 51.2, -50, -40)) diff --git a/adzoo/uniad/analysis_tools/visualize/render/cam_render.py b/adzoo/uniad/analysis_tools/visualize/render/cam_render.py new file mode 100644 index 0000000..c2646b1 --- /dev/null +++ b/adzoo/uniad/analysis_tools/visualize/render/cam_render.py @@ -0,0 +1,202 @@ +import cv2 +import numpy as np +from PIL import Image +import matplotlib.pyplot as plt +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from tools.analysis_tools.visualize.utils import color_mapping, AgentPredictionData +from tools.analysis_tools.visualize.render.base_render import BaseRender +from pyquaternion import Quaternion + +# Define a constant for camera names +CAM_NAMES = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', +] + + +class CameraRender(BaseRender): + """ + Render class for Camera View + """ + + def __init__(self, + figsize=(53.3333, 20), + show_gt_boxes=False): + super().__init__(figsize) + self.cams = CAM_NAMES + self.show_gt_boxes = show_gt_boxes + + def get_axis(self, index): + """Retrieve the corresponding axis based on the index.""" + return self.axes[index//3, index % 3] + + def project_to_cam(self, + agent_prediction_list, + sample_data_token, + nusc, + lidar_cs_record, + project_traj=False, + cam=None, + ): + """Project predictions to camera view.""" + _, cs_record, pose_record, cam_intrinsic, imsize = self.get_image_info( + sample_data_token, nusc) + boxes = [] + for agent in agent_prediction_list: + box = Box(agent.pred_center, agent.pred_dim, Quaternion(axis=(0.0, 0.0, 1.0), radians=agent.pred_yaw), + name=agent.pred_label, token='predicted') + box.is_sdc = agent.is_sdc + if project_traj: + box.pred_traj = np.zeros((agent.pred_traj_max.shape[0]+1, 3)) + box.pred_traj[:, 0] = agent.pred_center[0] + box.pred_traj[:, 1] = agent.pred_center[1] + box.pred_traj[:, 2] = agent.pred_center[2] - \ + agent.pred_dim[2]/2 + box.pred_traj[1:, :2] += agent.pred_traj_max[:, :2] + box.pred_traj = (Quaternion( + lidar_cs_record['rotation']).rotation_matrix @ box.pred_traj.T).T + box.pred_traj += np.array( + lidar_cs_record['translation'])[None, :] + box.rotate(Quaternion(lidar_cs_record['rotation'])) + box.translate(np.array(lidar_cs_record['translation'])) + boxes.append(box) + # Make list of Box objects including coord system transforms. + + box_list = [] + tr_id_list = [] + for i, box in enumerate(boxes): + # Move box to sensor coord system. 
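+            # the boxes were lifted from the LiDAR frame into the ego frame above via
+            # lidar_cs_record; applying the inverse camera extrinsics (cs_record) here
+            # expresses them in this camera's frame before projection.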
+ box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + if project_traj: + box.pred_traj += -np.array(cs_record['translation'])[None, :] + box.pred_traj = (Quaternion( + cs_record['rotation']).inverse.rotation_matrix @ box.pred_traj.T).T + + tr_id = agent_prediction_list[i].pred_track_id + if box.is_sdc and cam == 'CAM_FRONT': + box_list.append(box) + if not box_in_image(box, cam_intrinsic, imsize): + continue + box_list.append(box) + tr_id_list.append(tr_id) + return box_list, tr_id_list, cam_intrinsic, imsize + + def render_image_data(self, sample_token, nusc): + """Load and annotate image based on the provided path.""" + sample = nusc.get('sample', sample_token) + for i, cam in enumerate(self.cams): + sample_data_token = sample['data'][cam] + data_path, _, _, _, _ = self.get_image_info( + sample_data_token, nusc) + image = self.load_image(data_path, cam) + self.update_image(image, i, cam) + + def load_image(self, data_path, cam): + """Update the axis of the plot with the provided image.""" + image = np.array(Image.open(data_path)) + font = cv2.FONT_HERSHEY_SIMPLEX + org = (50, 60) + fontScale = 2 + color = (0, 0, 0) + thickness = 4 + return cv2.putText(image, cam, org, font, fontScale, color, thickness, cv2.LINE_AA) + + def update_image(self, image, index, cam): + """Render image data for each camera.""" + ax = self.get_axis(index) + ax.imshow(image) + plt.axis('off') + ax.axis('off') + ax.grid(False) + + def render_pred_track_bbox(self, predicted_agent_list, sample_token, nusc): + """Render bounding box for predicted tracks.""" + sample = nusc.get('sample', sample_token) + lidar_cs_record = nusc.get('calibrated_sensor', nusc.get( + 'sample_data', sample['data']['LIDAR_TOP'])['calibrated_sensor_token']) + for i, cam in enumerate(self.cams): + sample_data_token = sample['data'][cam] + box_list, tr_id_list, camera_intrinsic, imsize = self.project_to_cam( + predicted_agent_list, sample_data_token, nusc, lidar_cs_record) + for j, box in enumerate(box_list): + if box.is_sdc: + continue + tr_id = tr_id_list[j] + if tr_id is None: + tr_id = 0 + c = color_mapping[tr_id % len(color_mapping)] + box.render( + self.axes[i//3, i % 3], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + # plot gt + if self.show_gt_boxes: + data_path, boxes, camera_intrinsic = nusc.get_sample_data( + sample_data_token, selected_anntokens=sample['anns']) + for j, box in enumerate(boxes): + c = [0, 1, 0] + box.render( + self.axes[i//3, i % 3], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + self.axes[i//3, i % 3].set_xlim(0, imsize[0]) + self.axes[i//3, i % 3].set_ylim(imsize[1], 0) + + def render_pred_traj(self, predicted_agent_list, sample_token, nusc, render_sdc=False, points_per_step=10): + """Render predicted trajectories.""" + sample = nusc.get('sample', sample_token) + lidar_cs_record = nusc.get('calibrated_sensor', nusc.get( + 'sample_data', sample['data']['LIDAR_TOP'])['calibrated_sensor_token']) + for i, cam in enumerate(self.cams): + sample_data_token = sample['data'][cam] + box_list, tr_id_list, camera_intrinsic, imsize = self.project_to_cam( + predicted_agent_list, sample_data_token, nusc, lidar_cs_record, project_traj=True, cam=cam) + for j, box in enumerate(box_list): + traj_points = box.pred_traj[:, :3] + + total_steps = (len(traj_points)-1) * points_per_step + 1 + total_xy = np.zeros((total_steps, 3)) + for k in range(total_steps-1): + unit_vec = traj_points[k//points_per_step + + 1] - traj_points[k//points_per_step] + total_xy[k] 
= (k/points_per_step - k//points_per_step) * \ + unit_vec + traj_points[k//points_per_step] + in_range_mask = total_xy[:, 2] > 0.1 + traj_points = view_points( + total_xy.T, camera_intrinsic, normalize=True)[:2, :] + traj_points = traj_points[:2, in_range_mask] + if box.is_sdc: + if render_sdc: + self.axes[i//3, i % 3].scatter( + traj_points[0], traj_points[1], color=(1, 0.5, 0), s=150) + else: + continue + else: + tr_id = tr_id_list[j] + if tr_id is None: + tr_id = 0 + c = color_mapping[tr_id % len(color_mapping)] + self.axes[i//3, i % + 3].scatter(traj_points[0], traj_points[1], color=c, s=15) + self.axes[i//3, i % 3].set_xlim(0, imsize[0]) + self.axes[i//3, i % 3].set_ylim(imsize[1], 0) + + def get_image_info(self, sample_data_token, nusc): + """Retrieve image information.""" + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', + sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + return data_path, cs_record, pose_record, cam_intrinsic, imsize diff --git a/adzoo/uniad/analysis_tools/visualize/run.py b/adzoo/uniad/analysis_tools/visualize/run.py new file mode 100644 index 0000000..b64b545 --- /dev/null +++ b/adzoo/uniad/analysis_tools/visualize/run.py @@ -0,0 +1,338 @@ +import cv2 +import torch +import argparse +import os +import glob +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +from nuscenes import NuScenes +from nuscenes.prediction import PredictHelper, convert_local_coords_to_global +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.utils import splits +from pyquaternion import Quaternion +from mmcv.datasets.nuscenes_e2e_dataset import obtain_map_info +from mmcv.datasets.eval_utils.map_api import NuScenesMap +from mmcv.fileio.io import load +from PIL import Image +from tools.analysis_tools.visualize.utils import color_mapping, AgentPredictionData +from tools.analysis_tools.visualize.render.bev_render import BEVRender +from tools.analysis_tools.visualize.render.cam_render import CameraRender + + +class Visualizer: + """ + BaseRender class + """ + + def __init__( + self, + dataroot='/mnt/petrelfs/yangjiazhi/e2e_proj/data/nus_mini', + version='v1.0-mini', + predroot=None, + with_occ_map=False, + with_map=False, + with_planning=False, + with_pred_box=True, + with_pred_traj=False, + show_gt_boxes=False, + show_lidar=False, + show_command=False, + show_hd_map=False, + show_sdc_car=False, + show_sdc_traj=False, + show_legend=False): + self.nusc = NuScenes(version=version, dataroot=dataroot, verbose=True) + self.predict_helper = PredictHelper(self.nusc) + self.with_occ_map = with_occ_map + self.with_map = with_map + self.with_planning = with_planning + self.show_lidar = show_lidar + self.show_command = show_command + self.show_hd_map = show_hd_map + self.show_sdc_car = show_sdc_car + self.show_sdc_traj = show_sdc_traj + self.show_legend = show_legend + self.with_pred_traj = with_pred_traj + self.with_pred_box = with_pred_box + self.veh_id_list = [0, 1, 2, 3, 4, 6, 7] + self.use_json = '.json' in predroot + 
self.token_set = set() + self.predictions = self._parse_predictions_multitask_pkl(predroot) + self.bev_render = BEVRender(show_gt_boxes=show_gt_boxes) + self.cam_render = CameraRender(show_gt_boxes=show_gt_boxes) + + if self.show_hd_map: + self.nusc_maps = { + 'boston-seaport': NuScenesMap(dataroot=dataroot, map_name='boston-seaport'), + 'singapore-hollandvillage': NuScenesMap(dataroot=dataroot, map_name='singapore-hollandvillage'), + 'singapore-onenorth': NuScenesMap(dataroot=dataroot, map_name='singapore-onenorth'), + 'singapore-queenstown': NuScenesMap(dataroot=dataroot, map_name='singapore-queenstown'), + } + + def _parse_predictions_multitask_pkl(self, predroot): + + outputs = load(predroot) + outputs = outputs['bbox_results'] + prediction_dict = dict() + for k in range(len(outputs)): + token = outputs[k]['token'] + self.token_set.add(token) + if self.show_sdc_traj: + outputs[k]['boxes_3d'].tensor = torch.cat( + [outputs[k]['boxes_3d'].tensor, outputs[k]['sdc_boxes_3d'].tensor], dim=0) + outputs[k]['scores_3d'] = torch.cat( + [outputs[k]['scores_3d'], outputs[k]['sdc_scores_3d']], dim=0) + outputs[k]['labels_3d'] = torch.cat([outputs[k]['labels_3d'], torch.zeros( + (1,), device=outputs[k]['labels_3d'].device)], dim=0) + # detection + bboxes = outputs[k]['boxes_3d'] + scores = outputs[k]['scores_3d'] + labels = outputs[k]['labels_3d'] + + track_scores = scores.cpu().detach().numpy() + track_labels = labels.cpu().detach().numpy() + track_boxes = bboxes.tensor.cpu().detach().numpy() + + track_centers = bboxes.gravity_center.cpu().detach().numpy() + track_dims = bboxes.dims.cpu().detach().numpy() + track_yaw = bboxes.yaw.cpu().detach().numpy() + + if 'track_ids' in outputs[k]: + track_ids = outputs[k]['track_ids'].cpu().detach().numpy() + else: + track_ids = None + + # speed + track_velocity = bboxes.tensor.cpu().detach().numpy()[:, -2:] + + # trajectories + trajs = outputs[k][f'traj'].numpy() + traj_scores = outputs[k][f'traj_scores'].numpy() + + predicted_agent_list = [] + + # occflow + if self.with_occ_map: + if 'topk_query_ins_segs' in outputs[k]['occ']: + occ_map = outputs[k]['occ']['topk_query_ins_segs'][0].cpu( + ).numpy() + else: + occ_map = np.zeros((1, 5, 200, 200)) + else: + occ_map = None + + occ_idx = 0 + for i in range(track_scores.shape[0]): + if track_scores[i] < 0.25: + continue + if occ_map is not None and track_labels[i] in self.veh_id_list: + occ_map_cur = occ_map[occ_idx, :, ::-1] + occ_idx += 1 + else: + occ_map_cur = None + if track_ids is not None: + if i < len(track_ids): + track_id = track_ids[i] + else: + track_id = 0 + else: + track_id = None + # if track_labels[i] not in [0, 1, 2, 3, 4, 6, 7]: + # continue + predicted_agent_list.append( + AgentPredictionData( + track_scores[i], + track_labels[i], + track_centers[i], + track_dims[i], + track_yaw[i], + track_velocity[i], + trajs[i], + traj_scores[i], + pred_track_id=track_id, + pred_occ_map=occ_map_cur, + past_pred_traj=None + ) + ) + + if self.with_map: + map_thres = 0.7 + score_list = outputs[k]['pts_bbox']['score_list'].cpu().numpy().transpose([ + 1, 2, 0]) + predicted_map_seg = outputs[k]['pts_bbox']['lane_score'].cpu().numpy().transpose([ + 1, 2, 0]) # H, W, C + predicted_map_seg[..., -1] = score_list[..., -1] + predicted_map_seg = (predicted_map_seg > map_thres) * 1.0 + predicted_map_seg = predicted_map_seg[::-1, :, :] + else: + predicted_map_seg = None + + if self.with_planning: + # detection + bboxes = outputs[k]['sdc_boxes_3d'] + scores = outputs[k]['sdc_scores_3d'] + labels = 0 + + track_scores = 
scores.cpu().detach().numpy() + track_labels = labels + track_boxes = bboxes.tensor.cpu().detach().numpy() + + track_centers = bboxes.gravity_center.cpu().detach().numpy() + track_dims = bboxes.dims.cpu().detach().numpy() + track_yaw = bboxes.yaw.cpu().detach().numpy() + track_velocity = bboxes.tensor.cpu().detach().numpy()[:, -2:] + + if self.show_command: + command = outputs[k]['command'][0].cpu().detach().numpy() + else: + command = None + planning_agent = AgentPredictionData( + track_scores[0], + track_labels, + track_centers[0], + track_dims[0], + track_yaw[0], + track_velocity[0], + outputs[k]['planning_traj'][0].cpu().detach().numpy(), + 1, + pred_track_id=-1, + pred_occ_map=None, + past_pred_traj=None, + is_sdc=True, + command=command, + ) + predicted_agent_list.append(planning_agent) + else: + planning_agent = None + prediction_dict[token] = dict(predicted_agent_list=predicted_agent_list, + predicted_map_seg=predicted_map_seg, + predicted_planning=planning_agent) + return prediction_dict + + def visualize_bev(self, sample_token, out_filename, t=None): + self.bev_render.reset_canvas(dx=1, dy=1) + self.bev_render.set_plot_cfg() + + if self.show_lidar: + self.bev_render.show_lidar_data(sample_token, self.nusc) + if self.bev_render.show_gt_boxes: + self.bev_render.render_anno_data( + sample_token, self.nusc, self.predict_helper) + if self.with_pred_box: + self.bev_render.render_pred_box_data( + self.predictions[sample_token]['predicted_agent_list']) + if self.with_pred_traj: + self.bev_render.render_pred_traj( + self.predictions[sample_token]['predicted_agent_list']) + if self.with_map: + self.bev_render.render_pred_map_data( + self.predictions[sample_token]['predicted_map_seg']) + if self.with_occ_map: + self.bev_render.render_occ_map_data( + self.predictions[sample_token]['predicted_agent_list']) + if self.with_planning: + self.bev_render.render_pred_box_data( + [self.predictions[sample_token]['predicted_planning']]) + self.bev_render.render_planning_data( + self.predictions[sample_token]['predicted_planning'], show_command=self.show_command) + if self.show_hd_map: + self.bev_render.render_hd_map( + self.nusc, self.nusc_maps, sample_token) + if self.show_sdc_car: + self.bev_render.render_sdc_car() + if self.show_legend: + self.bev_render.render_legend() + self.bev_render.save_fig(out_filename + '.jpg') + + def visualize_cam(self, sample_token, out_filename): + self.cam_render.reset_canvas(dx=2, dy=3, tight_layout=True) + self.cam_render.render_image_data(sample_token, self.nusc) + self.cam_render.render_pred_track_bbox( + self.predictions[sample_token]['predicted_agent_list'], sample_token, self.nusc) + self.cam_render.render_pred_traj( + self.predictions[sample_token]['predicted_agent_list'], sample_token, self.nusc, render_sdc=self.with_planning) + self.cam_render.save_fig(out_filename + '_cam.jpg') + + def combine(self, out_filename): + # pass + bev_image = cv2.imread(out_filename + '.jpg') + cam_image = cv2.imread(out_filename + '_cam.jpg') + merge_image = cv2.hconcat([cam_image, bev_image]) + cv2.imwrite(out_filename + '.jpg', merge_image) + os.remove(out_filename + '_cam.jpg') + + def to_video(self, folder_path, out_path, fps=4, downsample=1): + imgs_path = glob.glob(os.path.join(folder_path, '*.jpg')) + imgs_path = sorted(imgs_path) + img_array = [] + for img_path in imgs_path: + img = cv2.imread(img_path) + height, width, channel = img.shape + img = cv2.resize(img, (width//downsample, height // + downsample), interpolation=cv2.INTER_AREA) + height, width, channel = img.shape 
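+            # note: all frames are assumed to share this (downsampled) resolution; the writer below uses the size of the last frame read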
+ size = (width, height) + img_array.append(img) + out = cv2.VideoWriter( + out_path, cv2.VideoWriter_fourcc(*'DIVX'), fps, size) + for i in range(len(img_array)): + out.write(img_array[i]) + out.release() + +def main(args): + render_cfg = dict( + with_occ_map=False, + with_map=False, + with_planning=True, + with_pred_box=True, + with_pred_traj=True, + show_gt_boxes=False, + show_lidar=False, + show_command=True, + show_hd_map=False, + show_sdc_car=True, + show_legend=True, + show_sdc_traj=False + ) + + viser = Visualizer(version='v1.0-mini', predroot=args.predroot, dataroot='data/nuscenes', **render_cfg) + + if not os.path.exists(args.out_folder): + os.makedirs(args.out_folder) + + val_splits = splits.val + + scene_token_to_name = dict() + for i in range(len(viser.nusc.scene)): + scene_token_to_name[viser.nusc.scene[i]['token']] = viser.nusc.scene[i]['name'] + + for i in range(len(viser.nusc.sample)): + sample_token = viser.nusc.sample[i]['token'] + scene_token = viser.nusc.sample[i]['scene_token'] + + if scene_token_to_name[scene_token] not in val_splits: + continue + + if sample_token not in viser.token_set: + print(i, sample_token, 'not in prediction pkl!') + continue + + viser.visualize_bev(sample_token, os.path.join(args.out_folder, str(i).zfill(3))) + + if args.project_to_cam: + viser.visualize_cam(sample_token, os.path.join(args.out_folder, str(i).zfill(3))) + viser.combine(os.path.join(args.out_folder, str(i).zfill(3))) + + viser.to_video(args.out_folder, args.demo_video, fps=4, downsample=2) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--predroot', default='/mnt/nas20/yihan01.hu/tmp/results.pkl', help='Path to results.pkl') + parser.add_argument('--out_folder', default='/mnt/nas20/yihan01.hu/tmp/viz/demo_test/', help='Output folder path') + parser.add_argument('--demo_video', default='mini_val_final.avi', help='Demo video name') + parser.add_argument('--project_to_cam', default=True, help='Project to cam (default: True)') + args = parser.parse_args() + main(args) diff --git a/adzoo/uniad/analysis_tools/visualize/utils.py b/adzoo/uniad/analysis_tools/visualize/utils.py new file mode 100644 index 0000000..315344e --- /dev/null +++ b/adzoo/uniad/analysis_tools/visualize/utils.py @@ -0,0 +1,131 @@ +import numpy as np +from nuscenes.utils.data_classes import LidarPointCloud, Box +from pyquaternion import Quaternion + + +color_mapping = np.asarray([ + [0, 0, 0], + [255, 179, 0], + [128, 62, 117], + [255, 104, 0], + [166, 189, 215], + [193, 0, 32], + [206, 162, 98], + [129, 112, 102], + [0, 125, 52], + [246, 118, 142], + [0, 83, 138], + [255, 122, 92], + [83, 55, 122], + [255, 142, 0], + [179, 40, 81], + [244, 200, 0], + [127, 24, 13], + [147, 170, 0], + [89, 51, 21], + [241, 58, 19], + [35, 44, 22], + [112, 224, 255], + [70, 184, 160], + [153, 0, 255], + [71, 255, 0], + [255, 0, 163], + [255, 204, 0], + [0, 255, 235], + [255, 0, 235], + [255, 0, 122], + [255, 245, 0], + [10, 190, 212], + [214, 255, 0], + [0, 204, 255], + [20, 0, 255], + [255, 255, 0], + [0, 153, 255], + [0, 255, 204], + [41, 255, 0], + [173, 0, 255], + [0, 245, 255], + [71, 0, 255], + [0, 255, 184], + [0, 92, 255], + [184, 255, 0], + [255, 214, 0], + [25, 194, 194], + [92, 0, 255], + [220, 220, 220], + [255, 9, 92], + [112, 9, 255], + [8, 255, 214], + [255, 184, 6], + [10, 255, 71], + [255, 41, 10], + [7, 255, 255], + [224, 255, 8], + [102, 8, 255], + [255, 61, 6], + [255, 194, 7], + [0, 255, 20], + [255, 8, 41], + [255, 5, 153], + [6, 51, 255], + [235, 12, 255], + [160, 
150, 20], + [0, 163, 255], + [140, 140, 140], + [250, 10, 15], + [20, 255, 0], +])/255 + + +class AgentPredictionData: + """ + Agent data class, includes bbox, traj, and occflow + """ + + def __init__(self, + pred_score, + pred_label, + pred_center, + pred_dim, + pred_yaw, + pred_vel, + pred_traj, + pred_traj_score, + pred_track_id=None, + pred_occ_map=None, + is_sdc=False, + past_pred_traj=None, + command=None, + attn_mask=None, + ): + self.pred_score = pred_score + self.pred_label = pred_label + self.pred_center = pred_center + self.pred_dim = pred_dim + self.pred_yaw = -pred_yaw-np.pi/2 + self.pred_vel = pred_vel + self.pred_traj = pred_traj + self.pred_traj_score = pred_traj_score + self.pred_track_id = pred_track_id + self.pred_occ_map = pred_occ_map + if self.pred_traj is not None: + if isinstance(self.pred_traj_score, int): + self.pred_traj_max = self.pred_traj + else: + self.pred_traj_max = self.pred_traj[self.pred_traj_score.argmax( + )] + else: + self.pred_traj_max = None + self.nusc_box = Box( + center=pred_center, + size=pred_dim, + orientation=Quaternion(axis=[0, 0, 1], radians=self.pred_yaw), + label=pred_label, + score=pred_score + ) + if is_sdc: + self.pred_center = [0, 0, -1.2+1.56/2] + self.is_sdc = is_sdc + self.past_pred_traj = past_pred_traj + self.command = command + self.attn_mask = attn_mask diff --git a/adzoo/uniad/configs/_base_/datasets/nus-3d.py b/adzoo/uniad/configs/_base_/datasets/nus-3d.py new file mode 100644 index 0000000..1548171 --- /dev/null +++ b/adzoo/uniad/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,142 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
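+# For example, a 12-epoch schedule would use: evaluation = dict(interval=12, pipeline=eval_pipeline)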
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/uniad/configs/_base_/default_runtime.py b/adzoo/uniad/configs/_base_/default_runtime.py new file mode 100644 index 0000000..4e85b69 --- /dev/null +++ b/adzoo/uniad/configs/_base_/default_runtime.py @@ -0,0 +1,18 @@ +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = None +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/adzoo/uniad/configs/stage1_track_map/base_track_map.py b/adzoo/uniad/configs/stage1_track_map/base_track_map.py new file mode 100644 index 0000000..cd18640 --- /dev/null +++ b/adzoo/uniad/configs/stage1_track_map/base_track_map.py @@ -0,0 +1,580 @@ +_base_ = ["../_base_/datasets/nus-3d.py", + "../_base_/default_runtime.py"] + +# Update-2023-06-12: +# [Enhance] Update some freezing args of UniAD +# [Bugfix] Reproduce the from-scratch results of stage1 +# 1. Remove loss_past_traj in stage1 training +# 2. Unfreeze neck and BN +# --> Reproduced tracking result: AMOTA 0.393 + + +# Unfreeze neck and BN, the from-scratch results of stage1 could be reproduced +plugin = True +# plugin_dir = "projects/mmdet3d_plugin/" +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +patch_size = [102.4, 102.4] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", +] + +input_modality = dict( + use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True +) +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +_feed_dim_ = _ffn_dim_ +_dim_half_ = _pos_dim_ +canvas_size = (bev_h_, bev_w_) + +# NOTE: You can change queue_length from 5 to 3 to save GPU memory, but at risk of performance drop. +queue_length = 3 # each sequence contains `queue_length` frames. 
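+# (The note above refers to the original default of 5; this config already uses the memory-saving value of 3.)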
+ +### traj prediction args ### +predict_steps = 12 +predict_modes = 6 +fut_steps = 4 +past_steps = 4 +use_nonlinear_optimizer = True + +## occflow setting +occ_n_future = 4 +occ_n_future_plan = 6 +occ_n_future_max = max([occ_n_future, occ_n_future_plan]) + +### planning ### +planning_steps = 6 +use_col_optim = True + +### Occ args ### +occflow_grid_conf = { + 'xbound': [-50.0, 50.0, 0.5], + 'ybound': [-50.0, 50.0, 0.5], + 'zbound': [-10.0, 10.0, 20.0], +} + +# Other settings +train_gt_iou_threshold=0.3 + +model = dict( + type="UniAD", + gt_iou_threshold=train_gt_iou_threshold, + queue_length=queue_length, + use_grid_mask=True, + video_test_mode=True, + num_query=900, + num_classes=10, + pc_range=point_cloud_range, + img_backbone=dict( + type="ResNet", + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=4, + norm_cfg=dict(type="BN2d", requires_grad=False), + norm_eval=True, + style="caffe", + dcn=dict( + type="DCNv2", deform_groups=1, fallback_on_stride=False + ), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True), + ), + img_neck=dict( + type="FPN", + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs="on_output", + num_outs=4, + relu_before_extra_convs=True, + ), + freeze_img_backbone=True, + freeze_img_neck=False, + freeze_bn=False, + score_thresh=0.4, + filter_score_thresh=0.35, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=True, + fp_ratio=0.3, + random_drop=0.1, + ), # hyper-param for query dropping mentioned in MOTR + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + loss_cfg=dict( + type="ClipMatcher", + num_classes=10, + weight_dict=None, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type="HungarianAssigner3DTrack", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + pc_range=point_cloud_range, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_past_traj_weight=0.0, + ), # loss cfg for tracking + pts_bbox_head=dict( + type="BEVFormerTrackHead", + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + past_steps=past_steps, + fut_steps=fut_steps, + transformer=dict( + type="UniADPerceptionTransformer", + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type="BEVFormerEncoder", + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type="BEVFormerLayer", + attn_cfgs=[ + dict( + type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1 + ), + dict( + type="SpatialCrossAttention", + pc_range=point_cloud_range, + deformable_attention=dict( + type="MSDeformableAttention3D", + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_, + ), + embed_dims=_dim_, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + decoder=dict( + type="DetectionTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict( + type="MultiheadAttention", + embed_dims=_dim_, + 
num_heads=8, + dropout=0.1, + ), + dict( + type="CustomMSDeformableAttention", + embed_dims=_dim_, + num_levels=1, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10, + ), + positional_encoding=dict( + type="LearnedPositionalEncoding", + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + seg_head=dict( + type='PansegformerHead', + bev_h=bev_h_, + bev_w=bev_w_, + canvas_size=canvas_size, + pc_range=point_cloud_range, + num_query=300, + num_classes=4, + num_things_classes=3, + num_stuff_classes=1, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + with_box_refine=True, + transformer=dict( + type='SegDeformableTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ), + feedforward_channels=_feed_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ) + ], + feedforward_channels=_feed_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm') + ), + ), + ), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=_dim_half_, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict(type='DiceLoss', loss_weight=2.0), + thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4), + stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + ), + assigner_with_mask=dict( + type='HungarianAssigner_multi_info', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + mask_cost=dict(type='DiceCost', weight=2.0), + ), + sampler =dict(type='PseudoSampler'), + sampler_with_mask =dict(type='PseudoSampler_segformer'), + ), + ), + + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + 
cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) +dataset_type = "NuScenesE2EDataset" +data_root = "data/nuscenes/" +info_root = "data/infos/" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"nuscenes_infos_temporal_train.pkl" +ann_file_val=info_root + f"nuscenes_infos_temporal_val.pkl" +ann_file_test=info_root + f"nuscenes_infos_temporal_val.pkl" + + +train_pipeline = [ + dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root), + dict(type="PhotoMetricDistortionMultiViewImage"), + dict( + type="LoadAnnotations3D_E2E", + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + + with_future_anns=True, # occ_flow gt + with_ins_inds_3d=True, # ins_inds + ins_inds_add_1=True, # ins_inds start from 1 + ), + + dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + filter_invisible=False), # NOTE: Currently vis_token is not in pkl + + dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilterTrack", classes=class_names), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict( + type="CustomCollect3D", + keys=[ + "gt_bboxes_3d", + "gt_labels_3d", + "gt_inds", + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_fut_traj", + "gt_fut_traj_mask", + "gt_past_traj", + "gt_past_traj_mask", + "gt_sdc_bbox", + "gt_sdc_label", + "gt_sdc_fut_traj", + "gt_sdc_fut_traj_mask", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # Occ gt + # "gt_segmentation", + # "gt_instance", + # "gt_centerness", + # "gt_offset", + # "gt_flow", + # "gt_backward_flow", + # "gt_occ_has_invalid_frame", + # "gt_occ_img_is_valid", + # # gt future bbox for plan + # "gt_future_boxes", + # "gt_future_labels", + # # planning + # "sdc_planning", + # "sdc_planning_mask", + # "command", + ], + ), +] +test_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type='LoadAnnotations3D_E2E', + with_bbox_3d=False, + with_label_3d=False, + with_attr_label=False, + + with_future_anns=True, + with_ins_inds_3d=False, + ins_inds_add_1=True, # ins_inds start from 1 + ), + dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + filter_invisible=False), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ] + ), + ], + ), +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=1, + train=dict( + type=dataset_type, + file_client_args=file_client_args, + 
data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + patch_size=patch_size, + canvas_size=canvas_size, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + predict_steps=predict_steps, + past_steps=past_steps, + fut_steps=fut_steps, + use_nonlinear_optimizer=use_nonlinear_optimizer, + occ_receptive_field=3, + occ_n_future=occ_n_future_max, + occ_filter_invalid_sample=False, + + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d="LiDAR", + ), + val=dict( + type=dataset_type, + file_client_args=file_client_args, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + patch_size=patch_size, + canvas_size=canvas_size, + bev_size=(bev_h_, bev_w_), + predict_steps=predict_steps, + past_steps=past_steps, + fut_steps=fut_steps, + use_nonlinear_optimizer=use_nonlinear_optimizer, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + eval_mod=['det', 'track', 'map'], + + occ_receptive_field=3, + occ_n_future=occ_n_future_max, + occ_filter_invalid_sample=False, + ), + test=dict( + type=dataset_type, + file_client_args=file_client_args, + data_root=data_root, + test_mode=True, + ann_file=ann_file_test, + pipeline=test_pipeline, + patch_size=patch_size, + canvas_size=canvas_size, + bev_size=(bev_h_, bev_w_), + predict_steps=predict_steps, + past_steps=past_steps, + fut_steps=fut_steps, + occ_n_future=occ_n_future_max, + use_nonlinear_optimizer=use_nonlinear_optimizer, + classes=class_names, + modality=input_modality, + eval_mod=['det', 'map', 'track'], + ), + shuffler_sampler=dict(type="DistributedGroupSampler"), + nonshuffler_sampler=dict(type="DistributedSampler"), +) +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 6 +evaluation = dict(interval=6, pipeline=test_pipeline) +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +log_config = dict( + interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")] +) +checkpoint_config = dict(interval=1) +load_from = "ckpts/bevformer_r101_dcn_24ep.pth" + +find_unused_parameters = True \ No newline at end of file diff --git a/adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py b/adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py new file mode 100644 index 0000000..2b0308d --- /dev/null +++ b/adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py @@ -0,0 +1,665 @@ +_base_ = ["../_base_/datasets/nus-3d.py", + "../_base_/default_runtime.py"] + +# Update-2023-06-12: +# [Enhance] Update some freezing args of UniAD +# [Bugfix] Reproduce the from-scratch results of stage1 +# 1. Remove loss_past_traj in stage1 training +# 2. 
Unfreeze neck and BN +# --> Reproduced tracking result: AMOTA 0.393 + + +# Unfreeze neck and BN, the from-scratch results of stage1 could be reproduced +plugin = True +# plugin_dir = "projects/mmdet3d_plugin/" +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +patch_size = [102.4, 102.4] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection + +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + + #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + 
"walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + "walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + "static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + + + + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] + +input_modality = dict( + use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True +) +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +_feed_dim_ = _ffn_dim_ +_dim_half_ = _pos_dim_ +canvas_size = (bev_h_, bev_w_) + +# NOTE: You can change queue_length from 5 to 3 to save GPU memory, but at risk of performance drop. +queue_length = 5 # each sequence contains `queue_length` frames. 
+ +### traj prediction args ### +predict_steps = 12 +predict_modes = 6 +fut_steps = 4 +past_steps = 4 +use_nonlinear_optimizer = True + +## occflow setting +occ_n_future = 4 +occ_n_future_plan = 6 +occ_n_future_max = max([occ_n_future, occ_n_future_plan]) + +### planning ### +planning_steps = 6 +use_col_optim = True + +### Occ args ### +occflow_grid_conf = { + 'xbound': [-50.0, 50.0, 0.5], + 'ybound': [-50.0, 50.0, 0.5], + 'zbound': [-10.0, 10.0, 20.0], +} + +# Other settings +train_gt_iou_threshold=0.3 + +model = dict( + type="UniAD", + gt_iou_threshold=train_gt_iou_threshold, + queue_length=queue_length, + use_grid_mask=True, + video_test_mode=True, + num_query=900, + num_classes=len(class_names), + pc_range=point_cloud_range, + img_backbone=dict( + type="ResNet", + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=4, + norm_cfg=dict(type="BN2d", requires_grad=False), + norm_eval=True, + style="caffe", + dcn=dict( + type="DCNv2", deform_groups=1, fallback_on_stride=False + ), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True), + ), + img_neck=dict( + type="FPN", + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs="on_output", + num_outs=4, + relu_before_extra_convs=True, + ), + freeze_img_backbone=True, + freeze_img_neck=False, + freeze_bn=False, + score_thresh=0.4, + filter_score_thresh=0.35, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=True, + fp_ratio=0.3, + random_drop=0.1, + ), # hyper-param for query dropping mentioned in MOTR + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + loss_cfg=dict( + type="ClipMatcher", + num_classes=len(class_names), + weight_dict=None, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type="HungarianAssigner3DTrack", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + pc_range=point_cloud_range, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_past_traj_weight=0.0, + ), # loss cfg for tracking + pts_bbox_head=dict( + type="BEVFormerTrackHead", + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=len(class_names), + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + past_steps=past_steps, + fut_steps=fut_steps, + transformer=dict( + type="UniADPerceptionTransformer", + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type="BEVFormerEncoder", + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type="BEVFormerLayer", + attn_cfgs=[ + dict( + type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1 + ), + dict( + type="SpatialCrossAttention", + pc_range=point_cloud_range, + deformable_attention=dict( + type="MSDeformableAttention3D", + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_, + ), + embed_dims=_dim_, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + decoder=dict( + type="DetectionTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict( + 
type="MultiheadAttention", + embed_dims=_dim_, + num_heads=8, + dropout=0.0, + ), + dict( + type="CustomMSDeformableAttention", + embed_dims=_dim_, + num_levels=1, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=len(class_names), + ), + positional_encoding=dict( + type="LearnedPositionalEncoding", + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + seg_head=dict( + type='PansegformerHead', + bev_h=bev_h_, + bev_w=bev_w_, + canvas_size=canvas_size, + pc_range=point_cloud_range, + num_query=300, + num_classes=6, + num_things_classes=6, + num_stuff_classes=0, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + with_box_refine=True, + transformer=dict( + type='SegDeformableTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ), + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.0), + dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ) + ], + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm') + ), + ), + ), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=_dim_half_, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict(type='DiceLoss', loss_weight=2.0), + thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4), + stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + ), + assigner_with_mask=dict( + type='HungarianAssigner_multi_info', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + mask_cost=dict(type='DiceCost', weight=2.0), + ), + sampler =dict(type='PseudoSampler'), + sampler_with_mask =dict(type='PseudoSampler_segformer'), + ), + ), + + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( 
+ type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) +dataset_type = "B2D_E2E_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +map_root = "data/bench2drive/maps" +map_file = "data/infos/b2d_map_infos.pkl" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + + +train_pipeline = [ + dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root), + dict(type="PhotoMetricDistortionMultiViewImage"), + dict( + type="LoadAnnotations3D_E2E", + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + with_vis_token=False, + with_future_anns=False, # occ_flow gt + with_ins_inds_3d=True, # ins_inds + ins_inds_add_1=True, # ins_inds start from 1 + ), + + # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + # filter_invisible=False), # NOTE: Currently vis_token is not in pkl + + dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilterTrack", classes=class_names), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict( + type="CustomCollect3D", + keys=[ + "gt_bboxes_3d", + "gt_labels_3d", + "gt_inds", + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_fut_traj", + "gt_fut_traj_mask", + "gt_past_traj", + "gt_past_traj_mask", + "gt_sdc_bbox", + "gt_sdc_label", + "gt_sdc_fut_traj", + "gt_sdc_fut_traj_mask", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # Occ gt + # "gt_segmentation", + # "gt_instance", + # "gt_centerness", + # "gt_offset", + # "gt_flow", + # "gt_backward_flow", + # "gt_occ_has_invalid_frame", + # "gt_occ_img_is_valid", + # # gt future bbox for plan + # "gt_future_boxes", + # "gt_future_labels", + # # planning + # "sdc_planning", + # "sdc_planning_mask", + # "command", + ], + ), +] +test_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type='LoadAnnotations3D_E2E', + with_bbox_3d=False, + with_label_3d=False, + with_attr_label=False, + with_vis_token=False, + with_future_anns=False, + with_ins_inds_3d=False, + ins_inds_add_1=True, # ins_inds start from 1 + ), + # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + # filter_invisible=False), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # "gt_segmentation", + # "gt_instance", + # "gt_centerness", + # "gt_offset", + # "gt_flow", + # "gt_backward_flow", + # "gt_occ_has_invalid_frame", + # "gt_occ_img_is_valid", + # # planning + # "sdc_planning", + # "sdc_planning_mask", + # "command", + ] + ), + ], + 
), +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + patch_size=patch_size, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + point_cloud_range=point_cloud_range, + box_type_3d="LiDAR", + ), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + shuffler_sampler=dict(type="DistributedGroupSampler"), + nonshuffler_sampler=dict(type="DistributedSampler"), +) + +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + by_epoch=False, + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 1 +evaluation = dict(interval=1, pipeline=test_pipeline) +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +log_config = dict( + interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")] +) +checkpoint_config = dict(interval=3000, by_epoch=False) + +find_unused_parameters = True \ No newline at end of file diff --git a/adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py b/adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py new file mode 100644 index 0000000..c94ff40 --- /dev/null +++ b/adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py @@ -0,0 +1,656 @@ +_base_ = ["../_base_/datasets/nus-3d.py", + "../_base_/default_runtime.py"] + +# Update-2023-06-12: +# [Enhance] Update some freezing args of UniAD +# [Bugfix] Reproduce the from-scratch results of stage1 +# 1. Remove loss_past_traj in stage1 training +# 2. 
Unfreeze neck and BN +# --> Reproduced tracking result: AMOTA 0.393 + + +# Unfreeze neck and BN, the from-scratch results of stage1 could be reproduced +plugin = True +# plugin_dir = "projects/mmdet3d_plugin/" +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +patch_size = [102.4, 102.4] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection + +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + + #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + 
"walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + "walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + "static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] + +input_modality = dict( + use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True +) +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 100 +bev_w_ = 100 +_feed_dim_ = _ffn_dim_ +_dim_half_ = _pos_dim_ +canvas_size = (bev_h_*2, bev_w_*2) + +# NOTE: You can change queue_length from 5 to 3 to save GPU memory, but at risk of performance drop. +queue_length = 3 # each sequence contains `queue_length` frames. 
+ +### traj prediction args ### +predict_steps = 12 +predict_modes = 6 +fut_steps = 4 +past_steps = 4 +use_nonlinear_optimizer = True + +## occflow setting +occ_n_future = 4 +occ_n_future_plan = 6 +occ_n_future_max = max([occ_n_future, occ_n_future_plan]) + +### planning ### +planning_steps = 6 +use_col_optim = True + +### Occ args ### +occflow_grid_conf = { + 'xbound': [-50.0, 50.0, 0.5], + 'ybound': [-50.0, 50.0, 0.5], + 'zbound': [-10.0, 10.0, 20.0], +} + +# Other settings +train_gt_iou_threshold=0.3 + +model = dict( + type="UniAD", + gt_iou_threshold=train_gt_iou_threshold, + queue_length=queue_length, + use_grid_mask=True, + video_test_mode=True, + num_query=900, + num_classes=len(class_names), + pc_range=point_cloud_range, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1,2,3), + frozen_stages=4, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type="FPN", + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs="on_output", + num_outs=4, + relu_before_extra_convs=True, + ), + freeze_img_backbone=True, + freeze_img_neck=False, + freeze_bn=False, + score_thresh=0.4, + filter_score_thresh=0.35, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=True, + fp_ratio=0.3, + random_drop=0.1, + ), # hyper-param for query dropping mentioned in MOTR + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + loss_cfg=dict( + type="ClipMatcher", + num_classes=len(class_names), + weight_dict=None, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type="HungarianAssigner3DTrack", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + pc_range=point_cloud_range, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_past_traj_weight=0.0, + ), # loss cfg for tracking + pts_bbox_head=dict( + type="BEVFormerTrackHead", + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=len(class_names), + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + past_steps=past_steps, + fut_steps=fut_steps, + transformer=dict( + type="UniADPerceptionTransformer", + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type="BEVFormerEncoder", + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type="BEVFormerLayer", + attn_cfgs=[ + dict( + type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1 + ), + dict( + type="SpatialCrossAttention", + pc_range=point_cloud_range, + deformable_attention=dict( + type="MSDeformableAttention3D", + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_, + ), + embed_dims=_dim_, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + decoder=dict( + type="DetectionTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict( + type="MultiheadAttention", + embed_dims=_dim_, + num_heads=8, + dropout=0.0, + ), + dict( + type="CustomMSDeformableAttention", + embed_dims=_dim_, + num_levels=1, + ), + ], + 
feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=len(class_names), + ), + positional_encoding=dict( + type="LearnedPositionalEncoding", + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + seg_head=dict( + type='PansegformerHead', + bev_h=bev_h_*2, + bev_w=bev_w_*2, + canvas_size=canvas_size, + pc_range=point_cloud_range, + num_query=300, + num_classes=6, + num_things_classes=6, + num_stuff_classes=0, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + with_box_refine=True, + transformer=dict( + type='SegDeformableTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ), + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.0), + dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ) + ], + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm') + ), + ), + ), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=_dim_half_, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict(type='DiceLoss', loss_weight=2.0), + thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4), + stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + ), + assigner_with_mask=dict( + type='HungarianAssigner_multi_info', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + mask_cost=dict(type='DiceCost', weight=2.0), + ), + sampler =dict(type='PseudoSampler'), + sampler_with_mask =dict(type='PseudoSampler_segformer'), + ), + ), + + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", 
weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) +dataset_type = "B2D_E2E_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +map_root = "data/bench2drive/maps" +map_file = "data/infos/b2d_map_infos.pkl" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + + +train_pipeline = [ + dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root), + dict(type="PhotoMetricDistortionMultiViewImage"), + dict( + type="LoadAnnotations3D_E2E", + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + + with_future_anns=False, # occ_flow gt + with_ins_inds_3d=True, # ins_inds + ins_inds_add_1=True, # ins_inds start from 1 + ), + + # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + # filter_invisible=False), # NOTE: Currently vis_token is not in pkl + + dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilterTrack", classes=class_names), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict( + type="CustomCollect3D", + keys=[ + "gt_bboxes_3d", + "gt_labels_3d", + "gt_inds", + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_fut_traj", + "gt_fut_traj_mask", + "gt_past_traj", + "gt_past_traj_mask", + "gt_sdc_bbox", + "gt_sdc_label", + "gt_sdc_fut_traj", + "gt_sdc_fut_traj_mask", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # Occ gt + # "gt_segmentation", + # "gt_instance", + # "gt_centerness", + # "gt_offset", + # "gt_flow", + # "gt_backward_flow", + # "gt_occ_has_invalid_frame", + # "gt_occ_img_is_valid", + # # gt future bbox for plan + # "gt_future_boxes", + # "gt_future_labels", + # # planning + # "sdc_planning", + # "sdc_planning_mask", + # "command", + ], + ), +] +test_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type='LoadAnnotations3D_E2E', + with_bbox_3d=False, + with_label_3d=False, + with_attr_label=False, + + with_future_anns=False, + with_ins_inds_3d=False, + ins_inds_add_1=True, # ins_inds start from 1 + ), + # dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + # filter_invisible=False), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + # "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # "gt_segmentation", + # "gt_instance", + # "gt_centerness", + # "gt_offset", + # "gt_flow", + # "gt_backward_flow", + # "gt_occ_has_invalid_frame", + # "gt_occ_img_is_valid", + # # planning + # "sdc_planning", + # "sdc_planning_mask", + # "command", + ] + ), + ], + ), +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + 
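+        # name_mapping (the NameMapping dict defined above) folds raw CARLA actor and
+        # blueprint names into the Bench2Drive classes listed in `class_names`.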
name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + patch_size=patch_size, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + point_cloud_range=point_cloud_range, + box_type_3d="LiDAR", + ), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + shuffler_sampler=dict(type="DistributedGroupSampler"), + nonshuffler_sampler=dict(type="DistributedSampler"), +) +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + by_epoch=False, + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 1 +evaluation = dict(interval=1, pipeline=test_pipeline) +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +log_config = dict( + interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")] +) +checkpoint_config = dict(interval=3000, by_epoch=False) + +find_unused_parameters = True \ No newline at end of file diff --git a/adzoo/uniad/configs/stage2_e2e/base_e2e.py b/adzoo/uniad/configs/stage2_e2e/base_e2e.py new file mode 100644 index 0000000..86a09fd --- /dev/null +++ b/adzoo/uniad/configs/stage2_e2e/base_e2e.py @@ -0,0 +1,696 @@ +_base_ = ["../_base_/datasets/nus-3d.py", + "../_base_/default_runtime.py"] + +# Update-2023-06-12: +# [Enhance] Update some freezing args of UniAD +plugin = True +# plugin_dir = "projects/mmdet3d_plugin/" +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +patch_size = [102.4, 102.4] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", +] +vehicle_id_list = [0, 1, 2, 3, 4, 6, 7] +group_id_list = [[0,1,2,3,4], [6,7], [8], [5,9]] +input_modality = dict( + use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True +) +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +_feed_dim_ = _ffn_dim_ +_dim_half_ = _pos_dim_ +canvas_size = (bev_h_, bev_w_) +queue_length = 3 # each 
sequence contains `queue_length` frames. + +### traj prediction args ### +predict_steps = 12 +predict_modes = 6 +fut_steps = 4 +past_steps = 4 +use_nonlinear_optimizer = True + +## occflow setting +occ_n_future = 4 +occ_n_future_plan = 6 +occ_n_future_max = max([occ_n_future, occ_n_future_plan]) + +### planning ### +planning_steps = 6 +use_col_optim = True + +### Occ args ### +occflow_grid_conf = { + 'xbound': [-50.0, 50.0, 0.5], + 'ybound': [-50.0, 50.0, 0.5], + 'zbound': [-10.0, 10.0, 20.0], +} + +# Other settings +train_gt_iou_threshold=0.3 + +model = dict( + type="UniAD", + gt_iou_threshold=train_gt_iou_threshold, + queue_length=queue_length, + use_grid_mask=True, + video_test_mode=True, + num_query=900, + num_classes=10, + vehicle_id_list=vehicle_id_list, + pc_range=point_cloud_range, + img_backbone=dict( + type="ResNet", + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=4, + norm_cfg=dict(type="BN2d", requires_grad=False), + norm_eval=True, + style="caffe", + dcn=dict( + type="DCNv2", deform_groups=1, fallback_on_stride=False + ), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True), + ), + img_neck=dict( + type="FPN", + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs="on_output", + num_outs=4, + relu_before_extra_convs=True, + ), + freeze_img_backbone=True, + freeze_img_neck=True, + freeze_bn=True, + freeze_bev_encoder=True, + score_thresh=0.4, + filter_score_thresh=0.35, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=True, + fp_ratio=0.3, + random_drop=0.1, + ), # hyper-param for query dropping mentioned in MOTR + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + loss_cfg=dict( + type="ClipMatcher", + num_classes=10, + weight_dict=None, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type="HungarianAssigner3DTrack", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + pc_range=point_cloud_range, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + ), # loss cfg for tracking + pts_bbox_head=dict( + type="BEVFormerTrackHead", + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=10, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + past_steps=past_steps, + fut_steps=fut_steps, + transformer=dict( + type="UniADPerceptionTransformer", + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type="BEVFormerEncoder", + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type="BEVFormerLayer", + attn_cfgs=[ + dict( + type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1 + ), + dict( + type="SpatialCrossAttention", + pc_range=point_cloud_range, + deformable_attention=dict( + type="MSDeformableAttention3D", + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_, + ), + embed_dims=_dim_, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + decoder=dict( + type="DetectionTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + 
attn_cfgs=[ + dict( + type="MultiheadAttention", + embed_dims=_dim_, + num_heads=8, + dropout=0.1, + ), + dict( + type="CustomMSDeformableAttention", + embed_dims=_dim_, + num_levels=1, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10, + ), + positional_encoding=dict( + type="LearnedPositionalEncoding", + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + seg_head=dict( + type='PansegformerHead', + bev_h=bev_h_, + bev_w=bev_w_, + canvas_size=canvas_size, + pc_range=point_cloud_range, + num_query=300, + num_classes=4, + num_things_classes=3, + num_stuff_classes=1, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + with_box_refine=True, + transformer=dict( + type='SegDeformableTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ), + feedforward_channels=_feed_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ) + ], + feedforward_channels=_feed_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm') + ), + ), + ), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=_dim_half_, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict(type='DiceLoss', loss_weight=2.0), + thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4), + stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + ), + assigner_with_mask=dict( + type='HungarianAssigner_multi_info', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + mask_cost=dict(type='DiceCost', weight=2.0), + ), + sampler =dict(type='PseudoSampler'), + sampler_with_mask =dict(type='PseudoSampler_segformer'), + ), + ), + occ_head=dict( + type='OccHead', + + grid_conf=occflow_grid_conf, + ignore_index=255, + + bev_proj_dim=256, + bev_proj_nlayers=4, + + # Transformer + attn_mask_thresh=0.3, + 
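+        # attn_mask_thresh: threshold used to binarize predicted occupancy masks into
+        # attention masks for the next decoder layer (Mask2Former-style masked attention,
+        # as suggested by the DETR-style transformer decoder configured below).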
transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=5, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, # change to 512 + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + # Query + query_dim=256, + query_mlp_layers=3, + + aux_loss_weight=1., + loss_mask=dict( + type='FieryBinarySegmentationLoss', + use_top_k=True, + top_k_ratio=0.25, + future_discount=0.95, + loss_weight=5.0, + ignore_index=255, + ), + loss_dice=dict( + type='DiceLossWithMasks', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + ignore_index=255, + loss_weight=1.0), + + + pan_eval=True, + test_seg_thresh=0.1, + test_with_track_score=True, + ), + motion_head=dict( + type='MotionHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=10, + predict_steps=predict_steps, + predict_modes=predict_modes, + embed_dims=_dim_, + loss_traj=dict(type='TrajLoss', + use_variance=True, + cls_loss_weight=0.5, + nll_loss_weight=0.5, + loss_weight_minade=0., + loss_weight_minfde=0.25), + num_cls_fcs=3, + pc_range=point_cloud_range, + group_id_list=group_id_list, + num_anchor=6, + use_nonlinear_optimizer=use_nonlinear_optimizer, + anchor_info_path='data/others/motion_anchor_infos_mode6.pkl', + transformerlayers=dict( + type='MotionTransformerDecoder', + pc_range=point_cloud_range, + embed_dims=_dim_, + num_layers=3, + transformerlayers=dict( + type='MotionTransformerAttentionLayer', + batch_first=True, + attn_cfgs=[ + dict( + type='MotionDeformableAttention', + num_steps=predict_steps, + embed_dims=_dim_, + num_levels=1, + num_heads=8, + num_points=4, + sample_index=-1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm')), + ), + ), + planning_head=dict( + type='PlanningHeadSingleMode', + embed_dims=256, + planning_steps=planning_steps, + loss_planning=dict(type='PlanningLoss'), + loss_collision=[dict(type='CollisionLoss', delta=0.0, weight=2.5), + dict(type='CollisionLoss', delta=0.5, weight=1.0), + dict(type='CollisionLoss', delta=1.0, weight=0.25)], + use_col_optim=use_col_optim, + planning_eval=True, + with_adapter=True, + ), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range, + ), + ) + ), +) +dataset_type = "NuScenesE2EDataset" +data_root = "data/nuscenes/" +info_root = "data/infos/" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"nuscenes_infos_temporal_train.pkl" +ann_file_val=info_root + f"nuscenes_infos_temporal_val.pkl" +ann_file_test=info_root + f"nuscenes_infos_temporal_val.pkl" + + +train_pipeline = [ + dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root), + dict(type="PhotoMetricDistortionMultiViewImage"), + dict( + type="LoadAnnotations3D_E2E", + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + + with_future_anns=True, # occ_flow gt + with_ins_inds_3d=True, # ins_inds + ins_inds_add_1=True, # ins_inds start from 1 + ), + + dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + filter_invisible=False), # NOTE: Currently vis_token is not in pkl + + dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilterTrack", classes=class_names), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict( + type="CustomCollect3D", + keys=[ + "gt_bboxes_3d", + "gt_labels_3d", + "gt_inds", + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_fut_traj", + "gt_fut_traj_mask", + "gt_past_traj", + "gt_past_traj_mask", + "gt_sdc_bbox", + "gt_sdc_label", + "gt_sdc_fut_traj", + "gt_sdc_fut_traj_mask", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # Occ gt + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # gt future bbox for plan + "gt_future_boxes", + "gt_future_labels", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ], + ), +] +test_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type='LoadAnnotations3D_E2E', + with_bbox_3d=False, + with_label_3d=False, + with_attr_label=False, + + with_future_anns=True, + with_ins_inds_3d=False, + ins_inds_add_1=True, # ins_inds start from 1 + ), + dict(type='GenerateOccFlowLabels', grid_conf=occflow_grid_conf, ignore_index=255, only_vehicle=True, + filter_invisible=False), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ] + ), + ], + ), +] +data = dict( + samples_per_gpu=1, + workers_per_gpu=8, + train=dict( + type=dataset_type, + file_client_args=file_client_args, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + patch_size=patch_size, + canvas_size=canvas_size, + bev_size=(bev_h_, bev_w_), + 
queue_length=queue_length, + predict_steps=predict_steps, + past_steps=past_steps, + fut_steps=fut_steps, + use_nonlinear_optimizer=use_nonlinear_optimizer, + + occ_receptive_field=3, + occ_n_future=occ_n_future_max, + occ_filter_invalid_sample=False, + + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d="LiDAR", + ), + val=dict( + type=dataset_type, + file_client_args=file_client_args, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + patch_size=patch_size, + canvas_size=canvas_size, + bev_size=(bev_h_, bev_w_), + predict_steps=predict_steps, + past_steps=past_steps, + fut_steps=fut_steps, + use_nonlinear_optimizer=use_nonlinear_optimizer, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + eval_mod=['det', 'map', 'track','motion'], + + + occ_receptive_field=3, + occ_n_future=occ_n_future_max, + occ_filter_invalid_sample=False, + ), + test=dict( + type=dataset_type, + file_client_args=file_client_args, + data_root=data_root, + test_mode=True, + ann_file=ann_file_test, + pipeline=test_pipeline, + patch_size=patch_size, + canvas_size=canvas_size, + bev_size=(bev_h_, bev_w_), + predict_steps=predict_steps, + past_steps=past_steps, + fut_steps=fut_steps, + occ_n_future=occ_n_future_max, + use_nonlinear_optimizer=use_nonlinear_optimizer, + classes=class_names, + modality=input_modality, + eval_mod=['det', 'map', 'track','motion'], + ), + shuffler_sampler=dict(type="DistributedGroupSampler"), + nonshuffler_sampler=dict(type="DistributedSampler"), +) +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 2 +evaluation = dict(interval=1, pipeline=test_pipeline) +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +log_config = dict( + interval=10, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")] +) +checkpoint_config = dict(interval=1) +load_from = "ckpts/uniad_base_track_map.pth" + +find_unused_parameters = True \ No newline at end of file diff --git a/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py b/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py new file mode 100644 index 0000000..a0e156c --- /dev/null +++ b/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py @@ -0,0 +1,819 @@ +_base_ = ["../_base_/datasets/nus-3d.py", + "../_base_/default_runtime.py"] + +# Update-2023-06-12: +# [Enhance] Update some freezing args of UniAD +# plugin_dir = "projects/mmdet3d_plugin/" +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +patch_size = [102.4, 102.4] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + 
"vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + + #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + "walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + "walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + "static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 
'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] + +vehicle_id_list = [0,1,2] +group_id_list = [[0, 1, 2], [3], [7]] + +input_modality = dict( + use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True +) +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +_feed_dim_ = _ffn_dim_ +_dim_half_ = _pos_dim_ +canvas_size = (bev_h_, bev_w_) +queue_length = 3 # each sequence contains `queue_length` frames. + +### traj prediction args ### +predict_steps = 12 +predict_modes = 6 +fut_steps = 4 +past_steps = 4 +use_nonlinear_optimizer = True + +## occflow setting +occ_n_future = 4 +occ_n_future_plan = 6 +occ_n_future_max = max([occ_n_future, occ_n_future_plan]) + +### planning ### +planning_steps = 6 +use_col_optim = True + +### Occ args ### +occflow_grid_conf = { + 'xbound': [-50.0, 50.0, 0.5], + 'ybound': [-50.0, 50.0, 0.5], + 'zbound': [-10.0, 10.0, 20.0], +} + +# Other settings +train_gt_iou_threshold=0.3 + +model = dict( + type="UniAD", + gt_iou_threshold=train_gt_iou_threshold, + queue_length=queue_length, + use_grid_mask=True, + video_test_mode=True, + num_query=900, + num_classes=len(class_names), + vehicle_id_list=vehicle_id_list, + pc_range=point_cloud_range, + img_backbone=dict( + type="ResNet", + depth=101, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=4, + norm_cfg=dict(type="BN2d", requires_grad=False), + norm_eval=True, + style="caffe", + dcn=dict( + type="DCNv2", deform_groups=1, fallback_on_stride=False + ), # original DCNv2 will print log when perform load_state_dict + stage_with_dcn=(False, False, True, True), + ), + img_neck=dict( + type="FPN", + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs="on_output", + num_outs=4, + relu_before_extra_convs=True, + ), + freeze_img_backbone=True, + freeze_img_neck=True, + freeze_bn=True, + freeze_bev_encoder=True, + score_thresh=0.4, + filter_score_thresh=0.35, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=True, + fp_ratio=0.3, + random_drop=0.1, + ), # hyper-param for query dropping mentioned in MOTR + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + loss_cfg=dict( + type="ClipMatcher", + num_classes=len(class_names), + weight_dict=None, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type="HungarianAssigner3DTrack", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + pc_range=point_cloud_range, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + ), # loss cfg for tracking + pts_bbox_head=dict( + type="BEVFormerTrackHead", + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=len(class_names), + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + past_steps=past_steps, + fut_steps=fut_steps, + transformer=dict( + type="UniADPerceptionTransformer", + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type="BEVFormerEncoder", + num_layers=6, + pc_range=point_cloud_range, + 
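+                # num_points_in_pillar: number of reference points sampled along the height
+                # of each BEV pillar for spatial cross-attention (BEVFormer-style 2D-to-BEV lifting).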
num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type="BEVFormerLayer", + attn_cfgs=[ + dict( + type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1 + ), + dict( + type="SpatialCrossAttention", + pc_range=point_cloud_range, + deformable_attention=dict( + type="MSDeformableAttention3D", + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_, + ), + embed_dims=_dim_, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + decoder=dict( + type="DetectionTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict( + type="MultiheadAttention", + embed_dims=_dim_, + num_heads=8, + dropout=0.0, + ), + dict( + type="CustomMSDeformableAttention", + embed_dims=_dim_, + num_levels=1, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=len(class_names), + ), + positional_encoding=dict( + type="LearnedPositionalEncoding", + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + seg_head=dict( + type='PansegformerHead', + bev_h=bev_h_, + bev_w=bev_w_, + canvas_size=canvas_size, + pc_range=point_cloud_range, + num_query=300, + num_classes=6, + num_things_classes=6, + num_stuff_classes=0, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + with_box_refine=True, + transformer=dict( + type='SegDeformableTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ), + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.0), + dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ) + ], + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm') + ), + ), + ), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=_dim_half_, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict(type='DiceLoss', loss_weight=2.0), + thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4), + stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + 
cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + ), + assigner_with_mask=dict( + type='HungarianAssigner_multi_info', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + mask_cost=dict(type='DiceCost', weight=2.0), + ), + sampler =dict(type='PseudoSampler'), + sampler_with_mask =dict(type='PseudoSampler_segformer'), + ), + ), + occ_head=dict( + type='OccHead', + + grid_conf=occflow_grid_conf, + ignore_index=255, + + bev_proj_dim=256, + bev_proj_nlayers=4, + + # Transformer + attn_mask_thresh=0.3, + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=5, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, # change to 512 + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + # Query + query_dim=256, + query_mlp_layers=3, + + aux_loss_weight=1., + loss_mask=dict( + type='FieryBinarySegmentationLoss', + use_top_k=True, + top_k_ratio=0.25, + future_discount=0.95, + loss_weight=5.0, + ignore_index=255, + ), + loss_dice=dict( + type='DiceLossWithMasks', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + ignore_index=255, + loss_weight=1.0), + + + pan_eval=True, + test_seg_thresh=0.1, + test_with_track_score=True, + ), + motion_head=dict( + type='MotionHead', + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=len(class_names), + vehicle_id_list=vehicle_id_list, + predict_steps=predict_steps, + predict_modes=predict_modes, + embed_dims=_dim_, + loss_traj=dict(type='TrajLoss', + use_variance=True, + cls_loss_weight=0.5, + nll_loss_weight=0.5, + loss_weight_minade=0., + loss_weight_minfde=0.25), + num_cls_fcs=3, + pc_range=point_cloud_range, + group_id_list=group_id_list, + num_anchor=6, + use_nonlinear_optimizer=use_nonlinear_optimizer, + anchor_info_path='data/others/b2d_motion_anchor_infos_mode6.pkl', + transformerlayers=dict( + type='MotionTransformerDecoder', + pc_range=point_cloud_range, + embed_dims=_dim_, + num_layers=3, + transformerlayers=dict( + type='MotionTransformerAttentionLayer', + batch_first=True, + attn_cfgs=[ + dict( + type='MotionDeformableAttention', + num_steps=predict_steps, + embed_dims=_dim_, + num_levels=1, + num_heads=8, + num_points=4, + sample_index=-1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=('cross_attn', 'norm', 'ffn', 'norm')), + ), + ), + planning_head=dict( + type='PlanningHeadSingleMode', + embed_dims=256, + command_dim=6, + planning_steps=planning_steps, + loss_planning=dict(type='PlanningLoss'), + loss_collision=[dict(type='CollisionLoss', delta=0.0, weight=2.5), + dict(type='CollisionLoss', delta=0.5, weight=1.0), + dict(type='CollisionLoss', delta=1.0, weight=0.25)], + use_col_optim=use_col_optim, + planning_eval=True, + with_adapter=True, + ), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + 
voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) +dataset_type = "B2D_E2E_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +map_root = "data/bench2drive/maps" +map_file = "data/infos/b2d_map_infos.pkl" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + + +train_pipeline = [ + dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root), + dict(type="PhotoMetricDistortionMultiViewImage"), + dict( + type="LoadAnnotations3D_E2E", + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + with_vis_token=False, + with_future_anns=True, # occ_flow gt + with_ins_inds_3d=True, # ins_inds + ins_inds_add_1=True, # ins_inds start from 1 + ), + + dict(type='GenerateOccFlowLabels', + grid_conf=occflow_grid_conf, + ignore_index=255, + only_vehicle=True, + filter_invisible=False, + all_classes = class_names, + vehicle_classes = ['car','van','truck','bicycle'], + plan_classes = ['car','van','truck','bicycle','pedestrian'], + ), + + dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilterTrack", classes=class_names), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict( + type="CustomCollect3D", + keys=[ + "gt_bboxes_3d", + "gt_labels_3d", + "gt_inds", + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_fut_traj", + "gt_fut_traj_mask", + "gt_past_traj", + "gt_past_traj_mask", + "gt_sdc_bbox", + "gt_sdc_label", + "gt_sdc_fut_traj", + "gt_sdc_fut_traj_mask", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # Occ gt + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # gt future bbox for plan + "gt_future_boxes", + "gt_future_labels", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ], + ), +] +test_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type='LoadAnnotations3D_E2E', + with_bbox_3d=False, + with_label_3d=False, + with_attr_label=False, + with_vis_token=False, + with_future_anns=True, + with_ins_inds_3d=False, + ins_inds_add_1=True, # ins_inds start from 1 + ), + dict(type='GenerateOccFlowLabels', + grid_conf=occflow_grid_conf, + ignore_index=255, + only_vehicle=True, + filter_invisible=False, + all_classes = class_names, + vehicle_classes = ['car','van','truck','bicycle'], + plan_classes = ['car','van','truck','bicycle','pedestrian'], + ), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + 
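+                    # l2g_r_mat / l2g_t: LiDAR-to-global rotation matrix and translation,
+                    # used to relate the ego (LiDAR) frame to the global frame across timestamps.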
"gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ] + ), + ], + ), +] + +inference_only_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "command", + ] + ), + ], + ), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + patch_size=patch_size, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + point_cloud_range=point_cloud_range, + box_type_3d="LiDAR", + ), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + shuffler_sampler=dict(type="DistributedGroupSampler"), + nonshuffler_sampler=dict(type="DistributedSampler"), +) +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + by_epoch=False, + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 2 +evaluation = dict(interval=2, pipeline=test_pipeline) +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +log_config = dict( + interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")] +) +checkpoint_config = dict(interval=3000, by_epoch=False) + + +find_unused_parameters = True \ No newline at end of file diff --git a/adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py b/adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py new file mode 100644 index 0000000..b6c7a2c --- /dev/null +++ b/adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py @@ -0,0 +1,813 @@ +_base_ = ["../_base_/datasets/nus-3d.py", + 
"../_base_/default_runtime.py"] + +# Update-2023-06-12: +# [Enhance] Update some freezing args of UniAD +# plugin_dir = "projects/mmdet3d_plugin/" +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +patch_size = [102.4, 102.4] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + + #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + "walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + 
"walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + "static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] + +vehicle_id_list = [0,1,2] +group_id_list = [[0, 1, 2], [3], [7]] + +input_modality = dict( + use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True +) +_dim_ = 256 +_pos_dim_ = _dim_ // 2 +_ffn_dim_ = _dim_ * 2 +_num_levels_ = 4 +bev_h_ = 100 +bev_w_ = 100 +_feed_dim_ = _ffn_dim_ +_dim_half_ = _pos_dim_ +canvas_size = (bev_h_*2, bev_w_*2) +queue_length = 3 # each sequence contains `queue_length` frames. + +### traj prediction args ### +predict_steps = 12 +predict_modes = 6 +fut_steps = 4 +past_steps = 4 +use_nonlinear_optimizer = True + +## occflow setting +occ_n_future = 4 +occ_n_future_plan = 6 +occ_n_future_max = max([occ_n_future, occ_n_future_plan]) + +### planning ### +planning_steps = 6 +use_col_optim = True + +### Occ args ### +occflow_grid_conf = { + 'xbound': [-50.0, 50.0, 0.5], + 'ybound': [-50.0, 50.0, 0.5], + 'zbound': [-10.0, 10.0, 20.0], +} + +# Other settings +train_gt_iou_threshold=0.3 + +model = dict( + type="UniAD", + gt_iou_threshold=train_gt_iou_threshold, + queue_length=queue_length, + use_grid_mask=True, + video_test_mode=True, + prev_frame_num=10, + num_query=900, + num_classes=len(class_names), + vehicle_id_list=vehicle_id_list, + pc_range=point_cloud_range, + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1,2,3), + frozen_stages=4, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type="FPN", + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs="on_output", + num_outs=4, + relu_before_extra_convs=True, + ), + freeze_img_backbone=True, + freeze_img_neck=True, + freeze_bn=True, + freeze_bev_encoder=True, + score_thresh=0.4, + filter_score_thresh=0.35, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=True, + fp_ratio=0.3, + random_drop=0.1, + ), # hyper-param for query dropping mentioned in MOTR + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + loss_cfg=dict( + type="ClipMatcher", + num_classes=len(class_names), + weight_dict=None, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type="HungarianAssigner3DTrack", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), 
+ pc_range=point_cloud_range, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + ), # loss cfg for tracking + pts_bbox_head=dict( + type="BEVFormerTrackHead", + bev_h=bev_h_, + bev_w=bev_w_, + num_query=900, + num_classes=len(class_names), + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + past_steps=past_steps, + fut_steps=fut_steps, + transformer=dict( + type="UniADPerceptionTransformer", + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type="BEVFormerEncoder", + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type="BEVFormerLayer", + attn_cfgs=[ + dict( + type="TemporalSelfAttention", embed_dims=_dim_, num_levels=1 + ), + dict( + type="SpatialCrossAttention", + pc_range=point_cloud_range, + deformable_attention=dict( + type="MSDeformableAttention3D", + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_, + ), + embed_dims=_dim_, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + decoder=dict( + type="DetectionTransformerDecoder", + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type="DetrTransformerDecoderLayer", + attn_cfgs=[ + dict( + type="MultiheadAttention", + embed_dims=_dim_, + num_heads=8, + dropout=0.0, + ), + dict( + type="CustomMSDeformableAttention", + embed_dims=_dim_, + num_levels=1, + ), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=( + "self_attn", + "norm", + "cross_attn", + "norm", + "ffn", + "norm", + ), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=len(class_names), + ), + positional_encoding=dict( + type="LearnedPositionalEncoding", + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0 + ), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + seg_head=dict( + type='PansegformerHead', + bev_h=bev_h_*2, + bev_w=bev_w_*2, + canvas_size=canvas_size, + pc_range=point_cloud_range, + num_query=300, + num_classes=6, + num_things_classes=6, + num_stuff_classes=0, + in_channels=2048, + sync_cls_avg_factor=True, + as_two_stage=False, + with_box_refine=True, + transformer=dict( + type='SegDeformableTransformer', + encoder=dict( + type='DetrTransformerEncoder', + num_layers=6, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ), + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 'norm', 'ffn', 'norm'))), + decoder=dict( + type='DeformableDetrTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.0), + dict( + type='MultiScaleDeformableAttention', + embed_dims=_dim_, + num_levels=_num_levels_, + ) + ], + feedforward_channels=_feed_dim_, + ffn_dropout=0.0, + operation_order=('self_attn', 
'norm', 'cross_attn', 'norm', + 'ffn', 'norm') + ), + ), + ), + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=_dim_half_, + normalize=True, + offset=-0.5), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_mask=dict(type='DiceLoss', loss_weight=2.0), + thing_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=4), + stuff_transformer_head=dict(type='SegMaskHead',d_model=_dim_,nhead=8,num_decoder_layers=6,self_attn=True), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + ), + assigner_with_mask=dict( + type='HungarianAssigner_multi_info', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + mask_cost=dict(type='DiceCost', weight=2.0), + ), + sampler =dict(type='PseudoSampler'), + sampler_with_mask =dict(type='PseudoSampler_segformer'), + ), + ), + occ_head=dict( + type='OccHead', + + grid_conf=occflow_grid_conf, + ignore_index=255, + + bev_proj_dim=256, + bev_proj_nlayers=4, + + # Transformer + attn_mask_thresh=0.3, + transformer_decoder=dict( + type='DetrTransformerDecoder', + return_intermediate=True, + num_layers=5, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=False), + ffn_cfgs=dict( + embed_dims=256, + feedforward_channels=2048, # change to 512 + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True), + feedforward_channels=2048, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')), + init_cfg=None), + # Query + query_dim=256, + query_mlp_layers=3, + + aux_loss_weight=1., + loss_mask=dict( + type='FieryBinarySegmentationLoss', + use_top_k=True, + top_k_ratio=0.25, + future_discount=0.95, + loss_weight=5.0, + ignore_index=255, + ), + loss_dice=dict( + type='DiceLossWithMasks', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + ignore_index=255, + loss_weight=1.0), + + + pan_eval=True, + test_seg_thresh=0.1, + test_with_track_score=True, + ), + motion_head=dict( + type='MotionHead', + bev_h=bev_h_*2, + bev_w=bev_w_*2, + num_query=300, + num_classes=len(class_names), + predict_steps=predict_steps, + predict_modes=predict_modes, + embed_dims=_dim_, + loss_traj=dict(type='TrajLoss', + use_variance=True, + cls_loss_weight=0.5, + nll_loss_weight=0.5, + loss_weight_minade=0., + loss_weight_minfde=0.25), + num_cls_fcs=3, + pc_range=point_cloud_range, + group_id_list=group_id_list, + num_anchor=6, + use_nonlinear_optimizer=use_nonlinear_optimizer, + anchor_info_path='data/others/b2d_motion_anchor_infos_mode6.pkl', + transformerlayers=dict( + type='MotionTransformerDecoder', + pc_range=point_cloud_range, + embed_dims=_dim_, + num_layers=3, + transformerlayers=dict( + type='MotionTransformerAttentionLayer', + batch_first=True, + attn_cfgs=[ + dict( + type='MotionDeformableAttention', + num_steps=predict_steps, + embed_dims=_dim_, + num_levels=1, + num_heads=8, + num_points=4, + 
sample_index=-1), + ], + + feedforward_channels=_ffn_dim_, + ffn_dropout=0.0, + operation_order=('cross_attn', 'norm', 'ffn', 'norm')), + ), + ), + planning_head=dict( + type='PlanningHeadSingleMode', + embed_dims=256, + command_dim=6, + planning_steps=planning_steps, + loss_planning=dict(type='PlanningLoss'), + loss_collision=[dict(type='CollisionLoss', delta=0.0, weight=2.5), + dict(type='CollisionLoss', delta=0.5, weight=1.0), + dict(type='CollisionLoss', delta=1.0, weight=0.25)], + use_col_optim=use_col_optim, + planning_eval=True, + with_adapter=True, + ), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) +dataset_type = "B2D_E2E_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +map_root = "data/bench2drive/maps" +map_file = "data/infos/b2d_map_infos.pkl" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + +train_pipeline = [ + dict(type="LoadMultiViewImageFromFilesInCeph", to_float32=True, file_client_args=file_client_args, img_root=data_root), + dict(type="PhotoMetricDistortionMultiViewImage"), + dict( + type="LoadAnnotations3D_E2E", + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + with_vis_token=False, + with_future_anns=True, # occ_flow gt + with_ins_inds_3d=True, # ins_inds + ins_inds_add_1=True, # ins_inds start from 1 + ), + + dict(type='GenerateOccFlowLabels', + grid_conf=occflow_grid_conf, + ignore_index=255, + only_vehicle=True, + filter_invisible=False, + all_classes = class_names, + vehicle_classes = ['car','van','truck','bicycle'], + plan_classes = ['car','van','truck','bicycle','pedestrian'], + ), + + dict(type="ObjectRangeFilterTrack", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilterTrack", classes=class_names), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict( + type="CustomCollect3D", + keys=[ + "gt_bboxes_3d", + "gt_labels_3d", + "gt_inds", + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_fut_traj", + "gt_fut_traj_mask", + "gt_past_traj", + "gt_past_traj_mask", + "gt_sdc_bbox", + "gt_sdc_label", + "gt_sdc_fut_traj", + "gt_sdc_fut_traj_mask", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + # Occ gt + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # gt future bbox for plan + "gt_future_boxes", + "gt_future_labels", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ], + ), +] +test_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type='LoadAnnotations3D_E2E', + with_bbox_3d=False, + with_label_3d=False, + with_attr_label=False, + with_vis_token=False, + 
with_future_anns=True, + with_ins_inds_3d=False, + ins_inds_add_1=True, # ins_inds start from 1 + ), + dict(type='GenerateOccFlowLabels', + grid_conf=occflow_grid_conf, + ignore_index=255, + only_vehicle=True, + filter_invisible=False, + all_classes = class_names, + vehicle_classes = ['car','van','truck','bicycle'], + plan_classes = ['car','van','truck','bicycle','pedestrian'], + ), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "gt_lane_labels", + "gt_lane_bboxes", + "gt_lane_masks", + "gt_segmentation", + "gt_instance", + "gt_centerness", + "gt_offset", + "gt_flow", + "gt_backward_flow", + "gt_occ_has_invalid_frame", + "gt_occ_img_is_valid", + # planning + "sdc_planning", + "sdc_planning_mask", + "command", + ] + ), + ], + ), +] + +inference_only_pipeline = [ + dict(type='LoadMultiViewImageFromFilesInCeph', to_float32=True, + file_client_args=file_client_args, img_root=data_root), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type="DefaultFormatBundle3D", class_names=class_names, with_label=False + ), + dict( + type="CustomCollect3D", keys=[ + "img", + "timestamp", + "l2g_r_mat", + "l2g_t", + "command", + ] + ), + ], + ), +] + + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + patch_size=patch_size, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + point_cloud_range=point_cloud_range, + box_type_3d="LiDAR", + ), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + bev_size=(bev_h_, bev_w_), + predict_frames=predict_steps, + past_frames=past_steps, + future_frames=fut_steps, + classes=class_names, + modality=input_modality, + samples_per_gpu=1, + point_cloud_range=point_cloud_range, + eval_cfg=eval_cfg, + #eval_mod=['det', 'track', 'map'], + box_type_3d="LiDAR", + ), + shuffler_sampler=dict(type="DistributedGroupSampler"), + nonshuffler_sampler=dict(type="DistributedSampler"), +) +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + by_epoch=False, + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 
/ 3, + min_lr_ratio=1e-3, +) +total_epochs = 1 +evaluation = dict(interval=1, pipeline=test_pipeline) +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +log_config = dict( + interval=1, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")] +) +checkpoint_config = dict(interval=3000, by_epoch=False) + +find_unused_parameters = True \ No newline at end of file diff --git a/adzoo/uniad/data_converter/create_data.py b/adzoo/uniad/data_converter/create_data.py new file mode 100755 index 0000000..0adb360 --- /dev/null +++ b/adzoo/uniad/data_converter/create_data.py @@ -0,0 +1,109 @@ +import argparse +from os import path as osp +import sys +from data_converter import uniad_nuscenes_converter as nuscenes_converter + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. Default: 10 + """ + nuscenes_converter.create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + if version == 'v1.0-test': + info_test_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_test.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_test_path, version=version) + else: + info_train_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_train.pkl') + info_val_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_val.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_train_path, version=version) + nuscenes_converter.export_2d_annotation( + root_path, info_val_path, version=version) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required=False, + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + 
version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) \ No newline at end of file diff --git a/adzoo/uniad/data_converter/uniad_create_data.sh b/adzoo/uniad/data_converter/uniad_create_data.sh new file mode 100755 index 0000000..b9ac04d --- /dev/null +++ b/adzoo/uniad/data_converter/uniad_create_data.sh @@ -0,0 +1,7 @@ + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python tools/create_data.py nuscenes --root-path ./data/nuscenes \ + --out-dir ./data/infos \ + --extra-tag nuscenes \ + --version v1.0 \ + --canbus ./data/nuscenes \ \ No newline at end of file diff --git a/adzoo/uniad/data_converter/uniad_nuscenes_converter.py b/adzoo/uniad/data_converter/uniad_nuscenes_converter.py new file mode 100644 index 0000000..4ff6ef8 --- /dev/null +++ b/adzoo/uniad/data_converter/uniad_nuscenes_converter.py @@ -0,0 +1,723 @@ +import numpy as np +import os +from collections import OrderedDict +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.geometry_utils import view_points +from nuscenes.prediction import PredictHelper +from os import path as osp +from pyquaternion import Quaternion +from shapely.geometry import MultiPoint, box +from typing import List, Tuple, Union + +from mmcv.core.bbox.box_np_ops import points_cam2img +from mmcv.datasets import NuScenesDataset +from mmcv.fileio.io import load, dump +from mmcv.utils import is_filepath, track_iter_progress, check_file_exist +from mmcv.image import imread + +nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +nus_attributes = ('cycle.with_rider', 'cycle.without_rider', + 'pedestrian.moving', 'pedestrian.standing', + 'pedestrian.sitting_lying_down', 'vehicle.moving', + 'vehicle.parked', 'vehicle.stopped', 'None') + + +def create_nuscenes_infos(root_path, + out_path, + can_bus_root_path, + info_prefix, + version='v1.0-trainval', + max_sweeps=10): + """Create info file of nuscene dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.0-trainval' + max_sweeps (int): Max number of sweeps. + Default: 10 + """ + from nuscenes.nuscenes import NuScenes + from nuscenes.can_bus.can_bus_api import NuScenesCanBus + print(version, root_path) + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path) + from nuscenes.utils import splits + available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] + assert version in available_vers + if version == 'v1.0-trainval': + train_scenes = splits.train + val_scenes = splits.val + elif version == 'v1.0-test': + train_scenes = splits.test + val_scenes = [] + elif version == 'v1.0-mini': + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError('unknown') + + # filter existing scenes. 
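    # For orientation, what that filtering amounts to (a sketch in comments; the
    # names below are only illustrative): `splits.train` / `splits.val` from the
    # nuscenes-devkit are plain lists of scene *names* such as 'scene-0001', and
    # the code that follows keeps only the names actually present on disk and
    # converts the survivors to scene *tokens*, roughly
    #     available = {s['name']: s['token'] for s in get_available_scenes(nusc)}
    #     train_tokens = {available[n] for n in splits.train if n in available}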
+ available_scenes = get_available_scenes(nusc) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print('test scene: {}'.format(len(train_scenes))) + else: + print('train scene: {}, val scene: {}'.format( + len(train_scenes), len(val_scenes))) + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print('test sample: {}'.format(len(train_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_test.pkl'.format(info_prefix)) + dump(data, info_path) + else: + print('train sample: {}, val sample: {}'.format( + len(train_nusc_infos), len(val_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_train.pkl'.format(info_prefix)) + dump(data, info_path) + data['infos'] = val_nusc_infos + info_val_path = osp.join(out_path, + '{}_infos_temporal_val.pkl'.format(info_prefix)) + dump(data, info_val_path) + + +def get_available_scenes(nusc): + """Get available scenes from the input nuscenes class. + + Given the raw data, get the information of available scenes for + further info generation. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + + Returns: + available_scenes (list[dict]): List of basic information for the + available scenes. + """ + available_scenes = [] + print('total scene num: {}'.format(len(nusc.scene))) + for scene in nusc.scene: + scene_token = scene['token'] + scene_rec = nusc.get('scene', scene_token) + sample_rec = nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) + lidar_path = str(lidar_path) + if os.getcwd() in lidar_path: + # path from lyftdataset is absolute path + lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + if not is_filepath(lidar_path): + scene_not_exist = True + break + else: + break + if scene_not_exist: + continue + available_scenes.append(scene) + print('exist scene num: {}'.format(len(available_scenes))) + return available_scenes + + +def _get_can_bus_info(nusc, nusc_can_bus, sample): + scene_name = nusc.get('scene', sample['scene_token'])['name'] + sample_timestamp = sample['timestamp'] + try: + pose_list = nusc_can_bus.get_messages(scene_name, 'pose') + except: + return np.zeros(18) # server scenes do not have can bus information. 
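    # Layout of the 18-dim CAN-bus vector assembled below (an editor's reading,
    # assuming the standard nuScenes 'pose' message whose remaining fields after
    # popping utime/pos/orientation are acceleration, rotation rate and velocity):
    #   [0:3]  position, [3:7] orientation quaternion, [7:16] those three
    #   3-vectors, [16:18] two placeholders that the dataset side appears to
    #   overwrite with the ego yaw later on; the all-zero fallback above keeps
    #   the same shape. Note that the final extend uses `pose[key]` (values from
    #   the last message iterated) rather than `last_pose[key]`, as in the
    #   upstream BEVFormer converter.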
+ can_bus = [] + # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp + last_pose = pose_list[0] + for i, pose in enumerate(pose_list): + if pose['utime'] > sample_timestamp: + break + last_pose = pose + _ = last_pose.pop('utime') # useless + pos = last_pose.pop('pos') + rotation = last_pose.pop('orientation') + can_bus.extend(pos) + can_bus.extend(rotation) + for key in last_pose.keys(): + can_bus.extend(pose[key]) # 16 elements + can_bus.extend([0., 0.]) + return np.array(can_bus) + +def _get_future_traj_info(nusc, sample, predict_steps=16): + sample_token = sample['token'] + ann_tokens = np.array(sample['anns']) + sd_rec = nusc.get('sample', sample_token) + fut_traj_all = [] + fut_traj_valid_mask_all = [] + _, boxes, _ = nusc.get_sample_data(sd_rec['data']['LIDAR_TOP'], selected_anntokens=ann_tokens) + predict_helper = PredictHelper(nusc) + for i, ann_token in enumerate(ann_tokens): + box = boxes[i] + instance_token = nusc.get('sample_annotation', ann_token)['instance_token'] + fut_traj_local = predict_helper.get_future_for_agent(instance_token, + sample_token, + seconds=predict_steps//2, + in_agent_frame=True) + + fut_traj = np.zeros((predict_steps, 2)) + fut_traj_valid_mask = np.zeros((predict_steps, 2)) + if fut_traj_local.shape[0] > 0: + # trans = box.center + # trans = np.array([0, 0, 0]) + # rot = Quaternion(matrix=box.rotation_matrix) + # fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot) + fut_traj_scence_centric = fut_traj_local + fut_traj[:fut_traj_scence_centric.shape[0], :] = fut_traj_scence_centric + fut_traj_valid_mask[:fut_traj_scence_centric.shape[0], :] = 1 + fut_traj_all.append(fut_traj) + fut_traj_valid_mask_all.append(fut_traj_valid_mask) + if len(ann_tokens) > 0: + fut_traj_all = np.stack(fut_traj_all, axis=0) + fut_traj_valid_mask_all = np.stack(fut_traj_valid_mask_all, axis=0) + else: + fut_traj_all = np.zeros((0, predict_steps, 2)) + fut_traj_valid_mask_all = np.zeros((0, predict_steps, 2)) + return fut_traj_all, fut_traj_valid_mask_all + +def _fill_trainval_infos(nusc, + nusc_can_bus, + train_scenes, + val_scenes, + test=False, + max_sweeps=10): + """Generate the train/val infos from the raw data. + + Args: + nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and validation set + that will be saved to the info file. 
+ """ + train_nusc_infos = [] + val_nusc_infos = [] + frame_idx = 0 + for sample in track_iter_progress(nusc.sample): + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) + + check_file_exist(lidar_path) + can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) + ## + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'prev': sample['prev'], + 'next': sample['next'], + 'can_bus': can_bus, + 'frame_idx': frame_idx, # temporal related info + 'sweeps': [], + 'cams': dict(), + 'scene_token': sample['scene_token'], # temporal related info + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + } + + if sample['next'] == '': + frame_idx = 0 + else: + frame_idx += 1 + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) + cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = nusc.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + nusc.get('sample_annotation', token) + for token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + velocity = np.array( + [nusc.box_velocity(token)[:2] for token in sample['anns']]) + valid_flag = np.array( + [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 + for anno in annotations], + dtype=bool).reshape(-1) + instance_inds = [nusc.getind('instance', ann['instance_token']) + for ann in annotations] + future_traj_all, future_traj_valid_mask_all = _get_future_traj_info(nusc, sample) + instance_tokens = [ann['instance_token'] for ann in annotations] # dtype('ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sweep['sensor2lidar_rotation'] = R.T # 
points @ R.T + T + sweep['sensor2lidar_translation'] = T + return sweep + + +def export_2d_annotation(root_path, info_path, version, mono3d=True): + """Export 2d annotation from the info file and raw data. + + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + version (str): Dataset version. + mono3d (bool): Whether to export mono3d annotation. Default: True. + """ + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + nusc_infos = load(info_path)['infos'] + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in track_iter_progress(nusc_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + nusc, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4'], + mono3d=mono3d) + (height, width, _) = imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'].split('data/nuscenes/') + [-1], + id=cam_info['sample_data_token'], + token=info['token'], + cam2ego_rotation=cam_info['sensor2ego_rotation'], + cam2ego_translation=cam_info['sensor2ego_translation'], + ego2global_rotation=info['ego2global_rotation'], + ego2global_translation=info['ego2global_translation'], + cam_intrinsic=cam_info['cam_intrinsic'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(nusc, + sample_data_token: str, + visibilities: List[str], + mono3d=True): + """Get the 2D annotation records for a given `sample_data_token`. + + Args: + sample_data_token (str): Sample data token belonging to a camera \ + keyframe. + visibilities (list[str]): Visibility filter. + mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + + # Get the sample data and the sample corresponding to that sample data. + sd_rec = nusc.get('sample_data', sample_data_token) + + assert sd_rec[ + 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ + ' for camera sample_data!' + if not sd_rec['is_key_frame']: + raise ValueError( + 'The 2D re-projections are available only for keyframes.') + + s_rec = nusc.get('sample', sd_rec['sample_token']) + + # Get the calibrated sensor and ego pose + # record to get the transformation matrices. + cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) + pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) + camera_intrinsic = np.array(cs_rec['camera_intrinsic']) + + # Get all the annotation with the specified visibilties. 
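    # For reference: nuScenes visibility tokens bucket how much of an object is
    # visible across the cameras ('1' = 0-40%, '2' = 40-60%, '3' = 60-80%,
    # '4' = 80-100%). The caller above passes ['', '1', '2', '3', '4'], so this
    # filter keeps every annotation; passing e.g. ['3', '4'] instead would keep
    # only largely visible boxes.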
+ ann_recs = [ + nusc.get('sample_annotation', token) for token in s_rec['anns'] + ] + ann_recs = [ + ann_rec for ann_rec in ann_recs + if (ann_rec['visibility_token'] in visibilities) + ] + + repro_recs = [] + + for ann_rec in ann_recs: + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = ann_rec['token'] + ann_rec['sample_data_token'] = sample_data_token + + # Get the box in global coordinates. + box = nusc.get_box(ann_rec['token']) + + # Move them to the ego-pose frame. + box.translate(-np.array(pose_rec['translation'])) + box.rotate(Quaternion(pose_rec['rotation']).inverse) + + # Move them to the calibrated sensor frame. + box.translate(-np.array(cs_rec['translation'])) + box.rotate(Quaternion(cs_rec['rotation']).inverse) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box.corners() + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + # Generate dictionary record to be included in the .json file. + repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, + sample_data_token, sd_rec['filename']) + + # If mono3d=True, add 3D annotations in camera coordinates + if mono3d and (repro_rec is not None): + loc = box.center.tolist() + + dim = box.wlh + dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw + dim = dim.tolist() + + rot = box.orientation.yaw_pitch_roll[0] + rot = [-rot] # convert the rot to our cam coordinate + + global_velo2d = nusc.box_velocity(box.token)[:2] + global_velo3d = np.array([*global_velo2d, 0.0]) + e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix + c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix + cam_velo3d = global_velo3d @ np.linalg.inv( + e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T + velo = cam_velo3d[0::2].tolist() + + repro_rec['bbox_cam3d'] = loc + dim + rot + repro_rec['velo_cam3d'] = velo + + center3d = np.array(loc).reshape([1, 3]) + center2d = points_cam2img( + center3d, camera_intrinsic, with_depth=True) + repro_rec['center2d'] = center2d.squeeze().tolist() + # normalized center2D + depth + # if samples with depth < 0 will be removed + if repro_rec['center2d'][2] <= 0: + continue + + ann_token = nusc.get('sample_annotation', + box.token)['attribute_tokens'] + if len(ann_token) == 0: + attr_name = 'None' + else: + attr_name = nusc.get('attribute', ann_token[0])['name'] + attr_id = nus_attributes.index(attr_name) + repro_rec['attribute_name'] = attr_name + repro_rec['attribute_id'] = attr_id + + repro_recs.append(repro_rec) + + return repro_recs + + +def post_process_coords( + corner_coords: List, imsize: Tuple[int, int] = (1600, 900) +) -> Union[Tuple[float, float, float, float], None]: + """Get the intersection of the convex hull of the reprojected bbox corners + and the image canvas, return None if no intersection. + + Args: + corner_coords (list[int]): Corner coordinates of reprojected + bounding box. + imsize (tuple[int]): Size of the image canvas. + + Return: + tuple [float]: Intersection of the convex hull of the 2D box + corners and the image canvas. 
+ """ + polygon_from_2d_box = MultiPoint(corner_coords).convex_hull + img_canvas = box(0, 0, imsize[0], imsize[1]) + + if polygon_from_2d_box.intersects(img_canvas): + img_intersection = polygon_from_2d_box.intersection(img_canvas) + intersection_coords = np.array( + [coord for coord in img_intersection.exterior.coords]) + + min_x = min(intersection_coords[:, 0]) + min_y = min(intersection_coords[:, 1]) + max_x = max(intersection_coords[:, 0]) + max_y = max(intersection_coords[:, 1]) + + return min_x, min_y, max_x, max_y + else: + return None + + +def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, + sample_data_token: str, filename: str) -> OrderedDict: + """Generate one 2D annotation record given various informations on top of + the 2D bounding box coordinates. + + Args: + ann_rec (dict): Original 3d annotation record. + x1 (float): Minimum value of the x coordinate. + y1 (float): Minimum value of the y coordinate. + x2 (float): Maximum value of the x coordinate. + y2 (float): Maximum value of the y coordinate. + sample_data_token (str): Sample data token. + filename (str):The corresponding image file where the annotation + is present. + + Returns: + dict: A sample 2D annotation record. + - file_name (str): flie name + - image_id (str): sample data token + - area (float): 2d box area + - category_name (str): category name + - category_id (int): category id + - bbox (list[float]): left x, top y, dx, dy of 2d box + - iscrowd (int): whether the area is crowd + """ + repro_rec = OrderedDict() + repro_rec['sample_data_token'] = sample_data_token + coco_rec = dict() + + relevant_keys = [ + 'attribute_tokens', + 'category_name', + 'instance_token', + 'next', + 'num_lidar_pts', + 'num_radar_pts', + 'prev', + 'sample_annotation_token', + 'sample_data_token', + 'visibility_token', + ] + + for key, value in ann_rec.items(): + if key in relevant_keys: + repro_rec[key] = value + + repro_rec['bbox_corners'] = [x1, y1, x2, y2] + repro_rec['filename'] = filename + + coco_rec['file_name'] = filename + coco_rec['image_id'] = sample_data_token + coco_rec['area'] = (y2 - y1) * (x2 - x1) + + if repro_rec['category_name'] not in NuScenesDataset.NameMapping: + return None + cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] + coco_rec['category_name'] = cat_name + coco_rec['category_id'] = nus_categories.index(cat_name) + coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] + coco_rec['iscrowd'] = 0 + + return coco_rec + \ No newline at end of file diff --git a/adzoo/uniad/test.py b/adzoo/uniad/test.py new file mode 100755 index 0000000..9442514 --- /dev/null +++ b/adzoo/uniad/test.py @@ -0,0 +1,145 @@ +import argparse +import torch +import os +import warnings +from torch.nn.parallel.distributed import DistributedDataParallel +from mmcv.utils import get_dist_info, init_dist, wrap_fp16_model, set_random_seed, Config, DictAction, load_checkpoint +from mmcv.fileio.io import dump +from mmcv.datasets import build_dataset, build_dataloader, replace_ImageToTensor +from mmcv.models import build_model, fuse_conv_bn +import time +import os.path as osp +from adzoo.uniad.test_utils import custom_multi_gpu_test, custom_single_gpu_test +import cv2 +cv2.setNumThreads(1) + +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', default='output/results.pkl', 
help='output result file in pickle format') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where results will be saved') + parser.add_argument( + '--gpu-collect', + action='store_true', + help='whether to use gpu to collect results.') + parser.add_argument( + '--tmpdir', + help='tmp directory used for collecting results from multiple ' + 'workers, available when gpu-collect is not specified') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + + cfg.model.pretrained = None + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor(cfg.data.test.pipeline) + + # init distributed env first, since logger depends on the dist info. 
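    # In this repo the script is normally launched via
    # adzoo/uniad/uniad_dist_eval.sh, i.e. roughly (config/checkpoint paths are
    # placeholders):
    #   python -m torch.distributed.launch --nproc_per_node=<GPUS> \
    #       adzoo/uniad/test.py <CFG> <CKPT> --launcher pytorch --eval bbox
    # With --launcher none execution falls into the non-distributed branch
    # further down, which is currently stubbed out with `assert False`, so the
    # pytorch launcher is effectively required for evaluation.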
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + torch.backends.cudnn.benchmark = True + init_dist(args.launcher, **cfg.dist_params) + rank, world_size = get_dist_info() + + set_random_seed(args.seed, deterministic=args.deterministic) + + # Dataloader + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader(dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + + # Model + cfg.model.train_cfg = None + model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + + # Add classese info + if 'CLASSES' in checkpoint.get('meta', {}): # for det + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + if 'PALETTE' in checkpoint.get('meta', {}): # for seg + model.PALETTE = checkpoint['meta']['PALETTE'] + elif hasattr(dataset, 'PALETTE'): + model.PALETTE = dataset.PALETTE + + if not distributed: + assert False #TODO(yzj) + # model = MMDataParallel(model, device_ids=[0]) + # outputs = custom_single_gpu_test(model, data_loader, args.show, args.show_dir) + else: + model = DistributedDataParallel(model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + ) + outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) + + + + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + dump(outputs, args.out) + kwargs = {} + kwargs['jsonfile_prefix'] = osp.join('test', args.config.split('/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) + + if args.eval: + eval_kwargs = cfg.get('evaluation', {}).copy() + # hard-code way to remove EvalHook args + for key in ['interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule']: + eval_kwargs.pop(key, None) + eval_kwargs.update(dict(metric=args.eval, **kwargs)) + print(dataset.evaluate(outputs, **eval_kwargs)) + + +if __name__ == '__main__': + main() diff --git a/adzoo/uniad/test_utils.py b/adzoo/uniad/test_utils.py new file mode 100644 index 0000000..4be8936 --- /dev/null +++ b/adzoo/uniad/test_utils.py @@ -0,0 +1,318 @@ +import os +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import torch +import torch.distributed as dist + +from mmcv.models.dense_heads.occ_head_plugin import IntersectionOverUnion, PanopticMetric +from mmcv.models.dense_heads.planning_head_plugin import UniADPlanningMetric +from mmcv.utils import ProgressBar, mkdir_or_exist, get_dist_info +from mmcv.fileio.io import load, dump +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. 
+ """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. + """ + model.eval() + + # Occ eval init + eval_occ = hasattr(model.module, 'with_occ_head') \ + and model.module.with_occ_head + if eval_occ: + # 30mx30m, 100mx100m at 50cm resolution + EVALUATION_RANGES = {'30x30': (70, 130), + '100x100': (0, 200)} + n_classes = 2 + iou_metrics = {} + for key in EVALUATION_RANGES.keys(): + iou_metrics[key] = IntersectionOverUnion(n_classes).cuda() + panoptic_metrics = {} + for key in EVALUATION_RANGES.keys(): + panoptic_metrics[key] = PanopticMetric(n_classes=n_classes, temporally_consistent=True).cuda() + + # Plan eval init + eval_planning = hasattr(model.module, 'with_planning_head') \ + and model.module.with_planning_head + if eval_planning: + planning_metrics = UniADPlanningMetric().cuda() + + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. 
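    # How EVALUATION_RANGES above maps onto the BEV occupancy grid: with
    # occflow_grid_conf spanning [-50 m, 50 m] at 0.5 m per cell the grid is
    # 200 x 200, so slice(0, 200) is the full 100 m x 100 m area and
    # slice(70, 130) is the central 60 cells, i.e. a 30 m x 30 m patch around
    # the ego vehicle ((200 - 30 / 0.5) / 2 = 70).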
+ have_mask = False + num_occ = 0 + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(data, return_loss=False, rescale=True) + + + #import pdb;pdb.set_trace() + + # # EVAL planning + if eval_planning: + # TODO: Wrap below into a func + segmentation = result[0]['planning']['planning_gt']['segmentation'] + sdc_planning = result[0]['planning']['planning_gt']['sdc_planning'] + sdc_planning_mask = result[0]['planning']['planning_gt']['sdc_planning_mask'] + pred_sdc_traj = result[0]['planning']['result_planning']['sdc_traj'] + result[0]['planning_traj'] = result[0]['planning']['result_planning']['sdc_traj'] + result[0]['planning_traj_gt'] = result[0]['planning']['planning_gt']['sdc_planning'] + result[0]['command'] = result[0]['planning']['planning_gt']['command'] + planning_metrics(pred_sdc_traj[:, :6, :2], sdc_planning[0][0,:, :6, :2], sdc_planning_mask[0][0,:, :6, :2], segmentation[0][:, [1,2,3,4,5,6]]) + + # # Eval Occ + if eval_occ: + occ_has_invalid_frame = data['gt_occ_has_invalid_frame'][0] + occ_to_eval = not occ_has_invalid_frame.item() + if occ_to_eval and 'occ' in result[0].keys(): + num_occ += 1 + for key, grid in EVALUATION_RANGES.items(): + limits = slice(grid[0], grid[1]) + iou_metrics[key](result[0]['occ']['seg_out'][..., limits, limits].contiguous(), + result[0]['occ']['seg_gt'][..., limits, limits].contiguous()) + panoptic_metrics[key](result[0]['occ']['ins_seg_out'][..., limits, limits].contiguous().detach(), + result[0]['occ']['ins_seg_gt'][..., limits, limits].contiguous()) + + # Pop out unnecessary occ results, avoid appending it to cpu when collect_results_cpu + if os.environ.get('ENABLE_PLOT_MODE', None) is None: + result[0].pop('occ', None) + result[0].pop('planning', None) + else: + for k in ['seg_gt', 'ins_seg_gt', 'pred_ins_sigmoid', 'seg_out', 'ins_seg_out']: + if k in result[0]['occ']: + result[0]['occ'][k] = result[0]['occ'][k].detach().cpu() + for k in ['bbox', 'segm', 'labels', 'panoptic', 'drivable', 'score_list', 'lane', 'lane_score', 'stuff_score_list']: + if k in result[0]['pts_bbox'] and isinstance(result[0]['pts_bbox'][k], torch.Tensor): + result[0]['pts_bbox'][k] = result[0]['pts_bbox'][k].detach().cpu() + + # # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + + if rank == 0: + for _ in range(batch_size * world_size): + prog_bar.update() + + # break + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + if eval_planning: + planning_results = planning_metrics.compute() + planning_metrics.reset() + + ret_results = dict() + ret_results['bbox_results'] = bbox_results + if eval_occ: + occ_results = {} + for key, grid in EVALUATION_RANGES.items(): + panoptic_scores = 
panoptic_metrics[key].compute()
+            for panoptic_key, value in panoptic_scores.items():
+                occ_results[f'{panoptic_key}'] = occ_results.get(f'{panoptic_key}', []) + [100 * value[1].item()]
+            panoptic_metrics[key].reset()
+
+            iou_scores = iou_metrics[key].compute()
+            occ_results['iou'] = occ_results.get('iou', []) + [100 * iou_scores[1].item()]
+            iou_metrics[key].reset()
+
+        occ_results['num_occ'] = num_occ  # count on one gpu
+        occ_results['ratio_occ'] = num_occ / len(dataset)  # count on one gpu, but reflect the relative ratio
+        ret_results['occ_results_computed'] = occ_results
+    if eval_planning:
+        ret_results['planning_results_computed'] = planning_results
+
+    if mask_results is not None:
+        ret_results['mask_results'] = mask_results
+    return ret_results
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+    rank, world_size = get_dist_info()
+    # create a tmp dir if it is not specified
+    if tmpdir is None:
+        MAX_LEN = 512
+        # 32 is whitespace
+        dir_tensor = torch.full((MAX_LEN, ),
+                                32,
+                                dtype=torch.uint8,
+                                device='cuda')
+        if rank == 0:
+            mkdir_or_exist('.dist_test')
+            tmpdir = tempfile.mkdtemp(dir='.dist_test')
+            tmpdir = torch.tensor(
+                bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+            dir_tensor[:len(tmpdir)] = tmpdir
+        dist.broadcast(dir_tensor, 0)
+        tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+    else:
+        mkdir_or_exist(tmpdir)
+    # dump the part result to the dir
+    dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+    dist.barrier()
+    # collect all parts
+    if rank != 0:
+        return None
+    else:
+        # load results of all parts from tmp dir
+        part_list = []
+        for i in range(world_size):
+            part_file = osp.join(tmpdir, f'part_{i}.pkl')
+            part_list.append(load(part_file))
+        # sort the results
+        ordered_results = []
+        # because the evaluation sampler is changed so that each gpu handles a
+        # contiguous block of samples, the per-rank parts are concatenated in
+        # rank order instead of being interleaved.
+        #for res in zip(*part_list):
+        for res in part_list:
+            ordered_results.extend(list(res))
+        # the dataloader may pad some samples
+        ordered_results = ordered_results[:size]
+        # remove tmp dir
+        shutil.rmtree(tmpdir)
+        return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+    return collect_results_cpu(result_part, size)
+
+def custom_single_gpu_test(model,
+                           data_loader,
+                           show=False,
+                           out_dir=None,
+                           show_score_thr=0.3):
+    """Test model with single gpu.
+
+    This method tests model with single gpu and gives the 'show' option.
+    By setting ``show=True``, it saves the visualization results under
+    ``out_dir``.
+
+    Args:
+        model (nn.Module): Model to be tested.
+        data_loader (nn.Dataloader): Pytorch data loader.
+        show (bool): Whether to save visualization results.
+            Default: False.
+        out_dir (str): The path to save visualization results.
+            Default: None.
+
+    Returns:
+        list[dict]: The prediction results.
+ """ + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + + if show: + # Visualize the results of MMDetection3D model + # 'show_results' is MMdetection3D visualization API + models_3d = (Base3DDetector, Base3DSegmentor, + SingleStageMono3DDetector) + if isinstance(model.module, models_3d): + model.module.show_results(data, result, out_dir=out_dir) + # Visualize the results of MMDetection model + # 'show_result' is MMdetection visualization API + else: + batch_size = len(result) + if batch_size == 1 and isinstance(data['img'][0], + torch.Tensor): + img_tensor = data['img'][0] + else: + img_tensor = data['img'][0].data[0] + img_metas = data['img_metas'][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) + assert len(imgs) == len(img_metas) + + for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): + h, w, _ = img_meta['img_shape'] + img_show = img[:h, :w, :] + + ori_h, ori_w = img_meta['ori_shape'][:-1] + img_show = imresize(img_show, (ori_w, ori_h)) + + if out_dir: + out_file = osp.join(out_dir, img_meta['ori_filename']) + else: + out_file = None + + model.module.show_result( + img_show, + result[i], + show=show, + out_file=out_file, + score_thr=show_score_thr) + results.extend(result) + + batch_size = len(result) + for _ in range(batch_size): + prog_bar.update() + return results \ No newline at end of file diff --git a/adzoo/uniad/train.py b/adzoo/uniad/train.py new file mode 100755 index 0000000..1df7f99 --- /dev/null +++ b/adzoo/uniad/train.py @@ -0,0 +1,212 @@ +import argparse +import torch +import copy +import os +import time +import warnings +from os import path as osp +from mmcv import __version__ as mmcv_version +from mmcv.datasets import build_dataset +from mmcv.models import build_model +from mmcv.utils import collect_env, get_root_logger, mkdir_or_exist, set_random_seed, get_dist_info, init_dist, \ + Config, DictAction, TORCH_VERSION, digit_version +from mmcv.datasets.builder import build_dataloader +from mmcv.optims import build_optimizer +from torch.nn.parallel import DataParallel, DistributedDataParallel +from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook +from mmcv.core import EvalHook +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, build_runner) +from adzoo.uniad.test_utils import custom_multi_gpu_test + +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + 
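    # What --seed / --deterministic feed into: main() later calls
    # set_random_seed(args.seed, deterministic=args.deterministic), which in
    # mmcv/mmdet-style implementations (the merged mmcv here presumably mirrors
    # them) boils down to roughly
    #   random.seed(seed); np.random.seed(seed)
    #   torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
    #   if deterministic: cudnn.deterministic, cudnn.benchmark = True, False
    # i.e. --deterministic trades CUDNN autotuning speed for reproducibility.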
parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local-rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + + if args.work_dir is not None: + cfg.work_dir = args.work_dir + else: + cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) + + # if args.resume_from is not None: + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + elif args.launcher == 'pytorch': + torch.backends.cudnn.benchmark = True + distributed = True + init_dist(args.launcher, **cfg.dist_params) + rank, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # Create work_dir + mkdir_or_exist(osp.abspath(cfg.work_dir)) + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + + # meta info + meta = dict() + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + # seed + cfg.seed = args.seed + set_random_seed(args.seed, deterministic=args.deterministic) + + # logger + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + logger = get_root_logger(log_file=log_file, log_level=cfg.log_level, name=cfg.model.type) + logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + logger.info(f'Set random seed to {args.seed}, 'f'deterministic: {args.deterministic}') + + # Dataset + datasets = [build_dataset(cfg.data.train)] + + # Save meta info + if cfg.checkpoint_config is not None: + cfg.checkpoint_config.meta = dict(mmcv_version=mmcv_version, config=cfg.pretty_text, CLASSES=datasets[0].CLASSES, \ + PALETTE=datasets[0].PALETTE if hasattr(datasets[0], 'PALETTE') else None) # # for segmentors + + # Dataloader + datasets = datasets if isinstance(datasets, (list, tuple)) else [datasets] + data_loaders = [build_dataloader(ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in datasets + ] + + # Model + model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) + model.init_weights() + model.CLASSES = datasets[0].CLASSES # add an attribute for visualization convenience + logger.info(f'Model:\n{model}') + if 
distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + model = DistributedDataParallel(model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters + ) + else: + model = DataParallel(model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # Optimizer + optimizer = build_optimizer(model, cfg.optimizer) + optimizer_config = OptimizerHook(**cfg.optimizer_config) + + # Runner + runner = build_runner(cfg.runner, default_args=dict(model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + runner.timestamp = timestamp + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # Register eval hooks for interval eval + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, test_fn=custom_multi_gpu_test, **eval_cfg)) + + if cfg.resume_from and os.path.exists(cfg.resume_from): + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + +if __name__ == '__main__': + main() diff --git a/adzoo/uniad/uniad_dist_eval.sh b/adzoo/uniad/uniad_dist_eval.sh new file mode 100755 index 0000000..12b2720 --- /dev/null +++ b/adzoo/uniad/uniad_dist_eval.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +T=`date +%m%d%H%M` + +# -------------------------------------------------- # +# Usually you only need to customize these variables # +CFG=$1 # +CKPT=$2 # +GPUS=$3 # +# -------------------------------------------------- # +GPUS_PER_NODE=$(($GPUS<8?$GPUS:8)) + +MASTER_PORT=${MASTER_PORT:-12145} +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ +# Intermediate files and logs will be saved to UniAD/projects/work_dirs/ + +if [ ! 
-d ${WORK_DIR}logs ]; then + mkdir -p ${WORK_DIR}logs +fi + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nproc_per_node=$GPUS_PER_NODE \ + --master_port=$MASTER_PORT \ + $(dirname "$0")/test.py \ + $CFG \ + $CKPT \ + --launcher pytorch ${@:4} \ + --eval bbox \ + --show-dir ${WORK_DIR} \ + 2>&1 | tee ${WORK_DIR}logs/eval.$T \ No newline at end of file diff --git a/adzoo/uniad/uniad_dist_train.sh b/adzoo/uniad/uniad_dist_train.sh new file mode 100755 index 0000000..313e20a --- /dev/null +++ b/adzoo/uniad/uniad_dist_train.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +T=`date +%m%d%H%M` + +# -------------------------------------------------- # +# Usually you only need to customize these variables # +CFG=$1 # +GPUS=$2 # +# -------------------------------------------------- # +GPUS_PER_NODE=$(($GPUS<8?$GPUS:8)) +NNODES=`expr $GPUS / $GPUS_PER_NODE` + +MASTER_PORT=${MASTER_PORT:-54621} +MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} +RANK=${RANK:-0} + +WORK_DIR=$(echo ${CFG%.*} | sed -e "s/configs/work_dirs/g")/ +# Intermediate files and logs will be saved to UniAD/projects/work_dirs/ + +if [ ! -d ${WORK_DIR}logs ]; then + mkdir -p ${WORK_DIR}logs +fi + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch \ + --nproc_per_node=${GPUS_PER_NODE} \ + --master_addr=${MASTER_ADDR} \ + --master_port=${MASTER_PORT} \ + --nnodes=${NNODES} \ + --node_rank=${RANK} \ + $(dirname "$0")/train.py \ + $CFG \ + --launcher pytorch ${@:3} \ + --deterministic \ + --work-dir ${WORK_DIR} \ + 2>&1 | tee ${WORK_DIR}logs/train.$T \ No newline at end of file diff --git a/adzoo/uniad/uniad_vis_result.sh b/adzoo/uniad/uniad_vis_result.sh new file mode 100755 index 0000000..b43a1be --- /dev/null +++ b/adzoo/uniad/uniad_vis_result.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +python ./tools/analysis_tools/visualize/run.py \ + --predroot PATH_TO_YOUR_PREDISION_RESULT_PKL \ + --out_folder PATH_TO_YOUR_OUTPUT_FOLDER \ + --demo_video FILENAME_OF_OUTPUT_VIDEO \ + --project_to_cam True \ No newline at end of file diff --git a/adzoo/vad/analysis_tools/__init__.py b/adzoo/vad/analysis_tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adzoo/vad/analysis_tools/analyze_logs.py b/adzoo/vad/analysis_tools/analyze_logs.py new file mode 100644 index 0000000..806175f --- /dev/null +++ b/adzoo/vad/analysis_tools/analyze_logs.py @@ -0,0 +1,201 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
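+# Example usage (paths are placeholders; subcommands and flags are defined in parse_args below):
+#   python adzoo/vad/analysis_tools/analyze_logs.py plot_curve path/to/xxx.log.json --keys loss --out loss_curve.png
+#   python adzoo/vad/analysis_tools/analyze_logs.py cal_train_time path/to/xxx.log.json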
+import argparse +import json +import numpy as np +import seaborn as sns +from collections import defaultdict +from matplotlib import pyplot as plt + + +def cal_train_time(log_dicts, args): + for i, log_dict in enumerate(log_dicts): + print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') + all_times = [] + for epoch in log_dict.keys(): + if args.include_outliers: + all_times.append(log_dict[epoch]['time']) + else: + all_times.append(log_dict[epoch]['time'][1:]) + all_times = np.array(all_times) + epoch_ave_time = all_times.mean(-1) + slowest_epoch = epoch_ave_time.argmax() + fastest_epoch = epoch_ave_time.argmin() + std_over_epoch = epoch_ave_time.std() + print(f'slowest epoch {slowest_epoch + 1}, ' + f'average time is {epoch_ave_time[slowest_epoch]:.4f}') + print(f'fastest epoch {fastest_epoch + 1}, ' + f'average time is {epoch_ave_time[fastest_epoch]:.4f}') + print(f'time std over epochs is {std_over_epoch:.4f}') + print(f'average iter time: {np.mean(all_times):.4f} s/iter') + print() + + +def plot_curve(log_dicts, args): + if args.backend is not None: + plt.switch_backend(args.backend) + sns.set_style(args.style) + # if legend is None, use {filename}_{key} as legend + legend = args.legend + if legend is None: + legend = [] + for json_log in args.json_logs: + for metric in args.keys: + legend.append(f'{json_log}_{metric}') + assert len(legend) == (len(args.json_logs) * len(args.keys)) + metrics = args.keys + + num_metrics = len(metrics) + for i, log_dict in enumerate(log_dicts): + epochs = list(log_dict.keys()) + for j, metric in enumerate(metrics): + print(f'plot curve of {args.json_logs[i]}, metric is {metric}') + if metric not in log_dict[epochs[args.interval - 1]]: + raise KeyError( + f'{args.json_logs[i]} does not contain metric {metric}') + + if args.mode == 'eval': + if min(epochs) == args.interval: + x0 = args.interval + else: + # if current training is resumed from previous checkpoint + # we lost information in early epochs + # `xs` should start according to `min(epochs)` + if min(epochs) % args.interval == 0: + x0 = min(epochs) + else: + # find the first epoch that do eval + x0 = min(epochs) + args.interval - \ + min(epochs) % args.interval + xs = np.arange(x0, max(epochs) + 1, args.interval) + ys = [] + for epoch in epochs[args.interval - 1::args.interval]: + ys += log_dict[epoch][metric] + + # if training is aborted before eval of the last epoch + # `xs` and `ys` will have different length and cause an error + # check if `ys[-1]` is empty here + if not log_dict[epoch][metric]: + xs = xs[:-1] + + ax = plt.gca() + ax.set_xticks(xs) + plt.xlabel('epoch') + plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') + else: + xs = [] + ys = [] + num_iters_per_epoch = \ + log_dict[epochs[args.interval-1]]['iter'][-1] + for epoch in epochs[args.interval - 1::args.interval]: + iters = log_dict[epoch]['iter'] + if log_dict[epoch]['mode'][-1] == 'val': + iters = iters[:-1] + xs.append( + np.array(iters) + (epoch - 1) * num_iters_per_epoch) + ys.append(np.array(log_dict[epoch][metric][:len(iters)])) + xs = np.concatenate(xs) + ys = np.concatenate(ys) + plt.xlabel('iter') + plt.plot( + xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) + plt.legend() + if args.title is not None: + plt.title(args.title) + if args.out is None: + plt.show() + else: + print(f'save curve to: {args.out}') + plt.savefig(args.out) + plt.cla() + + +def add_plot_parser(subparsers): + parser_plt = subparsers.add_parser( + 'plot_curve', help='parser for plotting curves') + 
parser_plt.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_plt.add_argument( + '--keys', + type=str, + nargs='+', + default=['mAP_0.25'], + help='the metric that you want to plot') + parser_plt.add_argument('--title', type=str, help='title of figure') + parser_plt.add_argument( + '--legend', + type=str, + nargs='+', + default=None, + help='legend of each plot') + parser_plt.add_argument( + '--backend', type=str, default=None, help='backend of plt') + parser_plt.add_argument( + '--style', type=str, default='dark', help='style of plt') + parser_plt.add_argument('--out', type=str, default=None) + parser_plt.add_argument('--mode', type=str, default='train') + parser_plt.add_argument('--interval', type=int, default=1) + + +def add_time_parser(subparsers): + parser_time = subparsers.add_parser( + 'cal_train_time', + help='parser for computing the average time per training iteration') + parser_time.add_argument( + 'json_logs', + type=str, + nargs='+', + help='path of train log in json format') + parser_time.add_argument( + '--include-outliers', + action='store_true', + help='include the first value of every epoch when computing ' + 'the average time') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Analyze Json Log') + # currently only support plot curve and calculate average train time + subparsers = parser.add_subparsers(dest='task', help='task parser') + add_plot_parser(subparsers) + add_time_parser(subparsers) + args = parser.parse_args() + return args + + +def load_json_logs(json_logs): + # load and convert json_logs to log_dict, key is epoch, value is a sub dict + # keys of sub dict is different metrics, e.g. memory, bbox_mAP + # value of sub dict is a list of corresponding values of all iterations + log_dicts = [dict() for _ in json_logs] + for json_log, log_dict in zip(json_logs, log_dicts): + with open(json_log, 'r') as log_file: + for line in log_file: + log = json.loads(line.strip()) + # skip lines without `epoch` field + if 'epoch' not in log: + continue + epoch = log.pop('epoch') + if epoch not in log_dict: + log_dict[epoch] = defaultdict(list) + for k, v in log.items(): + log_dict[epoch][k].append(v) + return log_dicts + + +def main(): + args = parse_args() + + json_logs = args.json_logs + for json_log in json_logs: + assert json_log.endswith('.json') + + log_dicts = load_json_logs(json_logs) + + eval(args.task)(log_dicts, args) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/analysis_tools/benchmark.py b/adzoo/vad/analysis_tools/benchmark.py new file mode 100644 index 0000000..487a348 --- /dev/null +++ b/adzoo/vad/analysis_tools/benchmark.py @@ -0,0 +1,98 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
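+# Example usage (paths are placeholders; --samples and --log-interval are defined in parse_args below):
+#   python adzoo/vad/analysis_tools/benchmark.py path/to/config.py --checkpoint path/to/ckpt.pth --samples 500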
+import argparse
+import time
+import torch
+from mmcv import Config
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import load_checkpoint, wrap_fp16_model
+import sys
+sys.path.append('.')
+from projects.mmdet3d_plugin.datasets.builder import build_dataloader
+from projects.mmdet3d_plugin.datasets import custom_build_dataset
+# from mmdet3d.datasets import build_dataloader, build_dataset
+from mmdet3d.models import build_detector
+#from tools.misc.fuse_conv_bn import fuse_module
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='MMDet benchmark a model')
+    parser.add_argument('config', help='test config file path')
+    parser.add_argument('--checkpoint', default=None, help='checkpoint file')
+    parser.add_argument(
+        '--samples', type=int, default=2000, help='samples to benchmark')
+    parser.add_argument(
+        '--log-interval', type=int, default=50, help='interval of logging')
+    parser.add_argument(
+        '--fuse-conv-bn',
+        action='store_true',
+        help='Whether to fuse conv and bn, this will slightly increase '
+        'the inference speed')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    # set cudnn_benchmark
+    if cfg.get('cudnn_benchmark', False):
+        torch.backends.cudnn.benchmark = True
+    cfg.model.pretrained = None
+    cfg.data.test.test_mode = True
+
+    # build the dataloader
+    # TODO: support multiple images per gpu (only minor changes are needed)
+    print(cfg.data.test)
+    dataset = custom_build_dataset(cfg.data.test)
+    data_loader = build_dataloader(
+        dataset,
+        samples_per_gpu=1,
+        workers_per_gpu=cfg.data.workers_per_gpu,
+        dist=False,
+        shuffle=False)
+
+    # build the model and load checkpoint
+    cfg.model.train_cfg = None
+    model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg'))
+    fp16_cfg = cfg.get('fp16', None)
+    if fp16_cfg is not None:
+        wrap_fp16_model(model)
+    if args.checkpoint is not None:
+        load_checkpoint(model, args.checkpoint, map_location='cpu')
+    #if args.fuse_conv_bn:
+    #    model = fuse_module(model)
+
+    model = MMDataParallel(model, device_ids=[0])
+
+    model.eval()
+
+    # the first several iterations may be very slow so skip them
+    num_warmup = 5
+    pure_inf_time = 0
+
+    # benchmark with several samples and take the average
+    for i, data in enumerate(data_loader):
+        torch.cuda.synchronize()
+        start_time = time.perf_counter()
+        with torch.no_grad():
+            model(return_loss=False, rescale=True, **data)
+
+        torch.cuda.synchronize()
+        elapsed = time.perf_counter() - start_time
+
+        if i >= num_warmup:
+            pure_inf_time += elapsed
+            if (i + 1) % args.log_interval == 0:
+                fps = (i + 1 - num_warmup) / pure_inf_time
+                print(f'Done image [{i + 1:<3}/ {args.samples}], '
+                      f'fps: {fps:.1f} img / s')
+
+        if (i + 1) == args.samples:
+            pure_inf_time += elapsed
+            fps = (i + 1 - num_warmup) / pure_inf_time
+            print(f'Overall fps: {fps:.1f} img / s')
+            break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/adzoo/vad/analysis_tools/get_flops.py b/adzoo/vad/analysis_tools/get_flops.py
new file mode 100644
index 0000000..1b9fb01
--- /dev/null
+++ b/adzoo/vad/analysis_tools/get_flops.py
@@ -0,0 +1,747 @@
+# Copyright (c) OpenMMLab. All rights reserved.
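+# Example usage (config path and shape values are placeholders; see parse_args below for --shape/--modality/--cfg-options):
+#   python adzoo/vad/analysis_tools/get_flops.py path/to/config.py --modality image --shape 1280 720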
+import os +import argparse + +import torch +from mmcv import Config, DictAction + +from mmdet3d.models import build_model +from mmdet3d.datasets import build_dataset +from projects.mmdet3d_plugin.datasets.builder import build_dataloader + +# try: +# from mmcv.cnn import get_model_complexity_info +# except ImportError: +# raise ImportError('Please upgrade mmcv to >0.6.2') + +import sys +sys.path.append('.') + + +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +import mmcv + + +def get_model_complexity_info(model, + data, + input_shape=(1280, 720), + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + flush=False, + ost=sys.stdout): + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, + ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. + """ + + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, 6, 3, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. 
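+            # The dummy input mirrors the surround-view camera layout used here:
+            # (batch=1, 6 cameras, 3 channels, *input_shape).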
+ batch = torch.ones(()).new_empty((1, 6, 3, *input_shape)) + + # img_metas = [data['img_metas'][0].data[0]] + # img = data['img'][0].data[0] + # points = data['points'][0].data[0][0] + # fut_valid_flag = data['fut_valid_flag'][0].data[0] + # img = img.to(batch.device) + # points = [points.to(batch.device)] + # ego_his_trajs = data['ego_his_trajs'][0].data[0].to(batch.device) + # ego_lcf_feat = data['ego_lcf_feat'][0].data[0].to(batch.device).unsqueeze(0) + + # _ = flops_model(rescale=True, img=img, img_metas=img_metas, points=points, + # fut_valid_flag=fut_valid_flag, ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + + img_metas = [data['img_metas'][0].data[0]] + img = data['img'][0].data[0] + img = img.to(batch.device) + + _ = flops_model(rescale=True, img=img, img_metas=img_metas) + + flops_count, params_count = flops_model.compute_average_flops_cost() + if print_per_layer_stat: + print_model_with_flops( + flops_model, flops_count, params_count, ost=ost, flush=flush) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops, units='GFLOPs', precision=2): + """Convert FLOPs number into a string. + + Note that Here we take a multiply-add counts as one FLOP. + + Args: + flops (float): FLOPs number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', + 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically + choose the most suitable unit for FLOPs. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted FLOPs number with units. + + Examples: + >>> flops_to_string(1e9) + '1.0 GFLOPs' + >>> flops_to_string(2e5, 'MFLOPs') + '0.2 MFLOPs' + >>> flops_to_string(3e-9, None) + '3e-09 FLOPs' + """ + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GFLOPs' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MFLOPs' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KFLOPs' + else: + return str(flops) + ' FLOPs' + else: + if units == 'GFLOPs': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPs': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPs': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPs' + + +def params_to_string(num_params, units=None, precision=2): + """Convert parameter number into a string. + + Args: + num_params (float): Parameter number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'M', + 'K' and ''. If set to None, it will automatically choose the most + suitable unit for Parameter number. Default: None. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted parameter number with units. 
+ + Examples: + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if units is None: + if num_params // 10**6 > 0: + return str(round(num_params / 10**6, precision)) + ' M' + elif num_params // 10**3: + return str(round(num_params / 10**3, precision)) + ' k' + else: + return str(num_params) + else: + if units == 'M': + return str(round(num_params / 10.**6, precision)) + ' ' + units + elif units == 'K': + return str(round(num_params / 10.**3, precision)) + ' ' + units + else: + return str(num_params) + + +def print_model_with_flops(model, + total_flops, + total_params, + units='GFLOPs', + precision=3, + ost=sys.stdout, + flush=False): + """Print a model with FLOPs for each layer. + + Args: + model (nn.Module): The model to be printed. + total_flops (float): Total FLOPs of the model. + total_params (float): Total parameter counts of the model. + units (str | None): Converted FLOPs units. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 3. + ost (stream): same as `file` param in :func:`print`. + Default: sys.stdout. + flush (bool): same as that in :func:`print`. Default: False. + + Example: + >>> class ExampleModel(nn.Module): + + >>> def __init__(self): + >>> super().__init__() + >>> self.conv1 = nn.Conv2d(3, 8, 3) + >>> self.conv2 = nn.Conv2d(8, 256, 3) + >>> self.conv3 = nn.Conv2d(256, 8, 3) + >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Linear(8, 1) + + >>> def forward(self, x): + >>> x = self.conv1(x) + >>> x = self.conv2(x) + >>> x = self.conv3(x) + >>> x = self.avg_pool(x) + >>> x = self.flatten(x) + >>> x = self.fc(x) + >>> return x + + >>> model = ExampleModel() + >>> x = (3, 16, 16) + to print the complexity information state for each layer, you can use + >>> get_model_complexity_info(model, x) + or directly use + >>> print_model_with_flops(model, 4579784.0, 37361) + ExampleModel( + 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, + (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 + (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) + (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) + (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) + (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) + (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) + ) + """ + + def accumulate_params(self): + if is_supported_instance(self): + return self.__params__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_params() + return sum + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_num_params = self.accumulate_params() + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + params_to_string( + accumulated_num_params, units='M', precision=precision), + '{:.3%} Params'.format(accumulated_num_params / total_params), + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), + self.original_extra_repr() + ]) + + def 
add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + m.accumulate_params = accumulate_params.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost, flush=flush) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model): + """Calculate parameter number of a model. + + Args: + model (nn.module): The model for parameter number calculation. + + Returns: + float: Parameter number of the model. + """ + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return num_params + + +def add_flops_counting_methods(net_main_module): + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 + net_main_module) + + net_main_module.reset_flops_count() + + return net_main_module + + +def compute_average_flops_cost(self): + """Compute average FLOPs cost. + + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self): + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self): + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+    """
+    add_batch_counter_variables_or_reset(self)
+    self.apply(add_flops_counter_variable_or_reset)
+
+
+# ---- Internal functions
+def empty_flops_counter_hook(module, input, output):
+    module.__flops__ += 0
+
+
+def upsample_flops_counter_hook(module, input, output):
+    output_size = output[0]
+    batch_size = output_size.shape[0]
+    output_elements_count = batch_size
+    for val in output_size.shape[1:]:
+        output_elements_count *= val
+    module.__flops__ += int(output_elements_count)
+
+
+def relu_flops_counter_hook(module, input, output):
+    active_elements_count = output.numel()
+    module.__flops__ += int(active_elements_count)
+
+
+def linear_flops_counter_hook(module, input, output):
+    input = input[0]
+    output_last_dim = output.shape[
+        -1]  # pytorch checks dimensions, so here we don't care much
+    module.__flops__ += int(np.prod(input.shape) * output_last_dim)
+
+
+def pool_flops_counter_hook(module, input, output):
+    input = input[0]
+    module.__flops__ += int(np.prod(input.shape))
+
+
+def norm_flops_counter_hook(module, input, output):
+    input = input[0]
+
+    batch_flops = np.prod(input.shape)
+    if (getattr(module, 'affine', False)
+            or getattr(module, 'elementwise_affine', False)):
+        batch_flops *= 2
+    module.__flops__ += int(batch_flops)
+
+
+def deconv_flops_counter_hook(conv_module, input, output):
+    # Can have multiple inputs, getting the first one
+    input = input[0]
+
+    batch_size = input.shape[0]
+    input_height, input_width = input.shape[2:]
+
+    kernel_height, kernel_width = conv_module.kernel_size
+    in_channels = conv_module.in_channels
+    out_channels = conv_module.out_channels
+    groups = conv_module.groups
+
+    filters_per_channel = out_channels // groups
+    conv_per_position_flops = (
+        kernel_height * kernel_width * in_channels * filters_per_channel)
+
+    active_elements_count = batch_size * input_height * input_width
+    overall_conv_flops = conv_per_position_flops * active_elements_count
+    bias_flops = 0
+    if conv_module.bias is not None:
+        output_height, output_width = output.shape[2:]
+        bias_flops = out_channels * batch_size * output_height * output_width
+    overall_flops = overall_conv_flops + bias_flops
+
+    conv_module.__flops__ += int(overall_flops)
+
+
+def conv_flops_counter_hook(conv_module, input, output):
+    # Can have multiple inputs, getting the first one
+    input = input[0]
+
+    batch_size = input.shape[0]
+    output_dims = list(output.shape[2:])
+
+    kernel_dims = list(conv_module.kernel_size)
+    in_channels = conv_module.in_channels
+    out_channels = conv_module.out_channels
+    groups = conv_module.groups
+
+    filters_per_channel = out_channels // groups
+    conv_per_position_flops = int(
+        np.prod(kernel_dims)) * in_channels * filters_per_channel
+
+    active_elements_count = batch_size * int(np.prod(output_dims))
+
+    overall_conv_flops = conv_per_position_flops * active_elements_count
+
+    bias_flops = 0
+
+    if conv_module.bias is not None:
+
+        bias_flops = out_channels * active_elements_count
+
+    overall_flops = overall_conv_flops + bias_flops
+
+    conv_module.__flops__ += int(overall_flops)
+
+
+def batch_counter_hook(module, input, output):
+    batch_size = 1
+    if len(input) > 0:
+        # Can have multiple inputs, getting the first one
+        input = input[0]
+        batch_size = len(input)
+    else:
+        pass
+        print('Warning! 
No positional inputs found for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + print('Warning: variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module): + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping(): + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + mmcv.cnn.bricks.Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, + } + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[40000, 4], + help='input point cloud size') + parser.add_argument( + '--modality', + type=str, + default='point', + choices=['point', 'image', 'multi'], + help='input data modality') + parser.add_argument( + '--cfg-options', + 
nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + + args = parse_args() + + if args.modality == 'point': + assert len(args.shape) == 2, 'invalid input shape' + input_shape = tuple(args.shape) + elif args.modality == 'image': + if len(args.shape) == 1: + input_shape = (3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (3, ) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + elif args.modality == 'multi': + raise NotImplementedError( + 'FLOPs counter is currently not supported for models with ' + 'multi-modality input') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + else: + # import dir is the dirpath for the config file + _module_dir = os.path.dirname(args.config) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + samples_per_gpu = 1 + from mmdet.datasets import replace_ImageToTensor + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + dataset = build_dataset(cfg.data.test) + dataset.is_vis_on_test = True #TODO, this is a hack + data_loader = build_dataloader( + dataset, + samples_per_gpu=1, + workers_per_gpu=0, + dist=False, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + for i, data in enumerate(data_loader): + # if ~(data['map_gt_labels_3d'].data[0][0] != -1).any(): + # continue + img = data['img'][0].data[0] + img_metas = data['img_metas'][0].data[0] + break + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + if torch.cuda.is_available(): + model.cuda() + model.eval() + + if hasattr(model, 'forward_dummy'): + model.forward = model.forward_dummy + else: + raise NotImplementedError( + 'FLOPs counter is currently not supported for {}'.format( + model.__class__.__name__)) + + flops, params = get_model_complexity_info(model, data) + split_line = '=' * 30 + print(f'{split_line}\nInput shape: {input_shape}\n' + f'Flops: {flops}\nParams: {params}\n{split_line}') + print('!!!Please be cautious if you use the results 
in papers. ' + 'You may need to check if all ops are supported and verify that the ' + 'flops computation is correct.') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/adzoo/vad/analysis_tools/get_params.py b/adzoo/vad/analysis_tools/get_params.py new file mode 100644 index 0000000..6bf4ecf --- /dev/null +++ b/adzoo/vad/analysis_tools/get_params.py @@ -0,0 +1,8 @@ +import torch +YOUR_CKPT_PATH = None +file_path = YOUR_CKPT_PATH +model = torch.load(file_path, map_location='cpu') +all = 0 +for key in list(model['state_dict'].keys()): + all += model['state_dict'][key].nelement() +print(all) diff --git a/adzoo/vad/analysis_tools/visualization.py b/adzoo/vad/analysis_tools/visualization.py new file mode 100644 index 0000000..7fb9776 --- /dev/null +++ b/adzoo/vad/analysis_tools/visualization.py @@ -0,0 +1,911 @@ +import sys +sys.path.append('') +import os +import argparse +import os.path as osp +from PIL import Image +from tqdm import tqdm +from typing import List, Dict + +import cv2 +import mmcv +import torch +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from mmdet.datasets.pipelines import to_tensor +from matplotlib.collections import LineCollection +from nuscenes.utils.data_classes import LidarPointCloud, Box +from nuscenes.eval.common.data_classes import EvalBoxes, EvalBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility + +from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox, CustomDetectionBox, color_map +from projects.mmdet3d_plugin.datasets.nuscenes_vad_dataset import VectorizedLocalMap, LiDARInstanceLines + + +cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + +def render_annotation( + anntoken: str, + margin: float = 10, + view: np.ndarray = np.eye(4), + box_vis_level: BoxVisibility = BoxVisibility.ANY, + out_path: str = 'render.png', + extra_info: bool = False) -> None: + """ + Render selected annotation. + :param anntoken: Sample_annotation token. + :param margin: How many meters in each direction to include in LIDAR view. + :param view: LIDAR view point. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param out_path: Optional path to save the rendered figure to disk. + :param extra_info: Whether to render extra information below camera view. + """ + ann_record = nusc.get('sample_annotation', anntoken) + sample_record = nusc.get('sample', ann_record['sample_token']) + assert 'LIDAR_TOP' in sample_record['data'].keys(), 'Error: No LIDAR_TOP in data, unable to render.' + + # Figure out which camera the object is fully visible in (this may return nothing). + boxes, cam = [], [] + cams = [key for key in sample_record['data'].keys() if 'CAM' in key] + all_bboxes = [] + select_cams = [] + for cam in cams: + _, boxes, _ = nusc.get_sample_data(sample_record['data'][cam], box_vis_level=box_vis_level, + selected_anntokens=[anntoken]) + if len(boxes) > 0: + all_bboxes.append(boxes) + select_cams.append(cam) + # We found an image that matches. Let's abort. + # assert len(boxes) > 0, 'Error: Could not find image where annotation is visible. ' \ + # 'Try using e.g. BoxVisibility.ANY.' + # assert len(boxes) < 2, 'Error: Found multiple annotations. Something is wrong!' 
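+    # Render the LiDAR top-down view in the first subplot and one subplot per
+    # camera in which the annotation is visible (collected in select_cams above).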
+ + num_cam = len(all_bboxes) + + fig, axes = plt.subplots(1, num_cam + 1, figsize=(18, 9)) + select_cams = [sample_record['data'][cam] for cam in select_cams] + print('bbox in cams:', select_cams) + # Plot LIDAR view. + lidar = sample_record['data']['LIDAR_TOP'] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(lidar, selected_anntokens=[anntoken]) + LidarPointCloud.from_file(data_path).render_height(axes[0], view=view) + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[0], view=view, colors=(c, c, c)) + corners = view_points(boxes[0].corners(), view, False)[:2, :] + axes[0].set_xlim([np.min(corners[0, :]) - margin, np.max(corners[0, :]) + margin]) + axes[0].set_ylim([np.min(corners[1, :]) - margin, np.max(corners[1, :]) + margin]) + axes[0].axis('off') + axes[0].set_aspect('equal') + + # Plot CAMERA view. + for i in range(1, num_cam + 1): + cam = select_cams[i - 1] + data_path, boxes, camera_intrinsic = nusc.get_sample_data(cam, selected_anntokens=[anntoken]) + im = Image.open(data_path) + axes[i].imshow(im) + axes[i].set_title(nusc.get('sample_data', cam)['channel']) + axes[i].axis('off') + axes[i].set_aspect('equal') + for box in boxes: + c = np.array(get_color(box.name)) / 255.0 + box.render(axes[i], view=camera_intrinsic, normalize=True, colors=(c, c, c)) + + # Print extra information about the annotation below the camera view. + axes[i].set_xlim(0, im.size[0]) + axes[i].set_ylim(im.size[1], 0) + + if extra_info: + rcParams['font.family'] = 'monospace' + + w, l, h = ann_record['size'] + category = ann_record['category_name'] + lidar_points = ann_record['num_lidar_pts'] + radar_points = ann_record['num_radar_pts'] + + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + dist = np.linalg.norm(np.array(pose_record['translation']) - np.array(ann_record['translation'])) + + information = ' \n'.join(['category: {}'.format(category), + '', + '# lidar points: {0:>4}'.format(lidar_points), + '# radar points: {0:>4}'.format(radar_points), + '', + 'distance: {:>7.3f}m'.format(dist), + '', + 'width: {:>7.3f}m'.format(w), + 'length: {:>7.3f}m'.format(l), + 'height: {:>7.3f}m'.format(h)]) + + plt.annotate(information, (0, 0), (0, -20), xycoords='axes fraction', textcoords='offset points', va='top') + + if out_path is not None: + plt.savefig(out_path) + + +def get_sample_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. 
+ :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + if selected_anntokens is not None: + boxes = list(map(nusc.get_box, selected_anntokens)) + else: + boxes = nusc.get_boxes(sample_data_token) + + # Make list of Box objects including coord system transforms. + box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def get_predicted_data(sample_data_token: str, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + selected_anntokens=None, + use_flat_vehicle_coordinates: bool = False, + pred_anns=None + ): + """ + Returns the data path as well as all annotations related to that sample_data. + Note that the boxes are transformed into the current sensor's coordinate frame. + :param sample_data_token: Sample_data token. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param selected_anntokens: If provided only return the selected annotation. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. + :return: (data_path, boxes, camera_intrinsic ) + """ + + # Retrieve sensor & pose records + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + data_path = nusc.get_sample_data_path(sample_data_token) + + if sensor_record['modality'] == 'camera': + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + else: + cam_intrinsic = None + imsize = None + + # Retrieve all sample annotations and map to sensor coordinate system. + # if selected_anntokens is not None: + # boxes = list(map(nusc.get_box, selected_anntokens)) + # else: + # boxes = nusc.get_boxes(sample_data_token) + boxes = pred_anns + # Make list of Box objects including coord system transforms. 
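+    # Unlike get_sample_data above, the boxes here come from the model
+    # predictions (pred_anns) rather than from the dataset annotations.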
+ box_list = [] + for box in boxes: + if use_flat_vehicle_coordinates: + # Move box to ego vehicle coord system parallel to world z plane. + yaw = Quaternion(pose_record['rotation']).yaw_pitch_roll[0] + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(scalar=np.cos(yaw / 2), vector=[0, 0, np.sin(yaw / 2)]).inverse) + else: + # Move box to ego vehicle coord system. + box.translate(-np.array(pose_record['translation'])) + box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + box.translate(-np.array(cs_record['translation'])) + box.rotate(Quaternion(cs_record['rotation']).inverse) + + if sensor_record['modality'] == 'camera' and not \ + box_in_image(box, cam_intrinsic, imsize, vis_level=box_vis_level): + continue + box_list.append(box) + + return data_path, box_list, cam_intrinsic + + +def lidiar_render(sample_token, data, out_path=None, out_name=None, traj_use_perstep_offset=True): + bbox_gt_list = [] + bbox_pred_list = [] + sample_rec = nusc.get('sample', sample_token) + anns = sample_rec['anns'] + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + + for ann in anns: + content = nusc.get('sample_annotation', ann) + gt_fut_trajs, gt_fut_masks = get_gt_fut_trajs( + nusc=nusc, anno=content, cs_record=cs_record, + pose_record=pose_record, fut_ts=6 + ) + try: + bbox_gt_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=nusc.box_velocity(content['token'])[:2], + fut_trajs=tuple(gt_fut_trajs), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=category_to_detection_name(content['category_name']), + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name='')) + except: + pass + + bbox_anns = data['results'][sample_token] + for content in bbox_anns: + bbox_pred_list.append(CustomDetectionBox( + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_traj']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'])) + gt_annotations = EvalBoxes() + pred_annotations = EvalBoxes() + gt_annotations.add_boxes(sample_token, bbox_gt_list) + pred_annotations.add_boxes(sample_token, bbox_pred_list) + # print('green is ground truth') + # print('blue is the predited result') + visualize_sample(nusc, sample_token, gt_annotations, pred_annotations, + savepath=out_path, traj_use_perstep_offset=traj_use_perstep_offset, pred_data=data) + + +def get_color(category_name: str): + """ + Provides the default colors based on the category names. + This method works for the general nuScenes categories, as well as the nuScenes detection categories. 
+    """
+    a = ['noise', 'animal', 'human.pedestrian.adult', 'human.pedestrian.child', 'human.pedestrian.construction_worker',
+         'human.pedestrian.personal_mobility', 'human.pedestrian.police_officer', 'human.pedestrian.stroller',
+         'human.pedestrian.wheelchair', 'movable_object.barrier', 'movable_object.debris',
+         'movable_object.pushable_pullable', 'movable_object.trafficcone', 'static_object.bicycle_rack', 'vehicle.bicycle',
+         'vehicle.bus.bendy', 'vehicle.bus.rigid', 'vehicle.car', 'vehicle.construction', 'vehicle.emergency.ambulance',
+         'vehicle.emergency.police', 'vehicle.motorcycle', 'vehicle.trailer', 'vehicle.truck', 'flat.driveable_surface',
+         'flat.other', 'flat.sidewalk', 'flat.terrain', 'static.manmade', 'static.other', 'static.vegetation',
+         'vehicle.ego']
+    class_names = [
+        'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+        'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+    ]
+    #print(category_name)
+    if category_name == 'bicycle':
+        return nusc.colormap['vehicle.bicycle']
+    elif category_name == 'construction_vehicle':
+        return nusc.colormap['vehicle.construction']
+    elif category_name == 'traffic_cone':
+        return nusc.colormap['movable_object.trafficcone']
+
+    for key in nusc.colormap.keys():
+        if category_name in key:
+            return nusc.colormap[key]
+    return [0, 0, 0]
+
+# TODO: whether to rotate traj
+def boxes_to_sensor(boxes: List[EvalBox], pose_record: Dict, cs_record: Dict):
+    """
+    Map boxes from global coordinates to the vehicle's sensor coordinate system.
+    :param boxes: The boxes in global coordinates.
+    :param pose_record: The pose record of the vehicle at the current timestamp.
+    :param cs_record: The calibrated sensor record of the sensor.
+    :return: The transformed boxes.
+    """
+    boxes_out = []
+    for box in boxes:
+        # Create Box instance.
+        box = CustomNuscenesBox(
+            box.translation, box.size, Quaternion(box.rotation), box.fut_trajs, name=box.detection_name
+        )
+        # Move box to ego vehicle coord system.
+        box.translate(-np.array(pose_record['translation']))
+        box.rotate(Quaternion(pose_record['rotation']).inverse)
+        # Move box to sensor coord system.
+        box.translate(-np.array(cs_record['translation']))
+        box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+        boxes_out.append(box)
+
+    return boxes_out
+
+
+def get_gt_fut_trajs(nusc: NuScenes,
+                     anno,
+                     cs_record,
+                     pose_record,
+                     fut_ts):
+    """
+    Get the ground-truth future trajectory of an annotation as per-step offsets in the LiDAR frame, plus a validity mask.
+    :param nusc: NuScenes object.
+    """
+    box = Box(anno['translation'], anno['size'], Quaternion(anno['rotation']))
+    # Move box to ego vehicle coord system.
+    box.translate(-np.array(pose_record['translation']))
+    box.rotate(Quaternion(pose_record['rotation']).inverse)
+    # Move box to sensor coord system.
+    box.translate(-np.array(cs_record['translation']))
+    box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+    # get future trajectory coords for each box
+    gt_fut_trajs = np.zeros((fut_ts, 2))  # [fut_ts*2]
+    gt_fut_masks = np.zeros((fut_ts))  # [fut_ts]
+    gt_fut_trajs[:] = box.center[:2]
+    cur_box = box
+    cur_anno = anno
+    for i in range(fut_ts):
+        if cur_anno['next'] != '':
+            anno_next = nusc.get('sample_annotation', cur_anno['next'])
+            box_next = Box(
+                anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation'])
+            )
+            # Move box to ego vehicle coord system.
+            box_next.translate(-np.array(pose_record['translation']))
+            box_next.rotate(Quaternion(pose_record['rotation']).inverse)
+            # Move box to sensor coord system.
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + # gt_fut_trajs[i] = box_next.center[:2] + gt_fut_trajs[i] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i] = 1 + cur_anno = anno_next + cur_box = box_next + else: + # gt_fut_trajs[i:] = gt_fut_trajs[i-1] + gt_fut_trajs[i:] = 0 + break + + return gt_fut_trajs.reshape(-1).tolist(), gt_fut_masks.reshape(-1).tolist() + +def get_gt_vec_maps( + sample_token, + data_root='data/nuscenes/', + pc_range=[-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20 +) -> None: + """ + Get gt vec map for a given sample. + """ + sample_rec = nusc.get('sample', sample_token) + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + lidar2ego_translation = cs_record['translation'], + lidar2ego_rotation = cs_record['rotation'], + ego2global_translation = pose_record['translation'], + ego2global_rotation = pose_record['rotation'], + map_location = nusc.get('log', nusc.get('scene', sample_rec['scene_token'])['log_token'])['location'] + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(cs_record['rotation']).rotation_matrix + lidar2ego[:3, 3] = cs_record['translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(pose_record['rotation']).rotation_matrix + ego2global[:3, 3] = pose_record['translation'] + lidar2global = ego2global @ lidar2ego + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + patch_size = (patch_h, patch_w) + + vector_map = VectorizedLocalMap(data_root, patch_size=patch_size, + map_classes=map_classes, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=padding_value) + + + anns_results = vector_map.gen_vectorized_samples( + map_location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + gt_vecs_pts_loc = gt_vecs_pts_loc + + return gt_vecs_pts_loc, gt_vecs_label + + +def visualize_sample(nusc: NuScenes, + sample_token: str, + gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + nsweeps: int = 1, + conf_th: float = 0.4, + pc_range: list = [-30.0, -30.0, -4.0, 30.0, 30.0, 4.0], + verbose: bool = True, + savepath: str = None, + traj_use_perstep_offset: bool = True, + data_root='data/nuscenes/', + map_pc_range: list = [-15.0, -30.0, -4.0, 15.0, 30.0, 4.0], + padding_value=-10000, + map_classes=['divider', 'ped_crossing', 'boundary'], + map_fixed_ptsnum_per_line=20, + gt_format=['fixed_num_pts'], + colors_plt = ['cornflowerblue', 'royalblue', 'slategrey'], + pred_data = None) -> None: + """ + Visualizes a sample from BEV with annotations and detection results. + :param nusc: NuScenes object. 
+ :param sample_token: The nuScenes sample token. + :param gt_boxes: Ground truth boxes grouped by sample. + :param pred_boxes: Prediction grouped by sample. + :param nsweeps: Number of sweeps used for lidar visualization. + :param conf_th: The confidence threshold used to filter negatives. + :param eval_range: Range in meters beyond which boxes are ignored. + :param verbose: Whether to print to stdout. + :param savepath: If given, saves the the rendering here instead of displaying. + """ + # Retrieve sensor & pose records. + sample_rec = nusc.get('sample', sample_token) + sd_record = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + # Get boxes. + boxes_gt_global = gt_boxes[sample_token] + boxes_est_global = pred_boxes[sample_token] + # Map GT boxes to lidar. + boxes_gt = boxes_to_sensor(boxes_gt_global, pose_record, cs_record) + # Map EST boxes to lidar. + boxes_est = boxes_to_sensor(boxes_est_global, pose_record, cs_record) + # Add scores to EST boxes. + for box_est, box_est_global in zip(boxes_est, boxes_est_global): + box_est.score = box_est_global.detection_score + + # Init axes. + fig, axes = plt.subplots(1, 1, figsize=(4, 4)) + plt.xlim(xmin=-30, xmax=30) + plt.ylim(ymin=-30, ymax=30) + + # Show Pred Map + result_dic = pred_data['map_results'][sample_token]['vectors'] + + for vector in result_dic: + if vector['confidence_level'] < 0.6: + continue + pred_pts_3d = vector['pts'] + pred_label_3d = vector['type'] + pts_x = np.array([pt[0] for pt in pred_pts_3d]) + pts_y = np.array([pt[1] for pt in pred_pts_3d]) + + axes.plot(pts_x, pts_y, color=colors_plt[pred_label_3d],linewidth=1,alpha=0.8,zorder=-1) + axes.scatter(pts_x, pts_y, color=colors_plt[pred_label_3d],s=1,alpha=0.8,zorder=-1) + + # ignore_list = ['barrier', 'motorcycle', 'bicycle', 'traffic_cone'] + ignore_list = ['barrier', 'bicycle', 'traffic_cone'] + + # Show Pred boxes. + for i, box in enumerate(boxes_est): + if box.name in ignore_list: + continue + # Show only predictions with a high score. + assert not np.isnan(box.score), 'Error: Box score cannot be NaN!' + if box.score < conf_th or abs(box.center[0]) > 15 or abs(box.center[1]) > 30: + continue + box.render(axes, view=np.eye(4), colors=('tomato', 'tomato', 'tomato'), linewidth=1, box_idx=None) + # if box.name in ['pedestrian']: + # continue + if traj_use_perstep_offset: + mode_idx = [0, 1, 2, 3, 4, 5] + box.render_fut_trajs_grad_color(axes, linewidth=1, mode_idx=mode_idx, fut_ts=6, cmap='autumn') + else: + box.render_fut_trajs_coords(axes, color='tomato', linewidth=1) + + # Show Planning. 
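+    # Draw the ego-vehicle footprint (a ~1.8 m x 4 m rectangle plus a short heading
+    # line), then pick the highest-scoring driving command, accumulate its per-step
+    # planning offsets into BEV coordinates and render them as a color-graded polyline.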
+ axes.plot([-0.9, -0.9], [-2, 2], color='mediumseagreen', linewidth=1, alpha=0.8) + axes.plot([-0.9, 0.9], [2, 2], color='mediumseagreen', linewidth=1, alpha=0.8) + axes.plot([0.9, 0.9], [2, -2], color='mediumseagreen', linewidth=1, alpha=0.8) + axes.plot([0.9, -0.9], [-2, -2], color='mediumseagreen', linewidth=1, alpha=0.8) + axes.plot([0.0, 0.0], [0.0, 2], color='mediumseagreen', linewidth=1, alpha=0.8) + plan_cmd = np.argmax(pred_data['plan_results'][sample_token][1][0,0,0]) + plan_traj = pred_data['plan_results'][sample_token][0][plan_cmd] + plan_traj[abs(plan_traj) < 0.01] = 0.0 + plan_traj = plan_traj.cumsum(axis=0) + plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0) + plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1) + + plan_vecs = None + for i in range(plan_traj.shape[0]): + plan_vec_i = plan_traj[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs is None: + plan_vecs = xy + else: + plan_vecs = np.concatenate((plan_vecs, xy), axis=0) + + cmap = 'winter' + y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(plan_vecs, colors=colors, linewidths=1, linestyles='solid', cmap=cmap) + axes.add_collection(line_segments) + + axes.axes.xaxis.set_ticks([]) + axes.axes.yaxis.set_ticks([]) + axes.axis('off') + fig.set_tight_layout(True) + fig.canvas.draw() + plt.savefig(savepath+'/bev_pred.png', bbox_inches='tight', dpi=200) + plt.close() + + +def obtain_sensor2top(nusc, + sensor_token, + l2e_t, + l2e_r_mat, + e2g_t, + e2g_r_mat, + sensor_type='lidar'): + """Obtain the info with RT matric from general sensor to Top LiDAR. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + sensor_token (str): Sample data token corresponding to the + specific sensor type. + l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). + l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego + in shape (3, 3). + e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). + e2g_r_mat (np.ndarray): Rotation matrix from ego to global + in shape (3, 3). + sensor_type (str): Sensor to calibrate. Default: 'lidar'. + + Returns: + sweep (dict): Sweep information after transformation. 
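+
+    Note:
+        The function returns the pair (sensor2lidar_rotation, sensor2lidar_translation)
+        that maps points from the given sensor frame to the Top LiDAR frame via
+        sensor -> ego -> global -> ego' -> lidar; the sweep dict is only used
+        internally to collect the calibration records.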
+ """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sensor2lidar_rotation = R.T # points @ R.T + T + sensor2lidar_translation = T + + return sensor2lidar_rotation, sensor2lidar_translation + +def render_sample_data( + sample_toekn: str, + with_anns: bool = True, + box_vis_level: BoxVisibility = BoxVisibility.ANY, + axes_limit: float = 40, + ax=None, + nsweeps: int = 1, + out_path: str = None, + out_name: str = None, + underlay_map: bool = True, + use_flat_vehicle_coordinates: bool = True, + show_lidarseg: bool = False, + show_lidarseg_legend: bool = False, + filter_lidarseg_labels=None, + lidarseg_preds_bin_path: str = None, + verbose: bool = True, + show_panoptic: bool = False, + pred_data=None, + traj_use_perstep_offset: bool = True + ) -> None: + """ + Render sample data onto axis. + :param sample_data_token: Sample_data token. + :param with_anns: Whether to draw box annotations. + :param box_vis_level: If sample_data is an image, this sets required visibility for boxes. + :param axes_limit: Axes limit for lidar and radar (measured in meters). + :param ax: Axes onto which to render. + :param nsweeps: Number of sweeps for lidar and radar. + :param out_path: Optional path to save the rendered figure to disk. + :param underlay_map: When set to true, lidar data is plotted onto the map. This can be slow. + :param use_flat_vehicle_coordinates: Instead of the current sensor's coordinate frame, use ego frame which is + aligned to z-plane in the world. Note: Previously this method did not use flat vehicle coordinates, which + can lead to small errors when the vertical axis of the global frame and lidar are not aligned. The new + setting is more correct and rotates the plot by ~90 degrees. + :param show_lidarseg: When set to True, the lidar data is colored with the segmentation labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + :param show_lidarseg_legend: Whether to display the legend for the lidarseg labels in the frame. + :param filter_lidarseg_labels: Only show lidar points which belong to the given list of classes. 
If None + or the list is empty, all classes will be displayed. + :param lidarseg_preds_bin_path: A path to the .bin file which contains the user's lidar segmentation + predictions for the sample. + :param verbose: Whether to display the image after it is rendered. + :param show_panoptic: When set to True, the lidar data is colored with the panoptic labels. When set + to False, the colors of the lidar data represent the distance from the center of the ego vehicle. + If show_lidarseg is True, show_panoptic will be set to False. + """ + lidiar_render(sample_toekn, pred_data, out_path=out_path, + out_name=out_name, traj_use_perstep_offset=traj_use_perstep_offset) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Visualize VAD predictions') + parser.add_argument('--result-path', help='inference result file path') + parser.add_argument('--save-path', help='the dir to save visualization results') + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + args = parse_args() + inference_result_path = args.result_path + out_path = args.save_path + bevformer_results = mmcv.load(inference_result_path) + sample_token_list = list(bevformer_results['results'].keys()) + + nusc = NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) + + imgs = [] + fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v') + video_path = osp.join(out_path, 'vis.mp4') + video = cv2.VideoWriter(video_path, fourcc, 10, (2933, 800), True) + for id in tqdm(range(len(sample_token_list))): + mmcv.mkdir_or_exist(out_path) + render_sample_data(sample_token_list[id], + pred_data=bevformer_results, + out_path=out_path) + pred_path = osp.join(out_path, 'bev_pred.png') + pred_img = cv2.imread(pred_path) + os.remove(pred_path) + + sample_token = sample_token_list[id] + sample = nusc.get('sample', sample_token) + # sample = data['results'][sample_token_list[0]][0] + cams = [ + 'CAM_FRONT_LEFT', + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_LEFT', + 'CAM_BACK', + 'CAM_BACK_RIGHT', + ] + + cam_imgs = [] + for cam in cams: + sample_data_token = sample['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + sensor_modality = sd_record['sensor_modality'] + if sensor_modality in ['lidar', 'radar']: + assert False + elif sensor_modality == 'camera': + boxes = [Box(record['translation'], record['size'], Quaternion(record['rotation']), + name=record['detection_name'], token='predicted') for record in + bevformer_results['results'][sample_token]] + data_path, boxes_pred, camera_intrinsic = get_predicted_data(sample_data_token, + box_vis_level=BoxVisibility.ANY, + pred_anns=boxes) + _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=BoxVisibility.ANY) + + data = Image.open(data_path) + + # Show image. 
+ _, ax = plt.subplots(1, 1, figsize=(6, 12)) + ax.imshow(data) + + if cam == 'CAM_FRONT': + lidar_sd_record = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + lidar_cs_record = nusc.get('calibrated_sensor', lidar_sd_record['calibrated_sensor_token']) + lidar_pose_record = nusc.get('ego_pose', lidar_sd_record['ego_pose_token']) + + # get plan traj [x,y,z,w] quaternion, w=1 + # we set z=-1 to get points near the ground in lidar coord system + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + plan_traj = bevformer_results['plan_results'][sample_token][0][plan_cmd] + plan_traj[abs(plan_traj) < 0.01] = 0.0 + plan_traj = plan_traj.cumsum(axis=0) + + plan_traj = np.concatenate(( + plan_traj[:, [0]], + plan_traj[:, [1]], + -1.0*np.ones((plan_traj.shape[0], 1)), + np.ones((plan_traj.shape[0], 1)), + ), axis=1) + # add the start point in lcf + plan_traj = np.concatenate((np.zeros((1, plan_traj.shape[1])), plan_traj), axis=0) + # plan_traj[0, :2] = 2*plan_traj[1, :2] - plan_traj[2, :2] + plan_traj[0, 0] = 0.3 + plan_traj[0, 2] = -1.0 + plan_traj[0, 3] = 1.0 + + l2e_r = lidar_cs_record['rotation'] + l2e_t = lidar_cs_record['translation'] + e2g_r = lidar_pose_record['rotation'] + e2g_t = lidar_pose_record['translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + s2l_r, s2l_t = obtain_sensor2top(nusc, sample_data_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(s2l_r) + lidar2cam_t = s2l_t @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + viewpad = np.eye(4) + viewpad[:camera_intrinsic.shape[0], :camera_intrinsic.shape[1]] = camera_intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + plan_traj = lidar2img_rt @ plan_traj.T + plan_traj = plan_traj[0:2, ...] / np.maximum( + plan_traj[2:3, ...], np.ones_like(plan_traj[2:3, ...]) * 1e-5) + plan_traj = plan_traj.T + plan_traj = np.stack((plan_traj[:-1], plan_traj[1:]), axis=1) + + plan_vecs = None + for i in range(plan_traj.shape[0]): + plan_vec_i = plan_traj[i] + x_linspace = np.linspace(plan_vec_i[0, 0], plan_vec_i[1, 0], 51) + y_linspace = np.linspace(plan_vec_i[0, 1], plan_vec_i[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if plan_vecs is None: + plan_vecs = xy + else: + plan_vecs = np.concatenate((plan_vecs, xy), axis=0) + + cmap = 'winter' + y = np.sin(np.linspace(1/2*np.pi, 3/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(plan_vecs, colors=colors, linewidths=2, linestyles='solid', cmap=cmap) + ax.add_collection(line_segments) + + ax.set_xlim(0, data.size[0]) + ax.set_ylim(data.size[1], 0) + ax.axis('off') + if out_path is not None: + savepath = osp.join(out_path, f'{cam}_PRED') + plt.savefig(savepath, bbox_inches='tight', dpi=200, pad_inches=0.0) + plt.close() + + # Load boxes and image. 
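+                # Read the per-camera figure saved above back with OpenCV and stamp the
+                # camera name on it before the six views are tiled into the video frame.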
+ data_path = osp.join(out_path, f'{cam}_PRED.png') + cam_img = cv2.imread(data_path) + lw = 6 + tf = max(lw - 3, 1) + w, h = cv2.getTextSize(cam, 0, fontScale=lw / 6, thickness=tf)[0] # text width, height + # color=(0, 0, 0) + txt_color=(255, 255, 255) + cv2.putText(cam_img, + cam, (10, h + 10), + 0, + lw / 6, + txt_color, + thickness=tf, + lineType=cv2.LINE_AA) + cam_imgs.append(cam_img) + else: + raise ValueError("Error: Unknown sensor modality!") + + plan_cmd = np.argmax(bevformer_results['plan_results'][sample_token][1][0,0,0]) + cmd_list = ['Turn Right', 'Turn Left', 'Go Straight'] + plan_cmd_str = cmd_list[plan_cmd] + pred_img = cv2.copyMakeBorder(pred_img, 10, 10, 10, 10, cv2.BORDER_CONSTANT, None, value = 0) + # font + font = cv2.FONT_HERSHEY_SIMPLEX + # fontScale + fontScale = 1 + # Line thickness of 2 px + thickness = 3 + # org + org = (20, 40) + # Blue color in BGR + color = (0, 0, 0) + # Using cv2.putText() method + pred_img = cv2.putText(pred_img, 'BEV', org, font, + fontScale, color, thickness, cv2.LINE_AA) + pred_img = cv2.putText(pred_img, plan_cmd_str, (20, 770), font, + fontScale, color, thickness, cv2.LINE_AA) + + sample_img = pred_img + cam_img_top = cv2.hconcat([cam_imgs[0], cam_imgs[1], cam_imgs[2]]) + cam_img_down = cv2.hconcat([cam_imgs[3], cam_imgs[4], cam_imgs[5]]) + cam_img = cv2.vconcat([cam_img_top, cam_img_down]) + size = (2133, 800) + cam_img = cv2.resize(cam_img, size) + vis_img = cv2.hconcat([cam_img, sample_img]) + + video.write(vis_img) + + video.release() + cv2.destroyAllWindows() diff --git a/adzoo/vad/apis/__init__.py b/adzoo/vad/apis/__init__.py new file mode 100644 index 0000000..15dff22 --- /dev/null +++ b/adzoo/vad/apis/__init__.py @@ -0,0 +1,3 @@ +from .train import custom_train_model +from .mmdet_train import custom_train_detector +# from .test import custom_multi_gpu_test \ No newline at end of file diff --git a/adzoo/vad/apis/mmdet_train.py b/adzoo/vad/apis/mmdet_train.py new file mode 100644 index 0000000..687b989 --- /dev/null +++ b/adzoo/vad/apis/mmdet_train.py @@ -0,0 +1,196 @@ +import random +import warnings + +import numpy as np +import torch +import torch.distributed as dist +from torch.nn import DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel +from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, + Fp16OptimizerHook, OptimizerHook, + build_runner) +from mmcv.utils import build_from_cfg +from mmcv.optims import build_optimizer +from mmcv.core import EvalHook + +from mmcv.datasets import (build_dataset, + replace_ImageToTensor) +from mmcv.utils import get_root_logger +import time +import os.path as osp +from mmcv.datasets.builder import build_dataloader +from mmcv.core.evaluation.eval_hooks import CustomDistEvalHook +from mmcv.datasets.builder import custom_build_dataset +def custom_train_detector(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + #assert len(dataset)==1s + if 'imgs_per_gpu' in cfg.data: + logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' + 'Please use "samples_per_gpu" instead') + if 'samples_per_gpu' in cfg.data: + logger.warning( + f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' + f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' + f'={cfg.data.imgs_per_gpu} is used in this experiments') + else: + logger.warning( + 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' + f'{cfg.data.imgs_per_gpu} in this experiments') + cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu + + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = DistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + if eval_model is not None: + eval_model = DistributedDataParallel( + eval_model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = DataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + if eval_model is not None: + eval_model = DataParallel( + eval_model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if 'runner' not in cfg: + cfg.runner = { + 'type': 'EpochBasedRunner', + 'max_epochs': cfg.total_epochs + } + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + else: + if 'total_epochs' in cfg: + assert cfg.total_epochs == cfg.runner.max_epochs + if eval_model is not None: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + eval_model=eval_model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + else: + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # an ugly workaround to make .log and .log.json filenames the same + runner.timestamp = timestamp + + # fp16 setting + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + optimizer_config = Fp16OptimizerHook( + **cfg.optimizer_config, **fp16_cfg, distributed=distributed) + elif distributed and 'type' not in cfg.optimizer_config: + optimizer_config = OptimizerHook(**cfg.optimizer_config) + else: + optimizer_config = cfg.optimizer_config + + # register hooks + runner.register_training_hooks(cfg.lr_config, optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # register profiler hook + #trace_config = dict(type='tb_trace', dir_name='work_dir') + #profiler_config = dict(on_trace_ready=trace_config) + #runner.register_profiler_hook(profiler_config) + + if distributed: + if isinstance(runner, EpochBasedRunner): + runner.register_hook(DistSamplerSeedHook()) + + # register eval hooks + if validate: + # Support batch_size > 1 in validation + val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) + if val_samples_per_gpu > 1: + assert False + # Replace 
'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.val.pipeline = replace_ImageToTensor( + cfg.data.val.pipeline) + val_dataset = custom_build_dataset(cfg.data.val, dict(test_mode=True)) + + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=val_samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + shuffler_sampler=cfg.data.shuffler_sampler, # dict(type='DistributedGroupSampler'), + nonshuffler_sampler=cfg.data.nonshuffler_sampler, # dict(type='DistributedSampler'), + ) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) + eval_hook = CustomDistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg)) + + # user-defined hooks + if cfg.get('custom_hooks', None): + custom_hooks = cfg.custom_hooks + assert isinstance(custom_hooks, list), \ + f'custom_hooks expect list type, but got {type(custom_hooks)}' + for hook_cfg in cfg.custom_hooks: + assert isinstance(hook_cfg, dict), \ + 'Each item in custom_hooks expects dict type, but got ' \ + f'{type(hook_cfg)}' + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + runner.register_hook(hook, priority=priority) + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) + diff --git a/adzoo/vad/apis/test.py b/adzoo/vad/apis/test.py new file mode 100644 index 0000000..3d31abb --- /dev/null +++ b/adzoo/vad/apis/test.py @@ -0,0 +1,215 @@ +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import torch +import torch.distributed as dist +from mmcv.image import tensor2imgs +from mmcv.utils import get_dist_info + +from mmcv.core import encode_mask_results +from mmcv.fileio.io import dump, load +from mmcv.utils import mkdir_or_exist, ProgressBar + +import numpy as np +import pycocotools.mask as mask_util + +def custom_encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. Semantic Masks only + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + Returns: + list | tuple: RLE encoded mask. + """ + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [] + for i in range(len(cls_segms)): + encoded_mask_results.append( + mask_util.encode( + np.array( + cls_segms[i][:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + return [encoded_mask_results] + +def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + Returns: + list: The prediction results. 
+ """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(data,return_loss=False, rescale=True) + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + if i>150: + break + + #if isinstance(result[0], tuple): + # assert False, 'this code is for instance segmentation, which our code will not utilize.' + # result = [(bbox_results, encode_mask_results(mask_results)) + # for bbox_results, mask_results in result] + if rank == 0: + + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + bbox_results = collect_results_gpu(bbox_results, len(dataset)) + if have_mask: + mask_results = collect_results_gpu(mask_results, len(dataset)) + else: + mask_results = None + else: + bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) + tmpdir = tmpdir+'_mask' if tmpdir is not None else None + if have_mask: + mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) + else: + mask_results = None + + return {'bbox_results': bbox_results, 'mask_results': mask_results} + + +def collect_results_cpu(result_part, size, tmpdir=None): + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mkdir_or_exist(tmpdir) + # dump the part result to the dir + dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_list.append(load(part_file)) + # sort the results + ordered_results = [] + ''' + bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, + ''' + #for res in zip(*part_list): + for res in part_list: + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + collect_results_cpu(result_part, size) + + +def single_gpu_test(model, data_loader): + """Test model with single gpu. + + This method tests model with single gpu and gives the 'show' option. + By setting ``show=True``, it saves the visualization results under + ``out_dir``. + + Args: + model (nn.Module): Model to be tested. 
+ data_loader (nn.Dataloader): Pytorch data loader. + show (bool, optional): Whether to save viualization results. + Default: True. + out_dir (str, optional): The path to save visualization results. + Default: None. + + Returns: + list[dict]: The prediction results. + """ + model.eval() + bbox_results = [] + mask_results = [] + dataset = data_loader.dataset + prog_bar = ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + have_mask = False + + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(data,return_loss=False, rescale=True) + batch_size = len(result['bbox_results']) + + # encode mask results + if isinstance(result, dict): + if 'bbox_results' in result.keys(): + bbox_result = result['bbox_results'] + batch_size = len(result['bbox_results']) + bbox_results.extend(bbox_result) + if 'mask_results' in result.keys() and result['mask_results'] is not None: + mask_result = custom_encode_mask_results(result['mask_results']) + mask_results.extend(mask_result) + have_mask = True + else: + batch_size = len(result) + bbox_results.extend(result) + + if isinstance(result[0], tuple): + assert False, 'this code is for instance segmentation, which our code will not utilize.' + result = [(bbox_results, encode_mask_results(mask_results)) + for bbox_results, mask_results in result] + + for _ in range(batch_size): + prog_bar.update() + + return {'bbox_results': bbox_results, 'mask_results': mask_results} diff --git a/adzoo/vad/apis/train.py b/adzoo/vad/apis/train.py new file mode 100644 index 0000000..049cd5c --- /dev/null +++ b/adzoo/vad/apis/train.py @@ -0,0 +1,60 @@ +from .mmdet_train import custom_train_detector + + +def custom_train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + eval_model=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. + """ + if cfg.model.type in ['EncoderDecoder3D']: + assert False + else: + custom_train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + eval_model=eval_model, + meta=meta) + + +def train_model(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """A function wrapper for launching model training according to cfg. + + Because we need different eval_hook in runner. Should be deprecated in the + future. 
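+
+    NOTE: train_segmentor and train_detector are not imported in this module
+    (adzoo/vad/apis/__init__.py only re-exports custom_train_model and
+    custom_train_detector), so calling train_model as-is raises a NameError;
+    use custom_train_model above instead.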
+ """ + if cfg.model.type in ['EncoderDecoder3D']: + train_segmentor( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) + else: + train_detector( + model, + dataset, + cfg, + distributed=distributed, + validate=validate, + timestamp=timestamp, + meta=meta) diff --git a/adzoo/vad/configs/VAD/VAD_base_e2e.py b/adzoo/vad/configs/VAD/VAD_base_e2e.py new file mode 100644 index 0000000..10319db --- /dev/null +++ b/adzoo/vad/configs/VAD/VAD_base_e2e.py @@ -0,0 +1,438 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] +# + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +num_classes = len(class_names) + +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing', 'boundary'] +map_num_vec = 100 +map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +map_fixed_ptsnum_per_pred_line = 20 +map_eval_use_same_gt_sample_num_flag = True +map_num_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. +total_epochs = 60 + +model = dict( + type='VAD', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='VADHead', + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + tot_epoch=total_epochs, + use_traj_lr_warmup=False, + query_thresh=0.0, + query_use_fix_pad=False, + ego_his_encoder=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + ego_agent_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + ego_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + 
num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + use_pe=True, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=num_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + map_num_vec=map_num_vec, + map_num_classes=map_num_classes, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line, + map_query_embed_type='instance_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v2', + map_dir_interval=1, + map_code_size=2, + map_code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='VADPerceptionTransformer', + map_num_vec=map_num_vec, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + map_decoder=dict( + type='MapDetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='CustomNMSFreeCoder', + post_center_range=[-20, -35, -10.0, 20, 35, 10.0], + pc_range=point_cloud_range, + max_num=100, + voxel_size=voxel_size, + num_classes=num_classes), + map_bbox_coder=dict( + type='MapNMSFreeCoder', + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=map_num_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + 
loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_traj=dict(type='L1Loss', loss_weight=0.2), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005), + loss_plan_reg=dict(type='L1Loss', loss_weight=1.0), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0), + loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0), + loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range), + map_assigner=dict( + type='MapHungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0), + pc_range=point_cloud_range)))) + +dataset_type = 'VADCustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='VADObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='VADFormatBundle3D', class_names=class_names, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs', + 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat', 'gt_attr_labels']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='VADObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag', + 
'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', + 'ego_lcf_feat', 'gt_attr_labels'])]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'vad_nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + queue_length=queue_length, + map_classes=map_classes, + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + custom_eval_version='vad_nusc_detection_cvpr_2019'), + val=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + test=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) + +evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=1, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# fp16 = dict(loss_scale=512.) 
+# find_unused_parameters = True +checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs) + + +custom_hooks = [dict(type='CustomSetEpochInfoHook')] \ No newline at end of file diff --git a/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py b/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py new file mode 100644 index 0000000..8d59fa0 --- /dev/null +++ b/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py @@ -0,0 +1,568 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] + + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# For nuScenes we usually do 10-class detection + +NameMapping = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + "vehicle.gazelle.omafiets": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "vehicle.audi.etron": 'car', + "vehicle.ford.crown": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.tesla.model3": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Charger/SM_ChargerParked.SM_ChargerParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Lincoln/SM_LincolnParked.SM_LincolnParked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/MercedesCCC/SM_MercedesCCC_Parked.SM_MercedesCCC_Parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/Mini2021/SM_Mini2021_parked.SM_Mini2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/NissanPatrol2021/SM_NissanPatrol2021_parked.SM_NissanPatrol2021_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/TeslaM3/SM_TeslaM3_parked.SM_TeslaM3_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "vehicle.ford.ambulance": "van", + # truck + "vehicle.carlamotors.firetruck": 'truck', + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'traffic_sign', + "traffic.speed_limit.40": 'traffic_sign', + "traffic.speed_limit.50": 'traffic_sign', + "traffic.speed_limit.60": 'traffic_sign', + "traffic.speed_limit.90": 'traffic_sign', + "traffic.speed_limit.120": 'traffic_sign', + + "traffic.stop": 'traffic_sign', + "traffic.yield": 'traffic_sign', + "traffic.traffic_light": 'traffic_light', + #========================================= + + #===================Construction=========== + "static.prop.warningconstruction" : 'traffic_cone', + "static.prop.warningaccident": 'traffic_cone', + "static.prop.trafficwarning": "traffic_cone", + + #===================Construction=========== + "static.prop.constructioncone": 'traffic_cone', + 
+ #=================pedestrian============== + "walker.pedestrian.0001": 'pedestrian', + "walker.pedestrian.0004": 'pedestrian', + "walker.pedestrian.0005": 'pedestrian', + "walker.pedestrian.0007": 'pedestrian', + "walker.pedestrian.0013": 'pedestrian', + "walker.pedestrian.0014": 'pedestrian', + "walker.pedestrian.0017": 'pedestrian', + "walker.pedestrian.0018": 'pedestrian', + "walker.pedestrian.0019": 'pedestrian', + "walker.pedestrian.0020": 'pedestrian', + "walker.pedestrian.0022": 'pedestrian', + "walker.pedestrian.0025": 'pedestrian', + "walker.pedestrian.0035": 'pedestrian', + "walker.pedestrian.0041": 'pedestrian', + "walker.pedestrian.0046": 'pedestrian', + "walker.pedestrian.0047": 'pedestrian', + + # ========================================== + "static.prop.dirtdebris01": 'others', + "static.prop.dirtdebris02": 'others', +} + +eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'}, + "class_range":{'car':(50,50),'van':(50,50),'truck':(50,50),'bicycle':(40,40),'traffic_sign':(30,30),'traffic_cone':(30,30),'traffic_light':(30,30),'pedestrian':(40,40)} + } + +class_names = [ +'car','van','truck','bicycle','traffic_sign','traffic_cone','traffic_light','pedestrian','others' +] +num_classes = len(class_names) + +# map has classes: divider, ped_crossing, boundary +map_classes = ['Broken','Solid','SolidSolid','Center','TrafficLight','StopSign'] +map_num_vec = 100 +map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +map_fixed_ptsnum_per_pred_line = 20 +map_eval_use_same_gt_sample_num_flag = True +map_num_classes = len(map_classes) +past_frames = 2 +future_frames = 6 + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 4 +bev_h_ = 200 +bev_w_ = 200 +queue_length = 4 # each sequence contains `queue_length` frames. 
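+# Note: the 200 x 200 BEV grid (bev_h_ x bev_w_) spans the 30 m x 60 m
+# point_cloud_range above, i.e. each BEV cell covers roughly 0.15 m x 0.30 m.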
+total_epochs = 60 + +model = dict( + type='VAD', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='ckpts/resnet50-19c8e357.pth'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[512, 1024, 2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='VADHead', + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + tot_epoch=total_epochs, + use_traj_lr_warmup=False, + query_thresh=0.0, + query_use_fix_pad=False, + ego_his_encoder=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + ego_fut_mode=6, + ego_agent_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + ego_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + use_pe=True, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=num_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + map_num_vec=map_num_vec, + map_num_classes=map_num_classes, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line, + map_query_embed_type='instance_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v2', + map_dir_interval=1, + map_code_size=2, + map_code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='VADPerceptionTransformer', + map_num_vec=map_num_vec, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=6, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + 
num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + map_decoder=dict( + type='MapDetectionTransformerDecoder', + num_layers=6, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='CustomNMSFreeCoder', + post_center_range=[-20, -35, -10.0, 20, 35, 10.0], + pc_range=point_cloud_range, + max_num=100, + voxel_size=voxel_size, + num_classes=num_classes), + map_bbox_coder=dict( + type='MapNMSFreeCoder', + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=map_num_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_traj=dict(type='L1Loss', loss_weight=0.2), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005), + loss_plan_reg=dict(type='L1Loss', loss_weight=1.0), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0), + loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0), + loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
+ pc_range=point_cloud_range), + map_assigner=dict( + type='MapHungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0), + pc_range=point_cloud_range)))) + +dataset_type = "B2D_VAD_Dataset" +data_root = "data/bench2drive" +info_root = "data/infos" +map_root = "data/bench2drive/maps" +map_file = "data/infos/b2d_map_infos.pkl" +file_client_args = dict(backend="disk") +ann_file_train=info_root + f"/b2d_infos_train.pkl" +ann_file_val=info_root + f"/b2d_infos_val.pkl" +ann_file_test=info_root + f"/b2d_infos_val.pkl" + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='VADObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='VADFormatBundle3D', class_names=class_names, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs','gt_attr_labels','ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='VADObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='VADObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag', + 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', + 'ego_lcf_feat','gt_attr_labels'])]) +] + +inference_only_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D', keys=[ 'img', 'ego_fut_cmd'])]) +] + + +data = dict( + samples_per_gpu=1, + workers_per_gpu=6, + train=dict( + + type=dataset_type, + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + past_frames=past_frames, + future_frames=future_frames, + point_cloud_range=point_cloud_range, + polyline_points_num=map_fixed_ptsnum_per_gt_line, + # we use box_type_3d='LiDAR' 
in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + #custom_eval_version='vad_nusc_detection_cvpr_2019' + ), + val=dict(type=dataset_type, + + data_root=data_root, + ann_file=ann_file_train, + pipeline=train_pipeline, + classes=class_names, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + past_frames=past_frames, + future_frames=future_frames, + point_cloud_range=point_cloud_range, + polyline_points_num=map_fixed_ptsnum_per_gt_line, + #use_pkl_result=True, + #custom_eval_version='vad_nusc_detection_cvpr_2019' + ), + test=dict(type=dataset_type, + data_root=data_root, + ann_file=ann_file_val, + pipeline=test_pipeline, + classes=class_names, + name_mapping=NameMapping, + map_root=map_root, + map_file=map_file, + modality=input_modality, + bev_size=(bev_h_, bev_w_), + queue_length=queue_length, + past_frames=past_frames, + future_frames=future_frames, + point_cloud_range=point_cloud_range, + polyline_points_num=map_fixed_ptsnum_per_gt_line, + eval_cfg=eval_cfg + ), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) + +evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=1, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# fp16 = dict(loss_scale=512.) 
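# lr_config above specifies a linear warmup (500 iterations, starting at base_lr * 1/3)
# followed by cosine annealing down to base_lr * 1e-3, with base_lr = 2e-4 from the AdamW
# optimizer. The helper below is a rough per-iteration sketch of that schedule, assuming a
# known total iteration count; the real runner applies it through LR hooks.
import math

def lr_at(it, total_iters, base_lr=2e-4, warmup_iters=500,
          warmup_ratio=1.0 / 3, min_lr_ratio=1e-3):
    if it < warmup_iters:                                    # linear warmup phase
        k = it / float(warmup_iters)
        return base_lr * (warmup_ratio + (1.0 - warmup_ratio) * k)
    t = (it - warmup_iters) / float(max(total_iters - warmup_iters, 1))
    min_lr = base_lr * min_lr_ratio                          # cosine annealing phase
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * t))

# lr_at(0, 10000) ~ 6.7e-5, lr_at(500, 10000) = 2e-4, lr_at(10000, 10000) = 2e-7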
+# find_unused_parameters = True +checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs) + + +custom_hooks = [dict(type='CustomSetEpochInfoHook')] \ No newline at end of file diff --git a/adzoo/vad/configs/VAD/VAD_tiny_e2e.py b/adzoo/vad/configs/VAD/VAD_tiny_e2e.py new file mode 100644 index 0000000..67e088a --- /dev/null +++ b/adzoo/vad/configs/VAD/VAD_tiny_e2e.py @@ -0,0 +1,454 @@ +_base_ = [ + '../datasets/custom_nus-3d.py', + '../_base_/default_runtime.py' +] + + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] +voxel_size = [0.15, 0.15, 4] + +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] +num_classes = len(class_names) + +# map has classes: divider, ped_crossing, boundary +map_classes = ['divider', 'ped_crossing', 'boundary'] +map_num_vec = 100 +map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 +map_fixed_ptsnum_per_pred_line = 20 +map_eval_use_same_gt_sample_num_flag = True +map_num_classes = len(map_classes) + +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=True) + +_dim_ = 256 +_pos_dim_ = _dim_//2 +_ffn_dim_ = _dim_*2 +_num_levels_ = 1 +bev_h_ = 100 +bev_w_ = 100 +queue_length = 3 # each sequence contains `queue_length` frames. +total_epochs = 60 + +model = dict( + type='VAD', + use_grid_mask=True, + video_test_mode=True, + pretrained=dict(img='torchvision://resnet50'), + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(3,), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='pytorch'), + img_neck=dict( + type='FPN', + in_channels=[2048], + out_channels=_dim_, + start_level=0, + add_extra_convs='on_output', + num_outs=_num_levels_, + relu_before_extra_convs=True), + pts_bbox_head=dict( + type='VADHead', + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + tot_epoch=total_epochs, + use_traj_lr_warmup=False, + query_thresh=0.0, + query_use_fix_pad=False, + ego_his_encoder=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + ego_agent_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + ego_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + motion_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + 
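# Quick sanity check on the values defined earlier in this tiny config: with
# point_cloud_range = [-15, -30, -2, 15, 30, 2] and a 100 x 100 BEV grid, each BEV query
# roughly corresponds to a 0.3 m x 0.6 m ground-plane cell. The *_tiny names below are local
# to this sketch, not config keys.
point_cloud_range_tiny = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
bev_h_tiny, bev_w_tiny = 100, 100
x_res = (point_cloud_range_tiny[3] - point_cloud_range_tiny[0]) / bev_w_tiny   # 0.3 m per cell
y_res = (point_cloud_range_tiny[4] - point_cloud_range_tiny[1]) / bev_h_tiny   # 0.6 m per cell
print(x_res, y_res)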
motion_map_decoder=dict( + type='CustomTransformerDecoder', + num_layers=1, + return_intermediate=False, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), + use_pe=True, + bev_h=bev_h_, + bev_w=bev_w_, + num_query=300, + num_classes=num_classes, + in_channels=_dim_, + sync_cls_avg_factor=True, + with_box_refine=True, + as_two_stage=False, + map_num_vec=map_num_vec, + map_num_classes=map_num_classes, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + map_num_pts_per_gt_vec=map_fixed_ptsnum_per_gt_line, + map_query_embed_type='instance_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v2', + map_dir_interval=1, + map_code_size=2, + map_code_weights=[1.0, 1.0, 1.0, 1.0], + transformer=dict( + type='VADPerceptionTransformer', + map_num_vec=map_num_vec, + map_num_pts_per_vec=map_fixed_ptsnum_per_pred_line, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + embed_dims=_dim_, + encoder=dict( + type='BEVFormerEncoder', + num_layers=3, + pc_range=point_cloud_range, + num_points_in_pillar=4, + return_intermediate=False, + transformerlayers=dict( + type='BEVFormerLayer', + attn_cfgs=[ + dict( + type='TemporalSelfAttention', + embed_dims=_dim_, + num_levels=1), + dict( + type='SpatialCrossAttention', + pc_range=point_cloud_range, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=_dim_, + num_points=8, + num_levels=_num_levels_), + embed_dims=_dim_, + ) + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + decoder=dict( + type='DetectionTransformerDecoder', + num_layers=3, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm'))), + map_decoder=dict( + type='MapDetectionTransformerDecoder', + num_layers=3, + return_intermediate=True, + transformerlayers=dict( + type='DetrTransformerDecoderLayer', + attn_cfgs=[ + dict( + type='MultiheadAttention', + embed_dims=_dim_, + num_heads=8, + dropout=0.1), + dict( + type='CustomMSDeformableAttention', + embed_dims=_dim_, + num_levels=1), + ], + feedforward_channels=_ffn_dim_, + ffn_dropout=0.1, + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', + 'ffn', 'norm')))), + bbox_coder=dict( + type='CustomNMSFreeCoder', + post_center_range=[-20, -35, -10.0, 20, 35, 10.0], + pc_range=point_cloud_range, + max_num=100, + voxel_size=voxel_size, + num_classes=num_classes), + map_bbox_coder=dict( + type='MapNMSFreeCoder', + post_center_range=[-20, -35, -20, -35, 20, 35, 20, 35], + pc_range=point_cloud_range, + max_num=50, + voxel_size=voxel_size, + num_classes=map_num_classes), + positional_encoding=dict( + type='LearnedPositionalEncoding', + num_feats=_pos_dim_, + row_num_embed=bev_h_, + col_num_embed=bev_w_, + ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type='L1Loss', loss_weight=0.25), + loss_traj=dict(type='L1Loss', loss_weight=0.2), + loss_traj_cls=dict( + 
type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.2), + loss_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=0.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=0.0), + loss_map_pts=dict(type='PtsL1Loss', loss_weight=1.0), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=0.005), + loss_plan_reg=dict(type='L1Loss', loss_weight=1.0), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=1.0), + loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=1.0), + loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=0.5)), + # model training and testing settings + train_cfg=dict(pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type='HungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBox3DL1Cost', weight=0.25), + iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range), + map_assigner=dict( + type='MapHungarianAssigner3D', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', weight=0.0, box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=0.0), + pts_cost=dict(type='OrderedPtsL1Cost', weight=1.0), + pc_range=point_cloud_range)))) + +dataset_type = 'VADCustomNuScenesDataset' +data_root = 'data/nuscenes/' +file_client_args = dict(backend='disk') + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='CustomObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='RandomScaleImageMultiViewImage', scales=[0.4]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img', 'ego_his_trajs', + 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', 'ego_lcf_feat', 'gt_attr_labels']) +] + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=True), + dict(type='CustomObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='CustomObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + # dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.4]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='CustomDefaultFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D',\ + keys=['points', 'gt_bboxes_3d', 'gt_labels_3d', 'img', 'fut_valid_flag', + 'ego_his_trajs', 'ego_fut_trajs', 'ego_fut_masks', 'ego_fut_cmd', + 'ego_lcf_feat', 'gt_attr_labels'])]) +] + 
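# The pipelines above first rescale the 1600 x 900 nuScenes camera images by the factor in
# `scales` and then pad each side up to a multiple of `size_divisor`. A small sketch of that
# shape bookkeeping (pure arithmetic; the real transforms may round slightly differently):
def scaled_padded_shape(w, h, scale, size_divisor=32):
    sw, sh = int(w * scale), int(h * scale)                  # RandomScaleImageMultiViewImage
    pad = lambda v: ((v + size_divisor - 1) // size_divisor) * size_divisor
    return (sw, sh), (pad(sw), pad(sh))                      # PadMultiViewImage

print(scaled_padded_shape(1600, 900, 0.4))   # ((640, 360), (640, 384))
print(scaled_padded_shape(1600, 900, 0.8))   # ((1280, 720), (1280, 736))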
+inference_only_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1600, 900), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type='RandomScaleImageMultiViewImage', scales=[0.8]), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='VADFormatBundle3D', class_names=class_names, with_label=False, with_ego=True), + dict(type='CustomCollect3D', keys=[ 'img', 'ego_fut_cmd'])]) +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'vad_nuscenes_infos_temporal_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + bev_size=(bev_h_, bev_w_), + pc_range=point_cloud_range, + queue_length=queue_length, + map_classes=map_classes, + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + custom_eval_version='vad_nusc_detection_cvpr_2019'), + val=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + test=dict(type=dataset_type, + data_root=data_root, + pc_range=point_cloud_range, + ann_file=data_root + 'vad_nuscenes_infos_temporal_val.pkl', + pipeline=test_pipeline, bev_size=(bev_h_, bev_w_), + classes=class_names, modality=input_modality, samples_per_gpu=1, + map_classes=map_classes, + map_ann_file=data_root + 'nuscenes_map_anns_val.json', + map_fixed_ptsnum_per_line=map_fixed_ptsnum_per_gt_line, + map_eval_use_same_gt_sample_num_flag=map_eval_use_same_gt_sample_num_flag, + use_pkl_result=True, + custom_eval_version='vad_nusc_detection_cvpr_2019'), + shuffler_sampler=dict(type='DistributedGroupSampler'), + nonshuffler_sampler=dict(type='DistributedSampler') +) + +optimizer = dict( + type='AdamW', + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + 'img_backbone': dict(lr_mult=0.1), + }), + weight_decay=0.01) + +optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) +# learning policy +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3) + +evaluation = dict(interval=total_epochs, pipeline=test_pipeline, metric='bbox', map_metric='chamfer') + +runner = dict(type='EpochBasedRunner', max_epochs=total_epochs) + +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# fp16 = dict(loss_scale=512.) 
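# paramwise_cfg above trains the image backbone at 0.1x the base learning rate while the rest
# of the model uses the full 2e-4, and optimizer_config clips gradients to an L2 norm of 35.
# A minimal PyTorch-style sketch of building such parameter groups; the model and module
# names here are placeholders, not the repository's classes.
import torch

model = torch.nn.ModuleDict({
    'img_backbone': torch.nn.Linear(4, 4),     # stands in for the ResNet backbone
    'pts_bbox_head': torch.nn.Linear(4, 4),    # stands in for the rest of the network
})
base_lr, backbone_mult = 2e-4, 0.1
param_groups = [
    {'params': model['img_backbone'].parameters(), 'lr': base_lr * backbone_mult},
    {'params': model['pts_bbox_head'].parameters(), 'lr': base_lr},
]
opt = torch.optim.AdamW(param_groups, lr=base_lr, weight_decay=0.01)
# Gradient clipping as in optimizer_config, applied each step before opt.step():
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=35, norm_type=2)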
+# find_unused_parameters = True +checkpoint_config = dict(interval=1, max_keep_ckpts=total_epochs) + + +custom_hooks = [dict(type='CustomSetEpochInfoHook')] \ No newline at end of file diff --git a/adzoo/vad/configs/_base_/datasets/coco_instance.py b/adzoo/vad/configs/_base_/datasets/coco_instance.py new file mode 100644 index 0000000..f6ea4f4 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/coco_instance.py @@ -0,0 +1,48 @@ +dataset_type = 'CocoDataset' +data_root = 'data/coco/' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1333, 800), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_train2017.json', + img_prefix=data_root + 'train2017/', + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/instances_val2017.json', + img_prefix=data_root + 'val2017/', + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/adzoo/vad/configs/_base_/datasets/kitti-3d-3class.py b/adzoo/vad/configs/_base_/datasets/kitti-3d-3class.py new file mode 100644 index 0000000..1822af4 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/kitti-3d-3class.py @@ -0,0 +1,140 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Pedestrian', 'Cyclist', 'Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
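# The db_sampler above drives GT-paste augmentation: ground-truth objects are drawn from a
# pre-built database (kitti_dbinfos_train.pkl), instances with too few lidar points are
# filtered out, and up to sample_groups[cls] extra instances per class are pasted into the
# current scene. A schematic, self-contained sketch of the selection step only; the real
# ObjectSample transform also handles collision checks and point merging.
import random

def select_db_samples(db_infos, sample_groups, min_points):
    picked = []
    for cls, num in sample_groups.items():
        pool = [s for s in db_infos.get(cls, [])
                if s['num_points_in_gt'] >= min_points.get(cls, 0)]   # filter_by_min_points
        picked += random.sample(pool, min(num, len(pool)))            # draw up to `num`
    return picked

toy_db = {'Car': [{'name': 'Car', 'num_points_in_gt': n} for n in (3, 8, 50)],
          'Pedestrian': [{'name': 'Pedestrian', 'num_points_in_gt': 12}]}
print(select_db_samples(toy_db, dict(Car=12, Pedestrian=6, Cyclist=6),
                        dict(Car=5, Pedestrian=10, Cyclist=10)))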
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/kitti-3d-car.py b/adzoo/vad/configs/_base_/datasets/kitti-3d-car.py new file mode 100644 index 0000000..1e81226 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/kitti-3d-car.py @@ -0,0 +1,138 @@ +# dataset settings +dataset_type = 'KittiDataset' +data_root = 'data/kitti/' +class_names = ['Car'] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'kitti_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15)) + +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://kitti_data/')) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='ObjectNoise', + num_try=100, + translation_std=[1.0, 1.0, 0.5], + global_rot_range=[0.0, 0.0], + rot_range=[-0.78539816, 0.78539816]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=6, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_train.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'kitti_infos_val.pkl', + split='training', + pts_prefix='velodyne_reduced', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=1, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/lyft-3d.py b/adzoo/vad/configs/_base_/datasets/lyft-3d.py new file mode 100644 index 0000000..71baff0 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
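# PointsRangeFilter / ObjectRangeFilter in the pipelines below keep only points (and boxes)
# that fall inside point_cloud_range = [x_min, y_min, z_min, x_max, y_max, z_max]. A minimal
# masking sketch for the point case, assuming an (N, >=3) xyz(+features) array; boundary
# handling in the real transform may differ.
import numpy as np

def filter_points_in_range(points, pc_range):
    low, high = np.array(pc_range[:3]), np.array(pc_range[3:])
    mask = np.all((points[:, :3] >= low) & (points[:, :3] < high), axis=1)
    return points[mask]

pts = np.array([[0.0, 0.0, 0.0, 0.4],        # kept
                [90.0, 0.0, 0.0, 0.1],       # outside the x range, dropped
                [-79.0, 79.0, 2.9, 0.7]])    # kept
print(filter_points_in_range(pts, [-80, -80, -5, 80, 80, 3]).shape)   # (2, 4)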
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/nuim_instance.py b/adzoo/vad/configs/_base_/datasets/nuim_instance.py new file mode 100644 index 0000000..82fce56 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/nuim_instance.py @@ -0,0 +1,59 @@ +dataset_type = 'CocoDataset' +data_root = 'data/nuimages/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', with_bbox=True, with_mask=True), + dict( + type='Resize', + img_scale=[(1280, 720), (1920, 1080)], + multiscale_mode='range', + keep_ratio=True), + dict(type='RandomFlip', flip_ratio=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle'), + dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict( + type='MultiScaleFlipAug', + img_scale=(1600, 900), + flip=False, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ]) +] +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-train.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline), + val=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline), + test=dict( + type=dataset_type, + ann_file=data_root + 'annotations/nuimages_v1.0-val.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline)) +evaluation = dict(metric=['bbox', 'segm']) diff --git a/adzoo/vad/configs/_base_/datasets/nus-3d.py b/adzoo/vad/configs/_base_/datasets/nus-3d.py new file mode 100644 index 0000000..1548171 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,142 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
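# LoadPointsFromMultiSweeps (sweeps_num=10) in the pipelines below densifies the key-frame
# point cloud by concatenating up to ten previous lidar sweeps, each tagged with its time lag
# relative to the key frame. A schematic sketch of the aggregation step only, on toy arrays;
# the real transform also applies the per-sweep ego-pose transforms and padding logic.
import numpy as np

def merge_sweeps(key_points, sweeps, sweeps_num=10):
    # key_points: (N, 4) x,y,z,intensity; sweeps: list of ((M, 4) points, time_lag_seconds)
    clouds = [np.hstack([key_points, np.zeros((len(key_points), 1))])]   # lag 0 for key frame
    for pts, dt in sweeps[:sweeps_num]:
        clouds.append(np.hstack([pts, np.full((len(pts), 1), dt)]))
    return np.vstack(clouds)                                             # (N + sum(M), 5)

key = np.random.rand(100, 4)
prev = [(np.random.rand(80, 4), 0.05 * (i + 1)) for i in range(3)]
print(merge_sweeps(key, prev).shape)   # (340, 5)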
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/nus-mono3d.py b/adzoo/vad/configs/_base_/datasets/nus-mono3d.py new file mode 100644 index 0000000..1363a94 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/nus-mono3d.py @@ -0,0 +1,100 @@ +dataset_type = 'CustomNuScenesMonoDataset' +data_root = 'data/nuscenes/' +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=False, + use_camera=True, + use_radar=False, + use_map=False, + use_external=False) +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) +train_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='LoadAnnotations3D', + with_bbox=True, + with_label=True, + with_attr_label=True, + with_bbox_3d=True, + with_label_3d=True, + with_bbox_depth=True), + dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', + 'gt_labels_3d', 'centers2d', 'depths' + ]), +] +test_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='MultiScaleFlipAug', + scale_factor=1.0, + flip=False, + transforms=[ + dict(type='RandomFlip3D'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']), + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['img']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=train_pipeline, + modality=input_modality, + test_mode=False, + box_type_3d='Camera'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', + img_prefix=data_root, + classes=class_names, + pipeline=test_pipeline, + modality=input_modality, + test_mode=True, + box_type_3d='Camera')) +evaluation = dict(interval=2) diff --git a/adzoo/vad/configs/_base_/datasets/range100_lyft-3d.py b/adzoo/vad/configs/_base_/datasets/range100_lyft-3d.py new file mode 100644 index 0000000..efa63ea --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/range100_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-100, -100, -5, 100, 100, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'LyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
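# GlobalRotScaleTrans in the lidar train pipelines of these configs applies a random global
# yaw rotation (here rot_range ~ +/-0.3925 rad, about +/-22.5 deg), an isotropic scale and a
# translation to the whole scene, points and boxes together. A minimal sketch for the
# point-cloud part only, assuming z-up coordinates; the exact sign and axis conventions of
# the real transform may differ.
import numpy as np

def global_rot_scale_trans(points, yaw, scale, trans=(0.0, 0.0, 0.0)):
    c, s = np.cos(yaw), np.sin(yaw)
    rot_z = np.array([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])
    out = points.copy()
    out[:, :3] = out[:, :3] @ rot_z.T * scale + np.asarray(trans)
    return out

pts = np.array([[10.0, 0.0, 0.0, 0.5]])
print(global_rot_scale_trans(pts, yaw=np.pi / 8, scale=1.05))
# x,y rotate by ~22.5 deg and stretch by 5%; the intensity column is untouched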
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_test.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
+evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/s3dis-3d-5class.py b/adzoo/vad/configs/_base_/datasets/s3dis-3d-5class.py new file mode 100644 index 0000000..2422766 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/s3dis-3d-5class.py @@ -0,0 +1,114 @@ +# dataset settings +dataset_type = 'S3DISDataset' +data_root = './data/s3dis/' +class_names = ('table', 'chair', 'sofa', 'bookcase', 'board') +train_area = [1, 2, 3, 4, 6] +test_area = 5 + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + # following ScanNet dataset the rotation range is 5 degrees + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type='ConcatDataset', + datasets=[ + dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{i}.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + box_type_3d='Depth') for i in train_area + ], + separate_eval=False)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/s3dis_seg-3d-13class.py b/adzoo/vad/configs/_base_/datasets/s3dis_seg-3d-13class.py new file mode 100644 index 0000000..39bf556 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/s3dis_seg-3d-13class.py @@ -0,0 +1,139 @@ +# dataset settings +dataset_type = 'S3DISSegDataset' +data_root = './data/s3dis/' +class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', + 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') +num_points = 4096 +train_area = [1, 2, 3, 4, 6] +test_area = 5 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.0, + ignore_index=len(class_names), + use_normalized_coord=True, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! 
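# PointSegClassMapping above remaps raw per-point category ids onto contiguous train ids:
# ids listed in valid_cat_ids become 0..len(valid_cat_ids)-1 and everything else is mapped to
# the ignore index (len(valid_cat_ids)), which the segmentation loss then skips via
# ignore_index. A compact lookup-table sketch with toy ids, not the library code:
import numpy as np

def map_seg_labels(raw_labels, valid_cat_ids, max_cat_id):
    ignore_index = len(valid_cat_ids)
    lut = np.full(max_cat_id + 1, ignore_index, dtype=np.int64)
    lut[np.array(valid_cat_ids)] = np.arange(len(valid_cat_ids))
    return lut[raw_labels]

# toy example: valid ids (1, 2, 3, 5) -> train ids 0..3, everything else -> ignore id 4
print(map_seg_labels(np.array([1, 5, 4, 0]), (1, 2, 3, 5), max_cat_id=6))   # [0 3 4 4]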
+eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=tuple(range(len(class_names))), + max_cat_id=13), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + # train on area 1, 2, 3, 4, 6 + # test on area 5 + train=dict( + type=dataset_type, + data_root=data_root, + ann_files=[ + data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area + ], + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=[ + data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' + for i in train_area + ]), + val=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names), + scene_idxs=data_root + + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/scannet-3d-18class.py b/adzoo/vad/configs/_base_/datasets/scannet-3d-18class.py new file mode 100644 index 0000000..93da1e5 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/scannet-3d-18class.py @@ -0,0 +1,128 @@ +# dataset settings +dataset_type = 'ScanNetDataset' +data_root = './data/scannet/' +class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', + 'bookshelf', 'picture', 'counter', 'desk', 'curtain', + 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', + 'garbagebin') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_mask_3d=True, + with_seg_3d=True), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='PointSegClassMapping', + valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, + 36, 39), + max_cat_id=40), + dict(type='PointSample', num_points=40000), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.087266, 0.087266], + scale_ratio_range=[1.0, 1.0], + shift_height=True), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict( + type='Collect3D', + keys=[ + 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', + 'pts_instance_mask' + ]) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + 
flip_ratio_bev_vertical=0.5), + dict(type='PointSample', num_points=40000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='GlobalAlignment', rotation_axis=2), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + filter_empty_gt=False, + classes=class_names, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/scannet_seg-3d-20class.py b/adzoo/vad/configs/_base_/datasets/scannet_seg-3d-20class.py new file mode 100644 index 0000000..cf73b09 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/scannet_seg-3d-20class.py @@ -0,0 +1,132 @@ +# dataset settings +dataset_type = 'ScanNetSegDataset' +data_root = './data/scannet/' +class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', + 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', + 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', + 'bathtub', 'otherfurniture') +num_points = 8192 +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='IndoorPatchPointSample', + num_points=num_points, + block_size=1.5, + ignore_index=len(class_names), + use_normalized_coord=False, + enlarge_size=0.2, + min_unique_num=None), + dict(type='NormalizePointsColor', color_mean=None), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict(type='NormalizePointsColor', color_mean=None), + dict( + # a wrapper in order to successfully call test function + # actually we don't perform test-time-aug + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + 
type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +# we need to load gt seg_mask! +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + use_color=True, + load_dim=6, + use_dim=[0, 1, 2, 3, 4, 5]), + dict( + type='LoadAnnotations3D', + with_bbox_3d=False, + with_label_3d=False, + with_mask_3d=False, + with_seg_3d=True), + dict( + type='PointSegClassMapping', + valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, + 33, 34, 36, 39), + max_cat_id=40), + dict( + type='DefaultFormatBundle3D', + with_label=False, + class_names=class_names), + dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) +] + +data = dict( + samples_per_gpu=8, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + test_mode=False, + ignore_index=len(class_names), + scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy'), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names)), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'scannet_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + ignore_index=len(class_names))) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/sunrgbd-3d-10class.py b/adzoo/vad/configs/_base_/datasets/sunrgbd-3d-10class.py new file mode 100644 index 0000000..7121b75 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/sunrgbd-3d-10class.py @@ -0,0 +1,107 @@ +dataset_type = 'SUNRGBDDataset' +data_root = 'data/sunrgbd/' +class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', + 'night_stand', 'bookshelf', 'bathtub') +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict(type='LoadAnnotations3D'), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.523599, 0.523599], + scale_ratio_range=[0.85, 1.15], + shift_height=True), + dict(type='PointSample', num_points=20000), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=True, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + ), + dict(type='PointSample', num_points=20000), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function 
consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='DEPTH', + shift_height=False, + load_dim=6, + use_dim=[0, 1, 2]), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=16, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=5, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + filter_empty_gt=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='Depth')), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'sunrgbd_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + test_mode=True, + box_type_3d='Depth')) + +evaluation = dict(pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/waymoD5-3d-3class.py b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-3class.py new file mode 100644 index 0000000..920ac15 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-3class.py @@ -0,0 +1,145 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'LidarWaymoDataset' +data_root = 'data/waymo-full/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/datasets/waymoD5-3d-car.py b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-car.py new file mode 100644 index 0000000..02e2627 --- /dev/null +++ b/adzoo/vad/configs/_base_/datasets/waymoD5-3d-car.py @@ -0,0 +1,143 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'WaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +class_names = ['Car'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=True, use_camera=False) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), + classes=class_names, + sample_groups=dict(Car=15), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='RandomFlip3D', + sync_2d=False, + flip_ratio_bev_horizontal=0.5, + flip_ratio_bev_vertical=0.5), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05]), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/_base_/default_runtime.py b/adzoo/vad/configs/_base_/default_runtime.py new file mode 100644 index 0000000..4e85b69 --- /dev/null +++ b/adzoo/vad/configs/_base_/default_runtime.py @@ -0,0 +1,18 @@ +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + dict(type='TensorboardLoggerHook') + ]) +# yapf:enable +dist_params = dict(backend='nccl') +log_level = 'INFO' +work_dir = None +load_from = None +resume_from = None +workflow = [('train', 1)] diff --git a/adzoo/vad/configs/_base_/models/3dssd.py b/adzoo/vad/configs/_base_/models/3dssd.py new file mode 100644 index 0000000..55344c7 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/3dssd.py @@ -0,0 +1,77 @@ +model = dict( + type='SSD3DNet', + backbone=dict( + type='PointNet2SAMSG', + in_channels=4, + num_points=(4096, 512, (256, 256)), + radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), + num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), + sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), + ((64, 64, 128), (64, 64, 128), (64, 96, 128)), + ((128, 128, 256), (128, 192, 256), (128, 256, 256))), + aggregation_channels=(64, 128, 256), + fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), + fps_sample_range_lists=((-1), (-1), (512, -1)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + bbox_head=dict( + type='SSD3DHead', + in_channels=256, + vote_module_cfg=dict( + in_channels=256, + num_points=256, + gt_per_seed=1, + conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + with_res_feat=False, + vote_xyz_range=(3.0, 3.0, 2.0)), + vote_aggregation_cfg=dict( + type='PointSAModuleMSG', + num_point=256, + radii=(4.8, 6.4), + sample_nums=(16, 32), + mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), + norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), + use_xyz=True, + 
normalize_xyz=False, + bias=True), + pred_layer_cfg=dict( + in_channels=1536, + shared_conv_channels=(512, 128), + cls_conv_channels=(128, ), + reg_conv_channels=(128, ), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), + objectness_loss=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + corner_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=1.0), + vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), + test_cfg=dict( + nms_cfg=dict(type='nms', iou_thr=0.1), + sample_mod='spec', + score_thr=0.0, + per_class_proposal=True, + max_output_num=100)) diff --git a/adzoo/vad/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py b/adzoo/vad/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py new file mode 100644 index 0000000..fb9e0a8 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/cascade_mask_rcnn_r50_fpn.py @@ -0,0 +1,200 @@ +# model settings +model = dict( + type='CascadeRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), + roi_head=dict( + type='CascadeRoIHead', + num_stages=3, + stage_loss_weights=[1, 0.5, 0.25], + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=[ + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.05, 0.05, 0.1, 0.1]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0)), + dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + 
roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.033, 0.033, 0.067, 0.067]), + reg_class_agnostic=True, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) + ], + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=2000, + max_num=2000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=[ + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.6, + neg_iou_thr=0.6, + min_pos_iou=0.6, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False), + dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.7, + min_pos_iou=0.7, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False) + ]), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/adzoo/vad/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py b/adzoo/vad/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py new file mode 100644 index 0000000..efdce59 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.1, 0.1, 0.2] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[41, 1024, 1024], + output_channels=128, + order=('conv', 'norm', 'act'), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + 
conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([256, 256]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[1024, 1024, 40], + voxel_size=voxel_size, + out_size_factor=8, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + out_size_factor=8, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/adzoo/vad/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py b/adzoo/vad/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py new file mode 100644 index 0000000..311d763 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py @@ -0,0 +1,83 @@ +voxel_size = [0.2, 0.2, 8] +model = dict( + type='CenterPoint', + pts_voxel_layer=dict( + max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=(0.2, 0.2, 8), + norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), + legacy=False), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), + pts_backbone=dict( + type='SECOND', + in_channels=64, + out_channels=[64, 128, 256], + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + out_channels=[128, 128, 128], + upsample_strides=[0.5, 1, 2], + norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + pts_bbox_head=dict( + type='CenterHead', + in_channels=sum([128, 128, 128]), + tasks=[ + dict(num_class=1, class_names=['car']), + dict(num_class=2, class_names=['truck', 'construction_vehicle']), + dict(num_class=2, class_names=['bus', 'trailer']), + dict(num_class=1, class_names=['barrier']), + dict(num_class=2, class_names=['motorcycle', 'bicycle']), + 
dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), + ], + common_heads=dict( + reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), + share_conv_channel=64, + bbox_coder=dict( + type='CenterPointBBoxCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_num=500, + score_threshold=0.1, + out_size_factor=4, + voxel_size=voxel_size[:2], + code_size=9), + separate_head=dict( + type='SeparateHead', init_bias=-2.19, final_kernel=3), + loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), + loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), + norm_bbox=True), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + out_size_factor=4, + dense_reg=1, + gaussian_overlap=0.1, + max_objs=500, + min_radius=2, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), + test_cfg=dict( + pts=dict( + post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + max_per_img=500, + max_pool_nms=False, + min_radius=[4, 12, 10, 1, 0.85, 0.175], + score_threshold=0.1, + pc_range=[-51.2, -51.2], + out_size_factor=4, + voxel_size=voxel_size[:2], + nms_type='rotate', + pre_max_size=1000, + post_max_size=83, + nms_thr=0.2))) diff --git a/adzoo/vad/configs/_base_/models/fcos3d.py b/adzoo/vad/configs/_base_/models/fcos3d.py new file mode 100644 index 0000000..92ea907 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/fcos3d.py @@ -0,0 +1,74 @@ +model = dict( + type='FCOSMono3D', + pretrained='open-mmlab://detectron2/resnet101_caffe', + backbone=dict( + type='ResNet', + depth=101, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + start_level=1, + add_extra_convs='on_output', + num_outs=5, + relu_before_extra_convs=True), + bbox_head=dict( + type='FCOSMono3DHead', + num_classes=10, + in_channels=256, + stacked_convs=2, + feat_channels=256, + use_direction_classifier=True, + diff_rad_by_sin=True, + pred_attrs=True, + pred_velo=True, + dir_offset=0.7854, # pi/4 + strides=[8, 16, 32, 64, 128], + group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo + cls_branch=(256, ), + reg_branch=( + (256, ), # offset + (256, ), # depth + (256, ), # size + (256, ), # rot + () # velo + ), + dir_branch=(256, ), + attr_branch=(256, ), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_attr=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_centerness=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + norm_on_bbox=True, + centerness_on_reg=True, + center_sampling=True, + conv_bias=True, + dcn_on_last_conv=True), + train_cfg=dict( + allowed_border=0, + code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.8, + score_thr=0.05, + min_bbox_size=0, + max_per_img=200)) diff --git a/adzoo/vad/configs/_base_/models/groupfree3d.py b/adzoo/vad/configs/_base_/models/groupfree3d.py new file mode 100644 index 0000000..077d049 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/groupfree3d.py @@ -0,0 +1,71 @@ +model = dict( + 
type='GroupFree3DNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=3, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 288)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='GroupFree3DHead', + in_channels=288, + num_decoder_layers=6, + num_proposal=256, + transformerlayers=dict( + type='BaseTransformerLayer', + attn_cfgs=dict( + type='GroupFree3DMHA', + embed_dims=288, + num_heads=8, + attn_drop=0.1, + dropout_layer=dict(type='Dropout', drop_prob=0.1)), + ffn_cfgs=dict( + embed_dims=288, + feedforward_channels=2048, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True)), + operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', + 'norm')), + pred_layer_cfg=dict( + in_channels=288, shared_conv_channels=(288, 288), bias=True), + sampling_objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=8.0), + objectness_loss=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + center_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(sample_mod='kps'), + test_cfg=dict( + sample_mod='kps', + nms_thr=0.25, + score_thr=0.0, + per_class_proposal=True, + prediction_stages='last')) diff --git a/adzoo/vad/configs/_base_/models/h3dnet.py b/adzoo/vad/configs/_base_/models/h3dnet.py new file mode 100644 index 0000000..7605667 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/h3dnet.py @@ -0,0 +1,341 @@ +primitive_z_cfg = dict( + type='PrimitiveHead', + num_dims=2, + num_classes=18, + primitive_mode='z', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_xy_cfg = dict( + type='PrimitiveHead', 
+ num_dims=1, + num_classes=18, + primitive_mode='xy', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=0.5, + loss_dst_weight=0.5), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +primitive_line_cfg = dict( + type='PrimitiveHead', + num_dims=0, + num_classes=18, + primitive_mode='line', + upper_thresh=100.0, + surface_thresh=0.5, + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=1, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=1024, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.4, 0.6], + reduction='mean', + loss_weight=30.0), + center_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_reg_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='sum', + loss_src_weight=1.0, + loss_dst_weight=1.0), + semantic_cls_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), + train_cfg=dict( + dist_thresh=0.2, + var_thresh=1e-2, + lower_thresh=1e-6, + num_point=100, + num_point_line=10, + line_thresh=0.2)) + +model = dict( + type='H3DNet', + backbone=dict( + type='MultiBackbone', + num_streams=4, + suffixes=['net0', 'net1', 'net2', 'net3'], + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), + act_cfg=dict(type='ReLU'), + backbones=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True))), + rpn_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + 
loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + roi_head=dict( + type='H3DRoIHead', + primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], + bbox_head=dict( + type='H3DBboxHead', + gt_per_seed=3, + num_proposal=256, + suface_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 6, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 6, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + line_matching_cfg=dict( + type='PointSAModule', + num_point=256 * 12, + radius=0.5, + num_sample=32, + mlp_channels=[128 + 12, 128, 64, 32], + use_xyz=True, + normalize_xyz=True), + feat_channels=(128, 128), + primitive_refine_channels=[128, 128, 128], + upper_thresh=100.0, + surface_thresh=0.5, + line_thresh=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), + cues_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + cues_semantic_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.3, 0.7], + reduction='mean', + loss_weight=5.0), + proposal_objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='none', + loss_weight=5.0), + primitive_center_loss=dict( + type='MSELoss', reduction='none', loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + rpn_proposal=dict(use_nms=False), + rcnn=dict( + pos_distance_thr=0.3, + neg_distance_thr=0.6, + sample_mod='vote', + far_threshold=0.6, + near_threshold=0.3, + mask_surface_threshold=0.3, + label_surface_threshold=0.3, + mask_line_threshold=0.3, + label_line_threshold=0.3)), + test_cfg=dict( + rpn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True, + use_nms=False), + rcnn=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True))) diff --git 
a/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_lyft.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_lyft.py new file mode 100644 index 0000000..87c7fe0 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-80, -80, -5, 80, 80, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), + pts_middle_encoder=dict(output_shape=[640, 640]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_nus.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_nus.py new file mode 100644 index 0000000..e153f6c --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_nus.py @@ -0,0 +1,96 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.25, 0.25, 8] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=64, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + voxel_size=voxel_size, + max_voxels=(30000, 40000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=4, + feat_channels=[64, 64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-50, -50, -5, 50, 50, 3], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='FPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + act_cfg=dict(type='ReLU'), + in_channels=[64, 128, 256], + out_channels=256, + start_level=0, + num_outs=3), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=10, + in_channels=256, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-50, -50, -1.8, 50, 50, -1.8]], + scales=[1, 2, 4], + sizes=[ + [0.8660, 2.5981, 1.], # 1.5/sqrt(3) + [0.5774, 1.7321, 1.], # 1/sqrt(3) + [1., 1., 1.], + [0.4, 0.4, 1], + ], + custom_values=[0, 0], + rotations=[0, 1.57], + reshape_out=True), + assigner_per_size=False, + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=dict( + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=1000, + nms_thr=0.2, + score_thr=0.05, + min_bbox_size=0, + max_num=500))) diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py new file mode 100644 index 0000000..9cd200f --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py @@ -0,0 +1,22 @@ +_base_ = './hv_pointpillars_fpn_nus.py' + +# model settings (based on nuScenes model settings) +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +model = dict( + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-100, -100, -5, 100, 100, 3], + max_voxels=(60000, 60000)), + pts_voxel_encoder=dict( + feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), + pts_middle_encoder=dict(output_shape=[800, 800]), + pts_bbox_head=dict( + num_classes=9, + anchor_generator=dict( + ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), + # model training settings (based on nuScenes model settings) + train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_kitti.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_kitti.py new file mode 100644 index 0000000..85076d0 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_kitti.py @@ -0,0 +1,93 @@ +voxel_size = [0.16, 0.16, 4] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=32, # max_points_per_voxel + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict( + type='PillarFeatureNet', + in_channels=4, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), + middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), + backbone=dict( + type='SECOND', + in_channels=64, + layer_nums=[3, 5, 5], + layer_strides=[2, 2, 2], + out_channels=[64, 128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -0.6, 70.4, 39.68, -0.6], + [0, -39.68, -1.78, 70.4, 39.68, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, 
loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_waymo.py b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_waymo.py new file mode 100644 index 0000000..14873ea --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_pointpillars_secfpn_waymo.py @@ -0,0 +1,108 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.32, 0.32, 6] +model = dict( + type='MVXFasterRCNN', + pts_voxel_layer=dict( + max_num_points=20, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + voxel_size=voxel_size, + max_voxels=(32000, 32000)), + pts_voxel_encoder=dict( + type='HardVFE', + in_channels=5, + feat_channels=[64], + with_distance=False, + voxel_size=voxel_size, + with_cluster_center=True, + with_voxel_center=True, + point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], + norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), + pts_middle_encoder=dict( + type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), + pts_backbone=dict( + type='SECOND', + in_channels=64, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[3, 5, 5], + layer_strides=[1, 2, 2], + out_channels=[64, 128, 256]), + pts_neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[64, 128, 256], + upsample_strides=[1, 2, 4], + out_channels=[128, 128, 128]), + pts_bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=384, + feat_channels=384, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], + [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], + [-74.88, -74.88, 0, 74.88, 74.88, 0]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 1.81, 1.77], # cyclist + [0.84, 0.91, 1.74] # pedestrian + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + pts=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, 
+ neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False)), + test_cfg=dict( + pts=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500))) diff --git a/adzoo/vad/configs/_base_/models/hv_second_secfpn_kitti.py b/adzoo/vad/configs/_base_/models/hv_second_secfpn_kitti.py new file mode 100644 index 0000000..6bf18ab --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_second_secfpn_kitti.py @@ -0,0 +1,89 @@ +voxel_size = [0.05, 0.05, 0.1] + +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=5, + point_cloud_range=[0, -40, -3, 70.4, 40, 1], + voxel_size=voxel_size, + max_voxels=(16000, 40000)), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseEncoder', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[ + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78], + ], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.35, + neg_iou_thr=0.2, + min_pos_iou=0.2, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1), + ], + allowed_border=0, + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_thr=0.01, + score_thr=0.1, + min_bbox_size=0, + nms_pre=100, + max_num=50)) diff --git a/adzoo/vad/configs/_base_/models/hv_second_secfpn_waymo.py b/adzoo/vad/configs/_base_/models/hv_second_secfpn_waymo.py new file mode 100644 index 0000000..eb9bd3a --- /dev/null +++ b/adzoo/vad/configs/_base_/models/hv_second_secfpn_waymo.py @@ -0,0 +1,100 @@ +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud 
range +# If point cloud range is modified, do remember to change all related +# keys in the config. +voxel_size = [0.08, 0.08, 0.1] +model = dict( + type='VoxelNet', + voxel_layer=dict( + max_num_points=10, + point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], + voxel_size=voxel_size, + max_voxels=(80000, 90000)), + voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + middle_encoder=dict( + type='SparseEncoder', + in_channels=5, + sparse_shape=[61, 1280, 1920], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=384, + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + bbox_head=dict( + type='Anchor3DHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='AlignedAnchor3DRangeGenerator', + ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], + [-76.8, -51.2, 0, 76.8, 51.2, 0], + [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], + sizes=[ + [2.08, 4.73, 1.77], # car + [0.84, 0.91, 1.74], # pedestrian + [0.84, 1.81, 1.77] # cyclist + ], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + dir_offset=0.7854, # pi/4 + dir_limit_offset=0, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict( + assigner=[ + dict( # car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.55, + neg_iou_thr=0.4, + min_pos_iou=0.4, + ignore_iof_thr=-1), + dict( # pedestrian + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1), + dict( # cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.3, + min_pos_iou=0.3, + ignore_iof_thr=-1) + ], + allowed_border=0, + code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + pos_weight=-1, + debug=False), + test_cfg=dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.25, + score_thr=0.1, + min_bbox_size=0, + max_num=500)) diff --git a/adzoo/vad/configs/_base_/models/imvotenet_image.py b/adzoo/vad/configs/_base_/models/imvotenet_image.py new file mode 100644 index 0000000..981f8bc --- /dev/null +++ b/adzoo/vad/configs/_base_/models/imvotenet_image.py @@ -0,0 +1,108 @@ +model = dict( + type='ImVoteNet', + img_backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=False), + norm_eval=True, + style='caffe'), + img_neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + img_rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + 
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + img_roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=10, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0))), + + # model training and testing settings + train_cfg=dict( + img_rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + img_rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=False, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + pos_weight=-1, + debug=False)), + test_cfg=dict( + img_rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_per_img=1000, + nms=dict(type='nms', iou_threshold=0.7), + min_bbox_size=0), + img_rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100))) diff --git a/adzoo/vad/configs/_base_/models/mask_rcnn_r50_fpn.py b/adzoo/vad/configs/_base_/models/mask_rcnn_r50_fpn.py new file mode 100644 index 0000000..c5d5e32 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/mask_rcnn_r50_fpn.py @@ -0,0 +1,124 @@ +# model settings +model = dict( + type='MaskRCNN', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNet', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=1, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + style='pytorch'), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=5), + rpn_head=dict( + type='RPNHead', + in_channels=256, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0]), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + roi_head=dict( + type='StandardRoIHead', + bbox_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + bbox_head=dict( + type='Shared2FCBBoxHead', + in_channels=256, + fc_out_channels=1024, + roi_feat_size=7, + num_classes=80, + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2]), + reg_class_agnostic=False, + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=1.0)), + mask_roi_extractor=dict( + type='SingleRoIExtractor', + roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), + out_channels=256, + featmap_strides=[4, 8, 16, 32]), + mask_head=dict( + type='FCNMaskHead', + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=80, + loss_mask=dict( + type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.7, + neg_iou_thr=0.3, + min_pos_iou=0.3, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=256, + pos_fraction=0.5, + neg_pos_ub=-1, + add_gt_as_proposals=False), + allowed_border=-1, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_across_levels=False, + nms_pre=2000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + assigner=dict( + type='MaxIoUAssigner', + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=0.5, + match_low_quality=True, + ignore_iof_thr=-1), + sampler=dict( + type='RandomSampler', + num=512, + pos_fraction=0.25, + neg_pos_ub=-1, + add_gt_as_proposals=True), + mask_size=28, + pos_weight=-1, + debug=False)), + test_cfg=dict( + rpn=dict( + nms_across_levels=False, + nms_pre=1000, + nms_post=1000, + max_num=1000, + nms_thr=0.7, + min_bbox_size=0), + rcnn=dict( + score_thr=0.05, + nms=dict(type='nms', iou_threshold=0.5), + max_per_img=100, + mask_thr_binary=0.5))) diff --git a/adzoo/vad/configs/_base_/models/paconv_cuda_ssg.py b/adzoo/vad/configs/_base_/models/paconv_cuda_ssg.py new file mode 100644 index 0000000..f513bd4 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/paconv_cuda_ssg.py @@ -0,0 +1,7 @@ +_base_ = './paconv_ssg.py' + +model = dict( + backbone=dict( + sa_cfg=dict( + type='PAConvCUDASAModule', + scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) diff --git a/adzoo/vad/configs/_base_/models/paconv_ssg.py b/adzoo/vad/configs/_base_/models/paconv_ssg.py new file mode 100644 index 0000000..1d4f1ed --- /dev/null +++ b/adzoo/vad/configs/_base_/models/paconv_ssg.py @@ -0,0 +1,49 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=9, # [xyz, rgb, normalized_xyz] + num_points=(1024, 256, 64, 16), + radius=(None, None, None, None), # use kNN instead of ball query + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + fp_channels=(), + norm_cfg=dict(type='BN2d', momentum=0.1), + sa_cfg=dict( + type='PAConvSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False, + paconv_num_kernels=[16, 16, 16], + paconv_kernel_input='w_neighbor', + scorenet_input='w_neighbor_dist', + scorenet_cfg=dict( + mlp_channels=[16, 16, 16], + score_norm='softmax', + temp_factor=1.0, + last_bn=False))), + decode_head=dict( + type='PAConvHead', + # PAConv model's decoder takes skip connections from beckbone + # different from PointNet++, it also concats input features in the last + # level of decoder, leading to `128 + 6` as the channel number + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128 + 6, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # 
correlation loss to regularize PAConv's kernel weights + loss_regularization=dict( + type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/adzoo/vad/configs/_base_/models/parta2.py b/adzoo/vad/configs/_base_/models/parta2.py new file mode 100644 index 0000000..6c5ae9a --- /dev/null +++ b/adzoo/vad/configs/_base_/models/parta2.py @@ -0,0 +1,201 @@ +# model settings +voxel_size = [0.05, 0.05, 0.1] +point_cloud_range = [0, -40, -3, 70.4, 40, 1] + +model = dict( + type='PartA2', + voxel_layer=dict( + max_num_points=5, # max_points_per_voxel + point_cloud_range=point_cloud_range, + voxel_size=voxel_size, + max_voxels=(16000, 40000) # (training, testing) max_voxels + ), + voxel_encoder=dict(type='HardSimpleVFE'), + middle_encoder=dict( + type='SparseUNet', + in_channels=4, + sparse_shape=[41, 1600, 1408], + order=('conv', 'norm', 'act')), + backbone=dict( + type='SECOND', + in_channels=256, + layer_nums=[5, 5], + layer_strides=[1, 2], + out_channels=[128, 256]), + neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + upsample_strides=[1, 2], + out_channels=[256, 256]), + rpn_head=dict( + type='PartA2RPNHead', + num_classes=3, + in_channels=512, + feat_channels=512, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -0.6, 70.4, 40.0, -0.6], + [0, -40.0, -1.78, 70.4, 40.0, -1.78]], + sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]], + rotations=[0, 1.57], + reshape_out=False), + diff_rad_by_sin=True, + assigner_per_size=True, + assign_per_class=True, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), + roi_head=dict( + type='PartAggregationROIHead', + num_classes=3, + semantic_head=dict( + type='PointwiseSemanticHead', + in_channels=16, + extra_width=0.2, + seg_score_thr=0.3, + num_classes=3, + loss_seg=dict( + type='FocalLoss', + use_sigmoid=True, + reduction='sum', + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_part=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), + seg_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='max')), + part_roi_extractor=dict( + type='Single3DRoIAwareExtractor', + roi_layer=dict( + type='RoIAwarePool3d', + out_size=14, + max_pts_per_voxel=128, + mode='avg')), + bbox_head=dict( + type='PartA2BboxHead', + num_classes=3, + seg_in_channels=16, + part_in_channels=4, + seg_conv_channels=[64, 64], + part_conv_channels=[64, 64], + merge_conv_channels=[128, 128], + down_conv_channels=[128, 256], + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + shared_fc_channels=[256, 512, 512, 512], + cls_channels=[256, 256], + reg_channels=[256, 256], + dropout_ratio=0.1, + roi_feat_size=14, + with_corner_loss=True, + loss_bbox=dict( + type='SmoothL1Loss', + beta=1.0 / 9.0, + reduction='sum', + loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + reduction='sum', + loss_weight=1.0))), + # model training and testing settings + train_cfg=dict( + rpn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + 
iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.5, + neg_iou_thr=0.35, + min_pos_iou=0.35, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict(type='BboxOverlapsNearest3D'), + pos_iou_thr=0.6, + neg_iou_thr=0.45, + min_pos_iou=0.45, + ignore_iof_thr=-1) + ], + allowed_border=0, + pos_weight=-1, + debug=False), + rpn_proposal=dict( + nms_pre=9000, + nms_post=512, + max_num=512, + nms_thr=0.8, + score_thr=0, + use_rotate_nms=False), + rcnn=dict( + assigner=[ + dict( # for Pedestrian + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Cyclist + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1), + dict( # for Car + type='MaxIoUAssigner', + iou_calculator=dict( + type='BboxOverlaps3D', coordinate='lidar'), + pos_iou_thr=0.55, + neg_iou_thr=0.55, + min_pos_iou=0.55, + ignore_iof_thr=-1) + ], + sampler=dict( + type='IoUNegPiecewiseSampler', + num=128, + pos_fraction=0.55, + neg_piece_fractions=[0.8, 0.2], + neg_iou_piece_thrs=[0.55, 0.1], + neg_pos_ub=-1, + add_gt_as_proposals=False, + return_iou=True), + cls_pos_thr=0.75, + cls_neg_thr=0.25)), + test_cfg=dict( + rpn=dict( + nms_pre=1024, + nms_post=100, + max_num=100, + nms_thr=0.7, + score_thr=0, + use_rotate_nms=True), + rcnn=dict( + use_rotate_nms=True, + use_raw_score=True, + nms_thr=0.01, + score_thr=0.1))) diff --git a/adzoo/vad/configs/_base_/models/pointnet2_msg.py b/adzoo/vad/configs/_base_/models/pointnet2_msg.py new file mode 100644 index 0000000..222ab88 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/pointnet2_msg.py @@ -0,0 +1,28 @@ +_base_ = './pointnet2_ssg.py' + +# model settings +model = dict( + backbone=dict( + _delete_=True, + type='PointNet2SAMSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), + num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), + sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, + 128)), + ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), + (256, 384, 512))), + aggregation_channels=(None, None, None, None), + fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), + fps_sample_range_lists=((-1), (-1), (-1), (-1)), + dilated_group=(False, False, False, False), + out_indices=(0, 1, 2, 3), + sa_cfg=dict( + type='PointSAModuleMSG', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), + (128, 128, 128, 128)))) diff --git a/adzoo/vad/configs/_base_/models/pointnet2_ssg.py b/adzoo/vad/configs/_base_/models/pointnet2_ssg.py new file mode 100644 index 0000000..58b4c24 --- /dev/null +++ b/adzoo/vad/configs/_base_/models/pointnet2_ssg.py @@ -0,0 +1,35 @@ +# model settings +model = dict( + type='EncoderDecoder3D', + backbone=dict( + type='PointNet2SASSG', + in_channels=6, # [xyz, rgb], should be modified with dataset + num_points=(1024, 256, 64, 16), + radius=(0.1, 0.2, 0.4, 0.8), + num_samples=(32, 32, 32, 32), + sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, + 512)), + 
fp_channels=(), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=False)), + decode_head=dict( + type='PointNet2Head', + fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), + (128, 128, 128, 128)), + channels=128, + dropout_ratio=0.5, + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=None, # should be modified with dataset + loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide')) diff --git a/adzoo/vad/configs/_base_/models/votenet.py b/adzoo/vad/configs/_base_/models/votenet.py new file mode 100644 index 0000000..129339d --- /dev/null +++ b/adzoo/vad/configs/_base_/models/votenet.py @@ -0,0 +1,73 @@ +model = dict( + type='VoteNet', + backbone=dict( + type='PointNet2SASSG', + in_channels=4, + num_points=(2048, 1024, 512, 256), + radius=(0.2, 0.4, 0.8, 1.2), + num_samples=(64, 32, 16, 16), + sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), + (128, 128, 256)), + fp_channels=((256, 256), (256, 256)), + norm_cfg=dict(type='BN2d'), + sa_cfg=dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True)), + bbox_head=dict( + type='VoteHead', + vote_module_cfg=dict( + in_channels=256, + vote_per_seed=1, + gt_per_seed=3, + conv_channels=(256, 256), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + norm_feats=True, + vote_loss=dict( + type='ChamferDistance', + mode='l1', + reduction='none', + loss_dst_weight=10.0)), + vote_aggregation_cfg=dict( + type='PointSAModule', + num_point=256, + radius=0.3, + num_sample=16, + mlp_channels=[256, 128, 128, 128], + use_xyz=True, + normalize_xyz=True), + pred_layer_cfg=dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + objectness_loss=dict( + type='CrossEntropyLoss', + class_weight=[0.2, 0.8], + reduction='sum', + loss_weight=5.0), + center_loss=dict( + type='ChamferDistance', + mode='l2', + reduction='sum', + loss_src_weight=10.0, + loss_dst_weight=10.0), + dir_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + dir_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0), + size_class_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), + size_res_loss=dict( + type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), + semantic_loss=dict( + type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), + # model training and testing settings + train_cfg=dict( + pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), + test_cfg=dict( + sample_mod='seed', + nms_thr=0.25, + score_thr=0.05, + per_class_proposal=True)) diff --git a/adzoo/vad/configs/_base_/schedules/cosine.py b/adzoo/vad/configs/_base_/schedules/cosine.py new file mode 100644 index 0000000..69cb7df --- /dev/null +++ b/adzoo/vad/configs/_base_/schedules/cosine.py @@ -0,0 +1,20 @@ +# This schedule is mainly used by models with dynamic voxelization +# optimizer +lr = 0.003 # max learning rate +optimizer = dict( + type='AdamW', + lr=lr, + betas=(0.95, 0.99), # the momentum is change during training + weight_decay=0.001) +optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) + +lr_config = dict( + policy='CosineAnnealing', + warmup='linear', + warmup_iters=1000, + warmup_ratio=1.0 / 10, + min_lr_ratio=1e-5) + +momentum_config = 
None
+
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/adzoo/vad/configs/_base_/schedules/cyclic_20e.py b/adzoo/vad/configs/_base_/schedules/cyclic_20e.py
new file mode 100644
index 0000000..704740e
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/cyclic_20e.py
@@ -0,0 +1,24 @@
+# For nuScenes dataset, we usually evaluate the model at the end of training.
+# Since the models are trained for 20 epochs by default, we set the evaluation
+# interval to 20. Please change the interval accordingly if you do not
+# use a default schedule.
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='cyclic',
+    target_ratio=(10, 1e-4),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+momentum_config = dict(
+    policy='cyclic',
+    target_ratio=(0.85 / 0.95, 1),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=20)
diff --git a/adzoo/vad/configs/_base_/schedules/cyclic_40e.py b/adzoo/vad/configs/_base_/schedules/cyclic_40e.py
new file mode 100644
index 0000000..4a711ac
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/cyclic_40e.py
@@ -0,0 +1,31 @@
+# This schedule is usually used by models trained on the KITTI dataset
+
+# The learning rate set in the cyclic schedule is the initial learning rate
+# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
+# the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4
+lr = 0.0018
+# The optimizer follows the setting in SECOND.Pytorch, but here we use
+# the official AdamW optimizer implemented by PyTorch.
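+# To make the learning-rate comment above concrete (assuming the cyclic
+# updater simply scales the initial lr by the two target_ratio factors):
+#   peak lr  = 0.0018 * 10   = 1.8e-2, reached after the warm-up phase
+#              (the first step_ratio_up = 40% of the iterations)
+#   final lr = 0.0018 * 1e-4 = 1.8e-7, at the end of training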
+optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
+# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa
+# We implement them in mmcv, for more details, please refer to
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa
+# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa
+lr_config = dict(
+    policy='cyclic',
+    target_ratio=(10, 1e-4),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+momentum_config = dict(
+    policy='cyclic',
+    target_ratio=(0.85 / 0.95, 1),
+    cyclic_times=1,
+    step_ratio_up=0.4,
+)
+# Although the max_epochs is 40, this schedule is usually used with
+# RepeatDataset with repeat ratio N, thus the actual max epoch
+# number could be Nx40
+runner = dict(type='EpochBasedRunner', max_epochs=40)
diff --git a/adzoo/vad/configs/_base_/schedules/mmdet_schedule_1x.py b/adzoo/vad/configs/_base_/schedules/mmdet_schedule_1x.py
new file mode 100644
index 0000000..13b3783
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/mmdet_schedule_1x.py
@@ -0,0 +1,11 @@
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+runner = dict(type='EpochBasedRunner', max_epochs=12)
diff --git a/adzoo/vad/configs/_base_/schedules/schedule_2x.py b/adzoo/vad/configs/_base_/schedules/schedule_2x.py
new file mode 100644
index 0000000..afde799
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/schedule_2x.py
@@ -0,0 +1,14 @@
+# optimizer
+# This schedule is mainly used by models on nuScenes dataset
+optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
+# max_norm=10 is better for SECOND
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=1000,
+    warmup_ratio=1.0 / 1000,
+    step=[20, 23])
+momentum_config = None
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=24)
diff --git a/adzoo/vad/configs/_base_/schedules/schedule_3x.py b/adzoo/vad/configs/_base_/schedules/schedule_3x.py
new file mode 100644
index 0000000..115cd26
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/schedule_3x.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used by models on indoor dataset,
+# e.g., VoteNet on SUNRGBD and ScanNet
+lr = 0.008  # max learning rate
+optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
+optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
+lr_config = dict(policy='step', warmup=None, step=[24, 32])
+# runtime settings
+runner = dict(type='EpochBasedRunner', max_epochs=36)
diff --git a/adzoo/vad/configs/_base_/schedules/seg_cosine_150e.py b/adzoo/vad/configs/_base_/schedules/seg_cosine_150e.py
new file mode 100644
index 0000000..04b44e5
--- /dev/null
+++ b/adzoo/vad/configs/_base_/schedules/seg_cosine_150e.py
@@ -0,0 +1,9 @@
+# optimizer
+# This schedule is mainly used on S3DIS dataset in segmentation task
+optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9)
+optimizer_config = dict(grad_clip=None)
+lr_config = 
dict(policy='CosineAnnealing', warmup=None, min_lr=0.002) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=150) diff --git a/adzoo/vad/configs/_base_/schedules/seg_cosine_200e.py b/adzoo/vad/configs/_base_/schedules/seg_cosine_200e.py new file mode 100644 index 0000000..6a49484 --- /dev/null +++ b/adzoo/vad/configs/_base_/schedules/seg_cosine_200e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on ScanNet dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=200) diff --git a/adzoo/vad/configs/_base_/schedules/seg_cosine_50e.py b/adzoo/vad/configs/_base_/schedules/seg_cosine_50e.py new file mode 100644 index 0000000..975a8f9 --- /dev/null +++ b/adzoo/vad/configs/_base_/schedules/seg_cosine_50e.py @@ -0,0 +1,9 @@ +# optimizer +# This schedule is mainly used on S3DIS dataset in segmentation task +optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) +optimizer_config = dict(grad_clip=None) +lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) +momentum_config = None + +# runtime settings +runner = dict(type='EpochBasedRunner', max_epochs=50) diff --git a/adzoo/vad/configs/datasets/custom_lyft-3d.py b/adzoo/vad/configs/datasets/custom_lyft-3d.py new file mode 100644 index 0000000..5a95d89 --- /dev/null +++ b/adzoo/vad/configs/datasets/custom_lyft-3d.py @@ -0,0 +1,136 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-80, -80, -5, 80, 80, 3] +# For Lyft we usually do 9-class detection +class_names = [ + 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', + 'bicycle', 'pedestrian', 'animal' +] +dataset_type = 'CustomLyftDataset' +data_root = 'data/lyft/' +# Input modality for Lyft dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=True) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/lyft/': 's3://lyft/lyft/', +# 'data/lyft/': 's3://lyft/lyft/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=2, + workers_per_gpu=2, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'lyft_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True)) +# For Lyft dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. 
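+# For example, to evaluate every 6 epochs instead of only once at the end of
+# the default 24-epoch schedule, the line below could be changed to:
+# evaluation = dict(interval=6, pipeline=eval_pipeline)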
+evaluation = dict(interval=24, pipeline=eval_pipeline) \ No newline at end of file diff --git a/adzoo/vad/configs/datasets/custom_nus-3d.py b/adzoo/vad/configs/datasets/custom_nus-3d.py new file mode 100644 index 0000000..af81f9b --- /dev/null +++ b/adzoo/vad/configs/datasets/custom_nus-3d.py @@ -0,0 +1,141 @@ +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] +dataset_type = 'NuScenesDataset_eval_modified' +data_root = 'data/nuscenes/' +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict( + use_lidar=True, + use_camera=False, + use_radar=False, + use_map=False, + use_external=False) +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. +# file_client_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': 's3://nuscenes/nuscenes/', +# 'data/nuscenes/': 's3://nuscenes/nuscenes/' +# })) +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), + dict( + type='GlobalRotScaleTrans', + rot_range=[-0.3925, 0.3925], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='PointShuffle'), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) +] +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='GlobalRotScaleTrans', + rot_range=[0, 0], + scale_ratio_range=[1., 1.], + translation_std=[0, 0, 0]), + dict(type='RandomFlip3D'), + dict( + type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) + ]) +] +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. 
client) +eval_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=file_client_args), + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='Collect3D', keys=['points']) +] + +data = dict( + samples_per_gpu=4, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_train.pkl', + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'), + val=dict( + type=dataset_type, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + pipeline=test_pipeline, + classes=class_names, + modality=input_modality, + test_mode=True, + box_type_3d='LiDAR')) +# For nuScenes dataset, we usually evaluate the model at the end of training. +# Since the models are trained by 24 epochs by default, we set evaluation +# interval to be 24. Please change the interval accordingly if you do not +# use a default schedule. +evaluation = dict(interval=24, pipeline=eval_pipeline) diff --git a/adzoo/vad/configs/datasets/custom_waymo-3d.py b/adzoo/vad/configs/datasets/custom_waymo-3d.py new file mode 100644 index 0000000..4100e13 --- /dev/null +++ b/adzoo/vad/configs/datasets/custom_waymo-3d.py @@ -0,0 +1,112 @@ +# dataset settings +# D5 in the config name means the whole dataset is divided into 5 folds +# We only use one fold for efficient experiments +dataset_type = 'CustomWaymoDataset' +data_root = 'data/waymo/kitti_format/' +file_client_args = dict(backend='disk') +# Uncomment the following if use ceph or other file clients. +# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient +# for more details. 
+# file_client_args = dict( +# backend='petrel', path_mapping=dict(data='s3://waymo_data/')) + +img_norm_cfg = dict( + mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False) +class_names = ['Car', 'Pedestrian', 'Cyclist'] +point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] +input_modality = dict(use_lidar=False, use_camera=True) +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'waymo_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), + classes=class_names, + sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args)) + + + +train_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='PhotoMetricDistortionMultiViewImage'), + dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectNameFilter', classes=class_names), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict(type='DefaultFormatBundle3D', class_names=class_names), + dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img']) +] + + +test_pipeline = [ + dict(type='LoadMultiViewImageFromFiles', to_float32=True), + dict(type='NormalizeMultiviewImage', **img_norm_cfg), + dict(type='PadMultiViewImage', size_divisor=32), + dict( + type='MultiScaleFlipAug3D', + img_scale=(1920, 1280), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict( + type='DefaultFormatBundle3D', + class_names=class_names, + with_label=False), + dict(type='CustomCollect3D', keys=['img']) + ]) +] + + +# construct a pipeline for data and gt loading in show function +# please keep its loading function consistent with test_pipeline (e.g. client) + +data = dict( + samples_per_gpu=2, + workers_per_gpu=4, + train=dict( + type='RepeatDataset', + times=2, + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_train.pkl', + split='training', + pipeline=train_pipeline, + modality=input_modality, + classes=class_names, + test_mode=False, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR', + # load one frame every five frames + load_interval=5)), + val=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR'), + test=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + 'waymo_infos_val.pkl', + split='training', + pipeline=test_pipeline, + modality=input_modality, + classes=class_names, + test_mode=True, + box_type_3d='LiDAR')) + +evaluation = dict(interval=24, pipeline=test_pipeline) \ No newline at end of file diff --git a/adzoo/vad/create_data.py b/adzoo/vad/create_data.py new file mode 100644 index 0000000..f2b0cc1 --- /dev/null +++ b/adzoo/vad/create_data.py @@ -0,0 +1,305 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +from data_converter.create_gt_database import create_groundtruth_database +from data_converter import nuscenes_converter as nuscenes_converter +from data_converter import lyft_converter as lyft_converter +from data_converter import kitti_converter as kitti +from data_converter import indoor_converter as indoor +import argparse +from os import path as osp +import sys +sys.path.append('.') + + +def kitti_data_prep(root_path, info_prefix, version, out_dir): + """Prepare data related to Kitti dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + out_dir (str): Output directory of the groundtruth database info. + """ + kitti.create_kitti_info_file(root_path, info_prefix) + kitti.create_reduced_point_cloud(root_path, info_prefix) + + info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') + info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') + info_trainval_path = osp.join(root_path, + f'{info_prefix}_infos_trainval.pkl') + info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') + kitti.export_2d_annotation(root_path, info_train_path) + kitti.export_2d_annotation(root_path, info_val_path) + kitti.export_2d_annotation(root_path, info_trainval_path) + kitti.export_2d_annotation(root_path, info_test_path) + + create_groundtruth_database( + 'KittiDataset', + root_path, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + mask_anno_path='instances_train.json', + with_mask=(version == 'mask')) + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. Default: 10 + """ + nuscenes_converter.create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + if version == 'v1.0-test': + info_test_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_test.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_test_path, version=version) + else: + info_train_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_train.pkl') + info_val_path = osp.join( + out_dir, f'{info_prefix}_infos_temporal_val.pkl') + nuscenes_converter.export_2d_annotation( + root_path, info_train_path, version=version) + nuscenes_converter.export_2d_annotation( + root_path, info_val_path, version=version) + # create_groundtruth_database(dataset_name, root_path, info_prefix, + # f'{out_dir}/{info_prefix}_infos_train.pkl') + + +def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10): + """Prepare data related to Lyft dataset. + + Related data consists of '.pkl' files recording basic infos. + Although the ground truth database and 2D annotations are not used in + Lyft, it can also be generated like nuScenes. 
+ + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + max_sweeps (int, optional): Number of input consecutive frames. + Defaults to 10. + """ + lyft_converter.create_lyft_infos( + root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + +def scannet_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for scannet dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def s3dis_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for s3dis dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers): + """Prepare the info file for sunrgbd dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + """ + indoor.create_indoor_info_file( + root_path, info_prefix, out_dir, workers=workers) + + +def waymo_data_prep(root_path, + info_prefix, + version, + out_dir, + workers, + max_sweeps=5): + """Prepare the info file for waymo dataset. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + out_dir (str): Output directory of the generated info file. + workers (int): Number of threads to be used. + max_sweeps (int): Number of input consecutive frames. Default: 5 \ + Here we store pose information of these frames for later use. 
+ """ + from tools.data_converter import waymo_converter as waymo + + splits = ['training', 'validation', 'testing'] + + for i, split in enumerate(splits): + load_dir = osp.join(root_path, 'waymo_format', split) + if split == 'validation': + save_dir = osp.join(out_dir, 'kitti_format', 'training') + else: + save_dir = osp.join(out_dir, 'kitti_format', split) + converter = waymo.Waymo2KITTI( + load_dir, + save_dir, + prefix=str(i), + workers=workers, + test_mode=(split == 'test')) + converter.convert() + # Generate waymo infos + out_dir = osp.join(out_dir, 'kitti_format') + kitti.create_waymo_info_file(out_dir, info_prefix, max_sweeps=max_sweeps) + + create_groundtruth_database( + 'WaymoDataset', + out_dir, + info_prefix, + f'{out_dir}/{info_prefix}_infos_train.pkl', + relative_path=False, + with_mask=False) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required='False', + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'kitti': + kitti_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir) + elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'lyft': + train_version = f'{args.version}-train' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=train_version, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + lyft_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=test_version, + max_sweeps=args.max_sweeps) + elif args.dataset == 'waymo': + waymo_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + version=args.version, + out_dir=args.out_dir, + workers=args.workers, + max_sweeps=args.max_sweeps) + elif args.dataset == 'scannet': + scannet_data_prep( + 
root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 's3dis': + s3dis_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) + elif args.dataset == 'sunrgbd': + sunrgbd_data_prep( + root_path=args.root_path, + info_prefix=args.extra_tag, + out_dir=args.out_dir, + workers=args.workers) diff --git a/adzoo/vad/data_converter/__init__.py b/adzoo/vad/data_converter/__init__.py new file mode 100644 index 0000000..ef101fe --- /dev/null +++ b/adzoo/vad/data_converter/__init__.py @@ -0,0 +1 @@ +# Copyright (c) OpenMMLab. All rights reserved. diff --git a/adzoo/vad/data_converter/create_gt_database.py b/adzoo/vad/data_converter/create_gt_database.py new file mode 100644 index 0000000..7317ced --- /dev/null +++ b/adzoo/vad/data_converter/create_gt_database.py @@ -0,0 +1,338 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import pickle +from mmcv import track_iter_progress +from mmcv.ops import roi_align +from os import path as osp +from pycocotools import mask as maskUtils +from pycocotools.coco import COCO + +from mmdet3d.core.bbox import box_np_ops as box_np_ops +from mmdet3d.datasets import build_dataset +from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps + + +def _poly2mask(mask_ann, img_h, img_w): + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + +def _parse_coco_ann_info(ann_info): + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_masks_ann.append(ann['segmentation']) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann) + + return ann + + +def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks): + import torch + from torch.nn.modules.utils import _pair + device = pos_proposals.device + num_pos = pos_proposals.size(0) + fake_inds = ( + torch.arange(num_pos, + device=device).to(dtype=pos_proposals.dtype)[:, None]) + rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5 + mask_size = _pair(28) + rois = rois.to(device=device) + gt_masks_th = ( + torch.from_numpy(gt_masks).to(device).index_select( + 0, pos_assigned_gt_inds).to(dtype=rois.dtype)) + # Use RoIAlign could apparently accelerate the training (~0.1s/iter) + targets = ( + roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1)) + return targets + + +def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img): + num_pos = 
pos_proposals.shape[0] + masks = [] + img_patches = [] + for i in range(num_pos): + gt_mask = gt_masks[pos_assigned_gt_inds[i]] + bbox = pos_proposals[i, :].astype(np.int32) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1 + 1, 1) + h = np.maximum(y2 - y1 + 1, 1) + + mask_patch = gt_mask[y1:y1 + h, x1:x1 + w] + masked_img = gt_mask[..., None] * org_img + img_patch = masked_img[y1:y1 + h, x1:x1 + w] + + img_patches.append(img_patch) + masks.append(mask_patch) + return img_patches, masks + + +def create_groundtruth_database(dataset_class_name, + data_path, + info_prefix, + info_path=None, + mask_anno_path=None, + used_classes=None, + database_save_path=None, + db_info_save_path=None, + relative_path=True, + add_rgb=False, + lidar_only=False, + bev_only=False, + coors_range=None, + with_mask=False): + """Given the raw data, generate the ground truth database. + + Args: + dataset_class_name (str): Name of the input dataset. + data_path (str): Path of the data. + info_prefix (str): Prefix of the info file. + info_path (str): Path of the info file. + Default: None. + mask_anno_path (str): Path of the mask_anno. + Default: None. + used_classes (list[str]): Classes have been used. + Default: None. + database_save_path (str): Path to save database. + Default: None. + db_info_save_path (str): Path to save db_info. + Default: None. + relative_path (bool): Whether to use relative path. + Default: True. + with_mask (bool): Whether to use mask. + Default: False. + """ + print(f'Create GT Database of {dataset_class_name}') + dataset_cfg = dict( + type=dataset_class_name, data_root=data_path, ann_file=info_path) + if dataset_class_name == 'KittiDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=with_mask, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=4, + use_dim=4, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + elif dataset_class_name == 'NuScenesDataset': + dataset_cfg.update( + use_valid_flag=True, + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + use_dim=[0, 1, 2, 3, 4], + pad_empty_sweeps=True, + remove_close=True), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True) + ]) + + elif dataset_class_name == 'WaymoDataset': + file_client_args = dict(backend='disk') + dataset_cfg.update( + test_mode=False, + split='training', + modality=dict( + use_lidar=True, + use_depth=False, + use_lidar_intensity=True, + use_camera=False, + ), + pipeline=[ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=6, + use_dim=5, + file_client_args=file_client_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + file_client_args=file_client_args) + ]) + + dataset = build_dataset(dataset_cfg) + + if database_save_path is None: + database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') + if db_info_save_path is None: + db_info_save_path = osp.join(data_path, + f'{info_prefix}_dbinfos_train.pkl') + mmcv.mkdir_or_exist(database_save_path) + all_db_infos = dict() + if with_mask: + coco = COCO(osp.join(data_path, mask_anno_path)) + imgIds = coco.getImgIds() + file2id = dict() + for i in imgIds: + info = 
coco.loadImgs([i])[0] + file2id.update({info['file_name']: i}) + + group_counter = 0 + for j in track_iter_progress(list(range(len(dataset)))): + input_dict = dataset.get_data_info(j) + dataset.pre_pipeline(input_dict) + example = dataset.pipeline(input_dict) + annos = example['ann_info'] + image_idx = example['sample_idx'] + points = example['points'].tensor.numpy() + gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() + names = annos['gt_names'] + group_dict = dict() + if 'group_ids' in annos: + group_ids = annos['group_ids'] + else: + group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) + difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) + if 'difficulty' in annos: + difficulty = annos['difficulty'] + + num_obj = gt_boxes_3d.shape[0] + point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) + + if with_mask: + # prepare masks + gt_boxes = annos['gt_bboxes'] + img_path = osp.split(example['img_info']['filename'])[-1] + if img_path not in file2id.keys(): + print(f'skip image {img_path} for empty mask') + continue + img_id = file2id[img_path] + kins_annIds = coco.getAnnIds(imgIds=img_id) + kins_raw_info = coco.loadAnns(kins_annIds) + kins_ann_info = _parse_coco_ann_info(kins_raw_info) + h, w = annos['img_shape'][:2] + gt_masks = [ + _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] + ] + # get mask inds based on iou mapping + bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) + mask_inds = bbox_iou.argmax(axis=0) + valid_inds = (bbox_iou.max(axis=0) > 0.5) + + # mask the image + # use more precise crop when it is ready + # object_img_patches = np.ascontiguousarray( + # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) + # crop image patches using roi_align + # object_img_patches = crop_image_patch_v2( + # torch.Tensor(gt_boxes), + # torch.Tensor(mask_inds).long(), object_img_patches) + object_img_patches, object_masks = crop_image_patch( + gt_boxes, gt_masks, mask_inds, annos['img']) + + for i in range(num_obj): + filename = f'{image_idx}_{names[i]}_{i}.bin' + abs_filepath = osp.join(database_save_path, filename) + rel_filepath = osp.join(f'{info_prefix}_gt_database', filename) + + # save point clouds and image patches for each object + gt_points = points[point_indices[:, i]] + gt_points[:, :3] -= gt_boxes_3d[i, :3] + + if with_mask: + if object_masks[i].sum() == 0 or not valid_inds[i]: + # Skip object for empty or invalid mask + continue + img_patch_path = abs_filepath + '.png' + mask_patch_path = abs_filepath + '.mask.png' + mmcv.imwrite(object_img_patches[i], img_patch_path) + mmcv.imwrite(object_masks[i], mask_patch_path) + + with open(abs_filepath, 'w') as f: + gt_points.tofile(f) + + if (used_classes is None) or names[i] in used_classes: + db_info = { + 'name': names[i], + 'path': rel_filepath, + 'image_idx': image_idx, + 'gt_idx': i, + 'box3d_lidar': gt_boxes_3d[i], + 'num_points_in_gt': gt_points.shape[0], + 'difficulty': difficulty[i], + } + local_group_id = group_ids[i] + # if local_group_id >= 0: + if local_group_id not in group_dict: + group_dict[local_group_id] = group_counter + group_counter += 1 + db_info['group_id'] = group_dict[local_group_id] + if 'score' in annos: + db_info['score'] = annos['score'][i] + if with_mask: + db_info.update({'box2d_camera': gt_boxes[i]}) + if names[i] in all_db_infos: + all_db_infos[names[i]].append(db_info) + else: + all_db_infos[names[i]] = [db_info] + + for k, v in all_db_infos.items(): + print(f'load {len(v)} {k} database infos') + + with open(db_info_save_path, 'wb') as f: + 
pickle.dump(all_db_infos, f) diff --git a/adzoo/vad/data_converter/vad_nuscenes_converter.py b/adzoo/vad/data_converter/vad_nuscenes_converter.py new file mode 100644 index 0000000..338051c --- /dev/null +++ b/adzoo/vad/data_converter/vad_nuscenes_converter.py @@ -0,0 +1,1005 @@ +import os +import math +import copy +import argparse +from os import path as osp +from collections import OrderedDict +from typing import List, Tuple, Union + +import mmcv +import numpy as np +from pyquaternion import Quaternion +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.data_classes import Box +from shapely.geometry import MultiPoint, box +from mmdet3d.datasets import NuScenesDataset +from nuscenes.utils.geometry_utils import view_points +from mmdet3d.core.bbox.box_np_ops import points_cam2img +from nuscenes.utils.geometry_utils import transform_matrix + + +nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + +nus_attributes = ('cycle.with_rider', 'cycle.without_rider', + 'pedestrian.moving', 'pedestrian.standing', + 'pedestrian.sitting_lying_down', 'vehicle.moving', + 'vehicle.parked', 'vehicle.stopped', 'None') + +ego_width, ego_length = 1.85, 4.084 + +def quart_to_rpy(qua): + x, y, z, w = qua + roll = math.atan2(2 * (w * x + y * z), 1 - 2 * (x * x + y * y)) + pitch = math.asin(2 * (w * y - x * z)) + yaw = math.atan2(2 * (w * z + x * y), 1 - 2 * (z * z + y * y)) + return roll, pitch, yaw + +def locate_message(utimes, utime): + i = np.searchsorted(utimes, utime) + if i == len(utimes) or (i > 0 and utime - utimes[i-1] < utimes[i] - utime): + i -= 1 + return i + + +def create_nuscenes_infos(root_path, + out_path, + can_bus_root_path, + info_prefix, + version='v1.0-trainval', + max_sweeps=10): + """Create info file of nuscene dataset. + + Given the raw data, generate its related info file in pkl format. + + Args: + root_path (str): Path of the data root. + info_prefix (str): Prefix of the info file to be generated. + version (str): Version of the data. + Default: 'v1.0-trainval' + max_sweeps (int): Max number of sweeps. + Default: 10 + """ + from nuscenes.nuscenes import NuScenes + from nuscenes.can_bus.can_bus_api import NuScenesCanBus + print(version, root_path) + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + nusc_can_bus = NuScenesCanBus(dataroot=can_bus_root_path) + from nuscenes.utils import splits + available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] + assert version in available_vers + if version == 'v1.0-trainval': + train_scenes = splits.train + val_scenes = splits.val + elif version == 'v1.0-test': + train_scenes = splits.test + val_scenes = [] + elif version == 'v1.0-mini': + train_scenes = splits.mini_train + val_scenes = splits.mini_val + else: + raise ValueError('unknown') + + # filter existing scenes. 
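+    # Note that splits.train / splits.val above are lists of scene *names*;
+    # get_available_scenes() keeps only scenes whose lidar files actually exist
+    # on disk, and the set comprehensions below convert the surviving names
+    # into scene *tokens*, which are then passed to _fill_trainval_infos.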
+ available_scenes = get_available_scenes(nusc) + available_scene_names = [s['name'] for s in available_scenes] + train_scenes = list( + filter(lambda x: x in available_scene_names, train_scenes)) + val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) + train_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in train_scenes + ]) + val_scenes = set([ + available_scenes[available_scene_names.index(s)]['token'] + for s in val_scenes + ]) + + test = 'test' in version + if test: + print('test scene: {}'.format(len(train_scenes))) + else: + print('train scene: {}, val scene: {}'.format( + len(train_scenes), len(val_scenes))) + + train_nusc_infos, val_nusc_infos = _fill_trainval_infos( + nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) + + metadata = dict(version=version) + if test: + print('test sample: {}'.format(len(train_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_test.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + else: + print('train sample: {}, val sample: {}'.format( + len(train_nusc_infos), len(val_nusc_infos))) + data = dict(infos=train_nusc_infos, metadata=metadata) + info_path = osp.join(out_path, + '{}_infos_temporal_train.pkl'.format(info_prefix)) + mmcv.dump(data, info_path) + data['infos'] = val_nusc_infos + info_val_path = osp.join(out_path, + '{}_infos_temporal_val.pkl'.format(info_prefix)) + mmcv.dump(data, info_val_path) + + +def get_available_scenes(nusc): + """Get available scenes from the input nuscenes class. + + Given the raw data, get the information of available scenes for + further info generation. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + + Returns: + available_scenes (list[dict]): List of basic information for the + available scenes. + """ + available_scenes = [] + print('total scene num: {}'.format(len(nusc.scene))) + for scene in nusc.scene: + scene_token = scene['token'] + scene_rec = nusc.get('scene', scene_token) + sample_rec = nusc.get('sample', scene_rec['first_sample_token']) + sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) + has_more_frames = True + scene_not_exist = False + while has_more_frames: + lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) + lidar_path = str(lidar_path) + if os.getcwd() in lidar_path: + # path from lyftdataset is absolute path + lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] + # relative path + if not mmcv.is_filepath(lidar_path): + scene_not_exist = True + break + else: + break + if scene_not_exist: + continue + available_scenes.append(scene) + print('exist scene num: {}'.format(len(available_scenes))) + return available_scenes + + +def _get_can_bus_info(nusc, nusc_can_bus, sample): + scene_name = nusc.get('scene', sample['scene_token'])['name'] + sample_timestamp = sample['timestamp'] + try: + pose_list = nusc_can_bus.get_messages(scene_name, 'pose') + except: + return np.zeros(18) # server scenes do not have can bus information. 
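+    # Layout of the 18-dim vector assembled below, assuming the standard keys
+    # of a nuScenes CAN bus 'pose' message (pos, orientation, accel,
+    # rotation_rate, vel): translation (3) + orientation quaternion (4) +
+    # accel / rotation_rate / velocity (3 each) = 16 values, followed by two
+    # placeholder zeros.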
+ can_bus = [] + # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp + last_pose = pose_list[0] + for i, pose in enumerate(pose_list): + if pose['utime'] > sample_timestamp: + break + last_pose = pose + _ = last_pose.pop('utime') # useless + pos = last_pose.pop('pos') + rotation = last_pose.pop('orientation') + can_bus.extend(pos) + can_bus.extend(rotation) + for key in last_pose.keys(): + can_bus.extend(pose[key]) # 16 elements + can_bus.extend([0., 0.]) + return np.array(can_bus) + + +def _fill_trainval_infos(nusc, + nusc_can_bus, + train_scenes, + val_scenes, + test=False, + max_sweeps=10, + fut_ts=6, + his_ts=2): + """Generate the train/val infos from the raw data. + + Args: + nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. + train_scenes (list[str]): Basic information of training scenes. + val_scenes (list[str]): Basic information of validation scenes. + test (bool): Whether use the test mode. In the test mode, no + annotations can be accessed. Default: False. + max_sweeps (int): Max number of sweeps. Default: 10. + + Returns: + tuple[list[dict]]: Information of training set and validation set + that will be saved to the info file. + """ + train_nusc_infos = [] + val_nusc_infos = [] + frame_idx = 0 + cat2idx = {} + for idx, dic in enumerate(nusc.category): + cat2idx[dic['name']] = idx + + for sample in mmcv.track_iter_progress(nusc.sample): + map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location'] + lidar_token = sample['data']['LIDAR_TOP'] + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + if sample['prev'] != '': + sample_prev = nusc.get('sample', sample['prev']) + sd_rec_prev = nusc.get('sample_data', sample_prev['data']['LIDAR_TOP']) + pose_record_prev = nusc.get('ego_pose', sd_rec_prev['ego_pose_token']) + else: + pose_record_prev = None + if sample['next'] != '': + sample_next = nusc.get('sample', sample['next']) + sd_rec_next = nusc.get('sample_data', sample_next['data']['LIDAR_TOP']) + pose_record_next = nusc.get('ego_pose', sd_rec_next['ego_pose_token']) + else: + pose_record_next = None + + lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) + + mmcv.check_file_exist(lidar_path) + can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) + fut_valid_flag = True + test_sample = copy.deepcopy(sample) + for i in range(fut_ts): + if test_sample['next'] != '': + test_sample = nusc.get('sample', test_sample['next']) + else: + fut_valid_flag = False + ## + info = { + 'lidar_path': lidar_path, + 'token': sample['token'], + 'prev': sample['prev'], + 'next': sample['next'], + 'can_bus': can_bus, + 'frame_idx': frame_idx, # temporal related info + 'sweeps': [], + 'cams': dict(), + 'scene_token': sample['scene_token'], # temporal related info + 'lidar2ego_translation': cs_record['translation'], + 'lidar2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sample['timestamp'], + 'fut_valid_flag': fut_valid_flag, + 'map_location': map_location + } + + if sample['next'] == '': + frame_idx = 0 + else: + frame_idx += 1 + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + 
e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + # obtain 6 image's information per frame + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + for cam in camera_types: + cam_token = sample['data'][cam] + cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) + cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, + e2g_t, e2g_r_mat, cam) + cam_info.update(cam_intrinsic=cam_intrinsic) + info['cams'].update({cam: cam_info}) + + # obtain sweeps for a single key-frame + sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) + sweeps = [] + while len(sweeps) < max_sweeps: + if not sd_rec['prev'] == '': + sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, + l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') + sweeps.append(sweep) + sd_rec = nusc.get('sample_data', sd_rec['prev']) + else: + break + info['sweeps'] = sweeps + # obtain annotation + if not test: + annotations = [ + nusc.get('sample_annotation', token) + for token in sample['anns'] + ] + locs = np.array([b.center for b in boxes]).reshape(-1, 3) + dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) + rots = np.array([b.orientation.yaw_pitch_roll[0] + for b in boxes]).reshape(-1, 1) + velocity = np.array( + [nusc.box_velocity(token)[:2] for token in sample['anns']]) + valid_flag = np.array( + [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 + for anno in annotations], + dtype=bool).reshape(-1) + # convert velo from global to lidar + for i in range(len(boxes)): + velo = np.array([*velocity[i], 0.0]) + velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( + l2e_r_mat).T + velocity[i] = velo[:2] + + names = [b.name for b in boxes] + for i in range(len(names)): + if names[i] in NuScenesDataset.NameMapping: + names[i] = NuScenesDataset.NameMapping[names[i]] + names = np.array(names) + # we need to convert rot to SECOND format. + gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) + assert len(gt_boxes) == len( + annotations), f'{len(gt_boxes)}, {len(annotations)}' + + # get future coords for each box + # [num_box, fut_ts*2] + num_box = len(boxes) + gt_fut_trajs = np.zeros((num_box, fut_ts, 2)) + gt_fut_yaw = np.zeros((num_box, fut_ts)) + gt_fut_masks = np.zeros((num_box, fut_ts)) + gt_boxes_yaw = -(gt_boxes[:,6] + np.pi / 2) + # agent lcf feat (x, y, yaw, vx, vy, width, length, height, type) + agent_lcf_feat = np.zeros((num_box, 9)) + gt_fut_goal = np.zeros((num_box)) + for i, anno in enumerate(annotations): + cur_box = boxes[i] + cur_anno = anno + agent_lcf_feat[i, 0:2] = cur_box.center[:2] + agent_lcf_feat[i, 2] = gt_boxes_yaw[i] + agent_lcf_feat[i, 3:5] = velocity[i] + agent_lcf_feat[i, 5:8] = anno['size'] # width,length,height + agent_lcf_feat[i, 8] = cat2idx[anno['category_name']] if anno['category_name'] in cat2idx.keys() else -1 + for j in range(fut_ts): + if cur_anno['next'] != '': + anno_next = nusc.get('sample_annotation', cur_anno['next']) + box_next = Box( + anno_next['translation'], anno_next['size'], Quaternion(anno_next['rotation']) + ) + # Move box to ego vehicle coord system. + box_next.translate(-np.array(pose_record['translation'])) + box_next.rotate(Quaternion(pose_record['rotation']).inverse) + # Move box to sensor coord system. 
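+ # (pose_record / cs_record belong to the *current* key frame, so the future
+ # box centers below end up in the same lidar frame as cur_box before the
+ # per-step offsets are taken)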
+ box_next.translate(-np.array(cs_record['translation'])) + box_next.rotate(Quaternion(cs_record['rotation']).inverse) + gt_fut_trajs[i, j] = box_next.center[:2] - cur_box.center[:2] + gt_fut_masks[i, j] = 1 + # add yaw diff + _, _, box_yaw = quart_to_rpy([cur_box.orientation.x, cur_box.orientation.y, + cur_box.orientation.z, cur_box.orientation.w]) + _, _, box_yaw_next = quart_to_rpy([box_next.orientation.x, box_next.orientation.y, + box_next.orientation.z, box_next.orientation.w]) + gt_fut_yaw[i, j] = box_yaw_next - box_yaw + cur_anno = anno_next + cur_box = box_next + else: + gt_fut_trajs[i, j:] = 0 + break + # get agent goal + gt_fut_coords = np.cumsum(gt_fut_trajs[i], axis=-2) + coord_diff = gt_fut_coords[-1] - gt_fut_coords[0] + if coord_diff.max() < 1.0: # static + gt_fut_goal[i] = 9 + else: + box_mot_yaw = np.arctan2(coord_diff[1], coord_diff[0]) + np.pi + gt_fut_goal[i] = box_mot_yaw // (np.pi / 4) # 0-8: goal direction class + + # get ego history traj (offset format) + ego_his_trajs = np.zeros((his_ts+1, 3)) + ego_his_trajs_diff = np.zeros((his_ts+1, 3)) + sample_cur = sample + for i in range(his_ts, -1, -1): + if sample_cur is not None: + pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False) + ego_his_trajs[i] = pose_mat[:3, 3] + has_prev = sample_cur['prev'] != '' + has_next = sample_cur['next'] != '' + if has_next: + sample_next = nusc.get('sample', sample_cur['next']) + pose_mat_next = get_global_sensor_pose(sample_next, nusc, inverse=False) + ego_his_trajs_diff[i] = pose_mat_next[:3, 3] - ego_his_trajs[i] + sample_cur = nusc.get('sample', sample_cur['prev']) if has_prev else None + else: + ego_his_trajs[i] = ego_his_trajs[i+1] - ego_his_trajs_diff[i+1] + ego_his_trajs_diff[i] = ego_his_trajs_diff[i+1] + + # global to ego at lcf + ego_his_trajs = ego_his_trajs - np.array(pose_record['translation']) + rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix + ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T + # ego to lidar at lcf + ego_his_trajs = ego_his_trajs - np.array(cs_record['translation']) + rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix + ego_his_trajs = np.dot(rot_mat, ego_his_trajs.T).T + ego_his_trajs = ego_his_trajs[1:] - ego_his_trajs[:-1] + + # get ego futute traj (offset format) + ego_fut_trajs = np.zeros((fut_ts+1, 3)) + ego_fut_masks = np.zeros((fut_ts+1)) + sample_cur = sample + for i in range(fut_ts+1): + pose_mat = get_global_sensor_pose(sample_cur, nusc, inverse=False) + ego_fut_trajs[i] = pose_mat[:3, 3] + ego_fut_masks[i] = 1 + if sample_cur['next'] == '': + ego_fut_trajs[i+1:] = ego_fut_trajs[i] + break + else: + sample_cur = nusc.get('sample', sample_cur['next']) + # global to ego at lcf + ego_fut_trajs = ego_fut_trajs - np.array(pose_record['translation']) + rot_mat = Quaternion(pose_record['rotation']).inverse.rotation_matrix + ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T + # ego to lidar at lcf + ego_fut_trajs = ego_fut_trajs - np.array(cs_record['translation']) + rot_mat = Quaternion(cs_record['rotation']).inverse.rotation_matrix + ego_fut_trajs = np.dot(rot_mat, ego_fut_trajs.T).T + # drive command according to final fut step offset from lcf + if ego_fut_trajs[-1][0] >= 2: + command = np.array([1, 0, 0]) # Turn Right + elif ego_fut_trajs[-1][0] <= -2: + command = np.array([0, 1, 0]) # Turn Left + else: + command = np.array([0, 0, 1]) # Go Straight + # offset from lcf -> per-step offset + ego_fut_trajs = ego_fut_trajs[1:] - ego_fut_trajs[:-1] + + ### ego lcf feat (vx, vy, ax, ay, w, length, 
width, vel, steer), w: yaw rate
+ ego_lcf_feat = np.zeros(9)
+ # estimate the ego velocity and acceleration from the odometry (ego pose) records
+ _, _, ego_yaw = quart_to_rpy(pose_record['rotation'])
+ ego_pos = np.array(pose_record['translation'])
+ if pose_record_prev is not None:
+ _, _, ego_yaw_prev = quart_to_rpy(pose_record_prev['rotation'])
+ ego_pos_prev = np.array(pose_record_prev['translation'])
+ if pose_record_next is not None:
+ _, _, ego_yaw_next = quart_to_rpy(pose_record_next['rotation'])
+ ego_pos_next = np.array(pose_record_next['translation'])
+ assert (pose_record_prev is not None) or (pose_record_next is not None), 'prev token and next token all empty'
+ if pose_record_prev is not None:
+ ego_w = (ego_yaw - ego_yaw_prev) / 0.5
+ ego_v = np.linalg.norm(ego_pos[:2] - ego_pos_prev[:2]) / 0.5
+ ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2)
+ else:
+ ego_w = (ego_yaw_next - ego_yaw) / 0.5
+ ego_v = np.linalg.norm(ego_pos_next[:2] - ego_pos[:2]) / 0.5
+ ego_vx, ego_vy = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2)
+
+ ref_scene = nusc.get("scene", sample['scene_token'])
+ try:
+ pose_msgs = nusc_can_bus.get_messages(ref_scene['name'],'pose')
+ steer_msgs = nusc_can_bus.get_messages(ref_scene['name'], 'steeranglefeedback')
+ pose_uts = [msg['utime'] for msg in pose_msgs]
+ steer_uts = [msg['utime'] for msg in steer_msgs]
+ ref_utime = sample['timestamp']
+ pose_index = locate_message(pose_uts, ref_utime)
+ pose_data = pose_msgs[pose_index]
+ steer_index = locate_message(steer_uts, ref_utime)
+ steer_data = steer_msgs[steer_index]
+ # initial speed
+ v0 = pose_data["vel"][0] # [0] means longitudinal velocity m/s
+ # curvature (positive: turn left)
+ steering = steer_data["value"]
+ # flip x axis if in left-hand traffic (singapore)
+ flip_flag = True if map_location.startswith('singapore') else False
+ if flip_flag:
+ steering *= -1
+ Kappa = 2 * steering / 2.588
+ except:
+ delta_x = ego_his_trajs[-1, 0] + ego_fut_trajs[0, 0]
+ delta_y = ego_his_trajs[-1, 1] + ego_fut_trajs[0, 1]
+ v0 = np.sqrt(delta_x**2 + delta_y**2)
+ Kappa = 0
+
+ ego_lcf_feat[:2] = np.array([ego_vx, ego_vy]) #can_bus[13:15]
+ ego_lcf_feat[2:4] = can_bus[7:9]
+ ego_lcf_feat[4] = ego_w #can_bus[12]
+ ego_lcf_feat[5:7] = np.array([ego_length, ego_width])
+ ego_lcf_feat[7] = v0
+ ego_lcf_feat[8] = Kappa
+
+ info['gt_boxes'] = gt_boxes
+ info['gt_names'] = names
+ info['gt_velocity'] = velocity.reshape(-1, 2)
+ info['num_lidar_pts'] = np.array(
+ [a['num_lidar_pts'] for a in annotations])
+ info['num_radar_pts'] = np.array(
+ [a['num_radar_pts'] for a in annotations])
+ info['valid_flag'] = valid_flag
+ info['gt_agent_fut_trajs'] = gt_fut_trajs.reshape(-1, fut_ts*2).astype(np.float32)
+ info['gt_agent_fut_masks'] = gt_fut_masks.reshape(-1, fut_ts).astype(np.float32)
+ info['gt_agent_lcf_feat'] = agent_lcf_feat.astype(np.float32)
+ info['gt_agent_fut_yaw'] = gt_fut_yaw.astype(np.float32)
+ info['gt_agent_fut_goal'] = gt_fut_goal.astype(np.float32)
+ info['gt_ego_his_trajs'] = ego_his_trajs[:, :2].astype(np.float32)
+ info['gt_ego_fut_trajs'] = ego_fut_trajs[:, :2].astype(np.float32)
+ info['gt_ego_fut_masks'] = ego_fut_masks[1:].astype(np.float32)
+ info['gt_ego_fut_cmd'] = command.astype(np.float32)
+ info['gt_ego_lcf_feat'] = ego_lcf_feat.astype(np.float32)
+
+ if sample['scene_token'] in train_scenes:
+ train_nusc_infos.append(info)
+ else:
+ val_nusc_infos.append(info)
+
+ return train_nusc_infos, val_nusc_infos
+
+def get_global_sensor_pose(rec, nusc, inverse=False):
+ lidar_sample_data =
nusc.get('sample_data', rec['data']['LIDAR_TOP']) + + sd_ep = nusc.get("ego_pose", lidar_sample_data["ego_pose_token"]) + sd_cs = nusc.get("calibrated_sensor", lidar_sample_data["calibrated_sensor_token"]) + if inverse is False: + global_from_ego = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=False) + ego_from_sensor = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=False) + pose = global_from_ego.dot(ego_from_sensor) + # translation equivalent writing + # pose_translation = np.array(sd_cs["translation"]) + # rot_mat = Quaternion(sd_ep['rotation']).rotation_matrix + # pose_translation = np.dot(rot_mat, pose_translation) + # # pose_translation = pose[:3, 3] + # pose_translation = pose_translation + np.array(sd_ep["translation"]) + else: + sensor_from_ego = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=True) + ego_from_global = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=True) + pose = sensor_from_ego.dot(ego_from_global) + return pose + +def obtain_sensor2top(nusc, + sensor_token, + l2e_t, + l2e_r_mat, + e2g_t, + e2g_r_mat, + sensor_type='lidar'): + """Obtain the info with RT matric from general sensor to Top LiDAR. + + Args: + nusc (class): Dataset class in the nuScenes dataset. + sensor_token (str): Sample data token corresponding to the + specific sensor type. + l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). + l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego + in shape (3, 3). + e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). + e2g_r_mat (np.ndarray): Rotation matrix from ego to global + in shape (3, 3). + sensor_type (str): Sensor to calibrate. Default: 'lidar'. + + Returns: + sweep (dict): Sweep information after transformation. + """ + sd_rec = nusc.get('sample_data', sensor_token) + cs_record = nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) + data_path = str(nusc.get_sample_data_path(sd_rec['token'])) + if os.getcwd() in data_path: # path from lyftdataset is absolute path + data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path + sweep = { + 'data_path': data_path, + 'type': sensor_type, + 'sample_data_token': sd_rec['token'], + 'sensor2ego_translation': cs_record['translation'], + 'sensor2ego_rotation': cs_record['rotation'], + 'ego2global_translation': pose_record['translation'], + 'ego2global_rotation': pose_record['rotation'], + 'timestamp': sd_rec['timestamp'] + } + + l2e_r_s = sweep['sensor2ego_rotation'] + l2e_t_s = sweep['sensor2ego_translation'] + e2g_r_s = sweep['ego2global_rotation'] + e2g_t_s = sweep['ego2global_translation'] + + # obtain the RT from sensor to Top LiDAR + # sweep->ego->global->ego'->lidar + l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix + e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix + R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( + np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) + T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T + ) + l2e_t @ np.linalg.inv(l2e_r_mat).T + sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T + sweep['sensor2lidar_translation'] = T + return sweep + + +def export_2d_annotation(root_path, info_path, version, mono3d=False): + """Export 2d annotation from the info file and raw data. 
+ + Args: + root_path (str): Root path of the raw data. + info_path (str): Path of the info file. + version (str): Dataset version. + mono3d (bool): Whether to export mono3d annotation. Default: False. + """ + # get bbox annotations for camera + camera_types = [ + 'CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_FRONT_LEFT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_BACK_RIGHT', + ] + nusc_infos = mmcv.load(info_path)['infos'] + nusc = NuScenes(version=version, dataroot=root_path, verbose=True) + # info_2d_list = [] + cat2Ids = [ + dict(id=nus_categories.index(cat_name), name=cat_name) + for cat_name in nus_categories + ] + coco_ann_id = 0 + coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) + for info in mmcv.track_iter_progress(nusc_infos): + for cam in camera_types: + cam_info = info['cams'][cam] + coco_infos = get_2d_boxes( + nusc, + cam_info['sample_data_token'], + visibilities=['', '1', '2', '3', '4'], + mono3d=mono3d) + (height, width, _) = mmcv.imread(cam_info['data_path']).shape + coco_2d_dict['images'].append( + dict( + file_name=cam_info['data_path'].split('data/nuscenes/') + [-1], + id=cam_info['sample_data_token'], + token=info['token'], + cam2ego_rotation=cam_info['sensor2ego_rotation'], + cam2ego_translation=cam_info['sensor2ego_translation'], + ego2global_rotation=info['ego2global_rotation'], + ego2global_translation=info['ego2global_translation'], + cam_intrinsic=cam_info['cam_intrinsic'], + width=width, + height=height)) + for coco_info in coco_infos: + if coco_info is None: + continue + # add an empty key for coco format + coco_info['segmentation'] = [] + coco_info['id'] = coco_ann_id + coco_2d_dict['annotations'].append(coco_info) + coco_ann_id += 1 + if mono3d: + json_prefix = f'{info_path[:-4]}_mono3d' + else: + json_prefix = f'{info_path[:-4]}' + mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') + + +def get_2d_boxes(nusc, + sample_data_token: str, + visibilities: List[str], + mono3d=True): + """Get the 2D annotation records for a given `sample_data_token`. + + Args: + sample_data_token (str): Sample data token belonging to a camera \ + keyframe. + visibilities (list[str]): Visibility filter. + mono3d (bool): Whether to get boxes with mono3d annotation. + + Return: + list[dict]: List of 2D annotation record that belongs to the input + `sample_data_token`. + """ + + # Get the sample data and the sample corresponding to that sample data. + sd_rec = nusc.get('sample_data', sample_data_token) + + assert sd_rec[ + 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ + ' for camera sample_data!' + if not sd_rec['is_key_frame']: + raise ValueError( + 'The 2D re-projections are available only for keyframes.') + + s_rec = nusc.get('sample', sd_rec['sample_token']) + + # Get the calibrated sensor and ego pose + # record to get the transformation matrices. + cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) + pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) + camera_intrinsic = np.array(cs_rec['camera_intrinsic']) + + # Get all the annotation with the specified visibilties. + ann_recs = [ + nusc.get('sample_annotation', token) for token in s_rec['anns'] + ] + ann_recs = [ + ann_rec for ann_rec in ann_recs + if (ann_rec['visibility_token'] in visibilities) + ] + + repro_recs = [] + + for ann_rec in ann_recs: + # Augment sample_annotation with token information. + ann_rec['sample_annotation_token'] = ann_rec['token'] + ann_rec['sample_data_token'] = sample_data_token + + # Get the box in global coordinates. 
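+ # (nusc.get_box returns the annotation as a Box in the global frame; the two
+ # translate/rotate pairs below then map it global -> ego -> camera)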
+ box = nusc.get_box(ann_rec['token']) + + # Move them to the ego-pose frame. + box.translate(-np.array(pose_rec['translation'])) + box.rotate(Quaternion(pose_rec['rotation']).inverse) + + # Move them to the calibrated sensor frame. + box.translate(-np.array(cs_rec['translation'])) + box.rotate(Quaternion(cs_rec['rotation']).inverse) + + # Filter out the corners that are not in front of the calibrated + # sensor. + corners_3d = box.corners() + in_front = np.argwhere(corners_3d[2, :] > 0).flatten() + corners_3d = corners_3d[:, in_front] + + # Project 3d box to 2d. + corner_coords = view_points(corners_3d, camera_intrinsic, + True).T[:, :2].tolist() + + # Keep only corners that fall within the image. + final_coords = post_process_coords(corner_coords) + + # Skip if the convex hull of the re-projected corners + # does not intersect the image canvas. + if final_coords is None: + continue + else: + min_x, min_y, max_x, max_y = final_coords + + # Generate dictionary record to be included in the .json file. + repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, + sample_data_token, sd_rec['filename']) + + # If mono3d=True, add 3D annotations in camera coordinates + if mono3d and (repro_rec is not None): + loc = box.center.tolist() + + dim = box.wlh + dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw + dim = dim.tolist() + + rot = box.orientation.yaw_pitch_roll[0] + rot = [-rot] # convert the rot to our cam coordinate + + global_velo2d = nusc.box_velocity(box.token)[:2] + global_velo3d = np.array([*global_velo2d, 0.0]) + e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix + c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix + cam_velo3d = global_velo3d @ np.linalg.inv( + e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T + velo = cam_velo3d[0::2].tolist() + + repro_rec['bbox_cam3d'] = loc + dim + rot + repro_rec['velo_cam3d'] = velo + + center3d = np.array(loc).reshape([1, 3]) + center2d = points_cam2img( + center3d, camera_intrinsic, with_depth=True) + repro_rec['center2d'] = center2d.squeeze().tolist() + # normalized center2D + depth + # if samples with depth < 0 will be removed + if repro_rec['center2d'][2] <= 0: + continue + + ann_token = nusc.get('sample_annotation', + box.token)['attribute_tokens'] + if len(ann_token) == 0: + attr_name = 'None' + else: + attr_name = nusc.get('attribute', ann_token[0])['name'] + attr_id = nus_attributes.index(attr_name) + repro_rec['attribute_name'] = attr_name + repro_rec['attribute_id'] = attr_id + + repro_recs.append(repro_rec) + + return repro_recs + + +def post_process_coords( + corner_coords: List, imsize: Tuple[int, int] = (1600, 900) +) -> Union[Tuple[float, float, float, float], None]: + """Get the intersection of the convex hull of the reprojected bbox corners + and the image canvas, return None if no intersection. + + Args: + corner_coords (list[int]): Corner coordinates of reprojected + bounding box. + imsize (tuple[int]): Size of the image canvas. + + Return: + tuple [float]: Intersection of the convex hull of the 2D box + corners and the image canvas. 
+ """ + polygon_from_2d_box = MultiPoint(corner_coords).convex_hull + img_canvas = box(0, 0, imsize[0], imsize[1]) + + if polygon_from_2d_box.intersects(img_canvas): + img_intersection = polygon_from_2d_box.intersection(img_canvas) + intersection_coords = np.array( + [coord for coord in img_intersection.exterior.coords]) + + min_x = min(intersection_coords[:, 0]) + min_y = min(intersection_coords[:, 1]) + max_x = max(intersection_coords[:, 0]) + max_y = max(intersection_coords[:, 1]) + + return min_x, min_y, max_x, max_y + else: + return None + + +def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, + sample_data_token: str, filename: str) -> OrderedDict: + """Generate one 2D annotation record given various informations on top of + the 2D bounding box coordinates. + + Args: + ann_rec (dict): Original 3d annotation record. + x1 (float): Minimum value of the x coordinate. + y1 (float): Minimum value of the y coordinate. + x2 (float): Maximum value of the x coordinate. + y2 (float): Maximum value of the y coordinate. + sample_data_token (str): Sample data token. + filename (str):The corresponding image file where the annotation + is present. + + Returns: + dict: A sample 2D annotation record. + - file_name (str): flie name + - image_id (str): sample data token + - area (float): 2d box area + - category_name (str): category name + - category_id (int): category id + - bbox (list[float]): left x, top y, dx, dy of 2d box + - iscrowd (int): whether the area is crowd + """ + repro_rec = OrderedDict() + repro_rec['sample_data_token'] = sample_data_token + coco_rec = dict() + + relevant_keys = [ + 'attribute_tokens', + 'category_name', + 'instance_token', + 'next', + 'num_lidar_pts', + 'num_radar_pts', + 'prev', + 'sample_annotation_token', + 'sample_data_token', + 'visibility_token', + ] + + for key, value in ann_rec.items(): + if key in relevant_keys: + repro_rec[key] = value + + repro_rec['bbox_corners'] = [x1, y1, x2, y2] + repro_rec['filename'] = filename + + coco_rec['file_name'] = filename + coco_rec['image_id'] = sample_data_token + coco_rec['area'] = (y2 - y1) * (x2 - x1) + + if repro_rec['category_name'] not in NuScenesDataset.NameMapping: + return None + cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] + coco_rec['category_name'] = cat_name + coco_rec['category_id'] = nus_categories.index(cat_name) + coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] + coco_rec['iscrowd'] = 0 + + return coco_rec + + +def nuscenes_data_prep(root_path, + can_bus_root_path, + info_prefix, + version, + dataset_name, + out_dir, + max_sweeps=10): + """Prepare data related to nuScenes dataset. + + Related data consists of '.pkl' files recording basic infos, + 2D annotations and groundtruth database. + + Args: + root_path (str): Path of dataset root. + info_prefix (str): The prefix of info filenames. + version (str): Dataset version. + dataset_name (str): The dataset class name. + out_dir (str): Output directory of the groundtruth database info. + max_sweeps (int): Number of input consecutive frames. 
Default: 10 + """ + create_nuscenes_infos( + root_path, out_dir, can_bus_root_path, info_prefix, version=version, max_sweeps=max_sweeps) + + +parser = argparse.ArgumentParser(description='Data converter arg parser') +parser.add_argument('dataset', metavar='kitti', help='name of the dataset') +parser.add_argument( + '--root-path', + type=str, + default='./data/kitti', + help='specify the root path of dataset') +parser.add_argument( + '--canbus', + type=str, + default='./data', + help='specify the root path of nuScenes canbus') +parser.add_argument( + '--version', + type=str, + default='v1.0', + required=False, + help='specify the dataset version, no need for kitti') +parser.add_argument( + '--max-sweeps', + type=int, + default=10, + required=False, + help='specify sweeps of lidar per example') +parser.add_argument( + '--out-dir', + type=str, + default='./data/kitti', + required='False', + help='name of info pkl') +parser.add_argument('--extra-tag', type=str, default='kitti') +parser.add_argument( + '--workers', type=int, default=4, help='number of threads to be used') +args = parser.parse_args() + +if __name__ == '__main__': + if args.dataset == 'nuscenes' and args.version != 'v1.0-mini': + train_version = f'{args.version}-trainval' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + test_version = f'{args.version}-test' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=test_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) + elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': + train_version = f'{args.version}' + nuscenes_data_prep( + root_path=args.root_path, + can_bus_root_path=args.canbus, + info_prefix=args.extra_tag, + version=train_version, + dataset_name='NuScenesDataset', + out_dir=args.out_dir, + max_sweeps=args.max_sweeps) diff --git a/adzoo/vad/dist_test.sh b/adzoo/vad/dist_test.sh new file mode 100755 index 0000000..3e2ec30 --- /dev/null +++ b/adzoo/vad/dist_test.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +CONFIG=$1 +CHECKPOINT=$2 +GPUS=$3 +PORT=${PORT:-29503} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --eval bbox diff --git a/adzoo/vad/dist_train.sh b/adzoo/vad/dist_train.sh new file mode 100755 index 0000000..141b284 --- /dev/null +++ b/adzoo/vad/dist_train.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CONFIG=$1 +GPUS=$2 +PORT=${PORT:-28509} + +PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ +python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \ + $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic diff --git a/adzoo/vad/misc/browse_dataset.py b/adzoo/vad/misc/browse_dataset.py new file mode 100644 index 0000000..e3419f6 --- /dev/null +++ b/adzoo/vad/misc/browse_dataset.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
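+# Browse a dataset config by rendering its ground-truth annotations: 3D boxes or
+# segmentation masks on point clouds, and/or boxes projected onto images,
+# depending on --task. Example invocation (config path and output dir are
+# placeholders):
+#   python adzoo/vad/misc/browse_dataset.py CONFIG --task det --output-dir viz/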
+import argparse +import numpy as np +import warnings +from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress +from os import path as osp + +from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, + DepthInstance3DBoxes, LiDARInstance3DBoxes) +from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, + show_seg_result) +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--skip-type', + type=str, + nargs='+', + default=['Normalize'], + help='skip some useless pipeline') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument( + '--task', + type=str, + choices=['det', 'seg', 'multi_modality-det', 'mono-det'], + help='Determine the visualization method depending on the task.') + parser.add_argument( + '--online', + action='store_true', + help='Whether to perform online visualization. Note that you often ' + 'need a monitor to do so.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def build_data_cfg(config_path, skip_type, cfg_options): + """Build data config for loading visualization data.""" + cfg = Config.fromfile(config_path) + if cfg_options is not None: + cfg.merge_from_dict(cfg_options) + # import modules from string list. 
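+ # (a config may list extra modules under `custom_imports`; importing them here
+ # registers any custom datasets/pipelines before the dataset is built)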
+ if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + # extract inner dataset of `RepeatDataset` as `cfg.data.train` + # so we don't need to worry about it later + if cfg.data.train['type'] == 'RepeatDataset': + cfg.data.train = cfg.data.train.dataset + # use only first dataset for `ConcatDataset` + if cfg.data.train['type'] == 'ConcatDataset': + cfg.data.train = cfg.data.train.datasets[0] + train_data_cfg = cfg.data.train + # eval_pipeline purely consists of loading functions + # use eval_pipeline for data loading + train_data_cfg['pipeline'] = [ + x for x in cfg.eval_pipeline if x['type'] not in skip_type + ] + + return cfg + + +def to_depth_mode(points, bboxes): + """Convert points and bboxes to Depth Coord and Depth Box mode.""" + if points is not None: + points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + if bboxes is not None: + bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, + Box3DMode.DEPTH) + return points, bboxes + + +def show_det_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and 3D bboxes.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor + if dataset.box_mode_3d != Box3DMode.DEPTH: + points, gt_bboxes = to_depth_mode(points, gt_bboxes) + show_result( + points, + gt_bboxes.clone(), + None, + out_dir, + filename, + show=show, + snapshot=True) + + +def show_seg_data(idx, dataset, out_dir, filename, show=False): + """Visualize 3D point cloud and segmentation mask.""" + example = dataset.prepare_train_data(idx) + points = example['points']._data.numpy() + gt_seg = example['pts_semantic_mask']._data.numpy() + show_seg_result( + points, + gt_seg.copy(), + None, + out_dir, + filename, + np.array(dataset.PALETTE), + dataset.ignore_index, + show=show, + snapshot=True) + + +def show_proj_bbox_img(idx, + dataset, + out_dir, + filename, + show=False, + is_nus_mono=False): + """Visualize 3D bboxes on 2D image by projection.""" + try: + example = dataset.prepare_train_data(idx) + except AttributeError: # for Mono-3D datasets + example = dataset.prepare_train_img(idx) + gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] + img_metas = example['img_metas']._data + img = example['img']._data.numpy() + # need to transpose channel to first dim + img = img.transpose(1, 2, 0) + # no 3D gt bboxes, just show img + if gt_bboxes.tensor.shape[0] == 0: + gt_bboxes = None + if isinstance(gt_bboxes, DepthInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + None, + out_dir, + filename, + box_mode='depth', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, LiDARInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['lidar2img'], + out_dir, + filename, + box_mode='lidar', + img_metas=img_metas, + show=show) + elif isinstance(gt_bboxes, CameraInstance3DBoxes): + show_multi_modality_result( + img, + gt_bboxes, + None, + img_metas['cam2img'], + out_dir, + filename, + box_mode='camera', + img_metas=img_metas, + show=show) + else: + # can't project, just show img + warnings.warn( + f'unrecognized gt box type {type(gt_bboxes)}, only show image') + show_multi_modality_result( + img, None, None, None, out_dir, filename, show=show) + + +def main(): + args = parse_args() + + if args.output_dir is not None: + mkdir_or_exist(args.output_dir) + + cfg = 
build_data_cfg(args.config, args.skip_type, args.cfg_options) + try: + dataset = build_dataset( + cfg.data.train, default_args=dict(filter_empty_gt=False)) + except TypeError: # seg dataset doesn't have `filter_empty_gt` key + dataset = build_dataset(cfg.data.train) + data_infos = dataset.data_infos + dataset_type = cfg.dataset_type + + # configure visualization mode + vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' + + for idx, data_info in enumerate(track_iter_progress(data_infos)): + if dataset_type in ['KittiDataset', 'WaymoDataset']: + data_path = data_info['point_cloud']['velodyne_path'] + elif dataset_type in [ + 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', + 'S3DISSegDataset', 'S3DISDataset' + ]: + data_path = data_info['pts_path'] + elif dataset_type in ['NuScenesDataset', 'LyftDataset']: + data_path = data_info['lidar_path'] + elif dataset_type in ['NuScenesMonoDataset']: + data_path = data_info['file_name'] + else: + raise NotImplementedError( + f'unsupported dataset type {dataset_type}') + + file_name = osp.splitext(osp.basename(data_path))[0] + + if vis_task in ['det', 'multi_modality-det']: + # show 3D bboxes on 3D point clouds + show_det_data( + idx, dataset, args.output_dir, file_name, show=args.online) + if vis_task in ['multi_modality-det', 'mono-det']: + # project 3D bboxes to 2D image + show_proj_bbox_img( + idx, + dataset, + args.output_dir, + file_name, + show=args.online, + is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) + elif vis_task in ['seg']: + # show 3D segmentation mask on 3D point clouds + show_seg_data( + idx, dataset, args.output_dir, file_name, show=args.online) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/misc/fuse_conv_bn.py b/adzoo/vad/misc/fuse_conv_bn.py new file mode 100644 index 0000000..d4e2201 --- /dev/null +++ b/adzoo/vad/misc/fuse_conv_bn.py @@ -0,0 +1,67 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import torch +from mmcv.runner import save_checkpoint +from torch import nn as nn + +from mmdet.apis import init_model + + +def fuse_conv_bn(conv, bn): + """During inference, the functionary of batch norm layers is turned off but + only the mean and var alone channels are used, which exposes the chance to + fuse it with the preceding conv layers to save computations and simplify + network structures.""" + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_module(m): + last_conv = None + last_conv_name = None + + for name, child in m.named_children(): + if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = fuse_conv_bn(last_conv, child) + m._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. 
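+ # (a placeholder keeps the child module's name and position, e.g. inside an
+ # nn.Sequential, so the surrounding network definition needs no changes)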
+ m._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_module(child) + return m + + +def parse_args(): + parser = argparse.ArgumentParser( + description='fuse Conv and BN layers in a model') + parser.add_argument('config', help='config file path') + parser.add_argument('checkpoint', help='checkpoint file path') + parser.add_argument('out', help='output path of the converted model') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint) + # fuse conv and bn layers of the model + fused_model = fuse_module(model) + save_checkpoint(fused_model, args.out) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/misc/print_config.py b/adzoo/vad/misc/print_config.py new file mode 100644 index 0000000..3100fc3 --- /dev/null +++ b/adzoo/vad/misc/print_config.py @@ -0,0 +1,26 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +from mmcv import Config, DictAction + + +def parse_args(): + parser = argparse.ArgumentParser(description='Print the whole config') + parser.add_argument('config', help='config file path') + parser.add_argument( + '--options', nargs='+', action=DictAction, help='arguments in dict') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.options is not None: + cfg.merge_from_dict(args.options) + print(f'Config:\n{cfg.pretty_text}') + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/misc/visualize_results.py b/adzoo/vad/misc/visualize_results.py new file mode 100644 index 0000000..302adc5 --- /dev/null +++ b/adzoo/vad/misc/visualize_results.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import mmcv +from mmcv import Config + +from mmdet3d.datasets import build_dataset + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D visualize the results') + parser.add_argument('config', help='test config file path') + parser.add_argument('--result', help='results file in pickle format') + parser.add_argument( + '--show-dir', help='directory where visualize results will be saved') + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + + if args.result is not None and \ + not args.result.endswith(('.pkl', '.pickle')): + raise ValueError('The results file must be a pkl file.') + + cfg = Config.fromfile(args.config) + cfg.data.test.test_mode = True + + # build the dataset + dataset = build_dataset(cfg.data.test) + results = mmcv.load(args.result) + + if getattr(dataset, 'show', None) is not None: + # data loading pipeline for showing + eval_pipeline = cfg.get('eval_pipeline', {}) + if eval_pipeline: + dataset.show(results, args.show_dir, pipeline=eval_pipeline) + else: + dataset.show(results, args.show_dir) # use default pipeline + else: + raise NotImplementedError( + 'Show is not implemented for dataset {}!'.format( + type(dataset).__name__)) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/model_converters/convert_votenet_checkpoints.py b/adzoo/vad/model_converters/convert_votenet_checkpoints.py new file mode 100644 index 0000000..33792b0 --- /dev/null +++ b/adzoo/vad/model_converters/convert_votenet_checkpoints.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
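+# Upgrades VoteNet checkpoints trained with mmdet3d versions before v0.6.0 to the
+# current key naming. Example invocation (file names are placeholders):
+#   python adzoo/vad/model_converters/convert_votenet_checkpoints.py \
+#       votenet_old.pth --out votenet_converted.pth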
+import argparse +import tempfile +import torch +from mmcv import Config +from mmcv.runner import load_state_dict + +from mmdet3d.models import build_detector + + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet3D upgrade model version(before v0.6.0) of VoteNet') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--out', help='path of the output checkpoint file') + args = parser.parse_args() + return args + + +def parse_config(config_strings): + """Parse config from strings. + + Args: + config_strings (string): strings of model config. + + Returns: + Config: model config + """ + temp_file = tempfile.NamedTemporaryFile() + config_path = f'{temp_file.name}.py' + with open(config_path, 'w') as f: + f.write(config_strings) + + config = Config.fromfile(config_path) + + # Update backbone config + if 'pool_mod' in config.model.backbone: + config.model.backbone.pop('pool_mod') + + if 'sa_cfg' not in config.model.backbone: + config.model.backbone['sa_cfg'] = dict( + type='PointSAModule', + pool_mod='max', + use_xyz=True, + normalize_xyz=True) + + if 'type' not in config.model.bbox_head.vote_aggregation_cfg: + config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule' + + # Update bbox_head config + if 'pred_layer_cfg' not in config.model.bbox_head: + config.model.bbox_head['pred_layer_cfg'] = dict( + in_channels=128, shared_conv_channels=(128, 128), bias=True) + + if 'feat_channels' in config.model.bbox_head: + config.model.bbox_head.pop('feat_channels') + + if 'vote_moudule_cfg' in config.model.bbox_head: + config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop( + 'vote_moudule_cfg') + + if config.model.bbox_head.vote_aggregation_cfg.use_xyz: + config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3 + + temp_file.close() + + return config + + +def main(): + """Convert keys in checkpoints for VoteNet. + + There can be some breaking changes during the development of mmdetection3d, + and this tool is used for upgrading checkpoints trained with old versions + (before v0.6.0) to the latest one. 
+ """ + args = parse_args() + checkpoint = torch.load(args.checkpoint) + cfg = parse_config(checkpoint['meta']['config']) + # Build the model and load checkpoint + model = build_detector( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + orig_ckpt = checkpoint['state_dict'] + converted_ckpt = orig_ckpt.copy() + + if cfg['dataset_type'] == 'ScanNetDataset': + NUM_CLASSES = 18 + elif cfg['dataset_type'] == 'SUNRGBDDataset': + NUM_CLASSES = 10 + else: + raise NotImplementedError + + RENAME_PREFIX = { + 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0', + 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1' + } + + DEL_KEYS = [ + 'bbox_head.conv_pred.0.bn.num_batches_tracked', + 'bbox_head.conv_pred.1.bn.num_batches_tracked' + ] + + EXTRACT_KEYS = { + 'bbox_head.conv_pred.conv_cls.weight': + ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_cls.bias': + ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), + 'bbox_head.conv_pred.conv_reg.weight': + ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), + 'bbox_head.conv_pred.conv_reg.bias': + ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) + } + + # Delete some useless keys + for key in DEL_KEYS: + converted_ckpt.pop(key) + + # Rename keys with specific prefix + RENAME_KEYS = dict() + for old_key in converted_ckpt.keys(): + for rename_prefix in RENAME_PREFIX.keys(): + if rename_prefix in old_key: + new_key = old_key.replace(rename_prefix, + RENAME_PREFIX[rename_prefix]) + RENAME_KEYS[new_key] = old_key + for new_key, old_key in RENAME_KEYS.items(): + converted_ckpt[new_key] = converted_ckpt.pop(old_key) + + # Extract weights and rename the keys + for new_key, (old_key, indices) in EXTRACT_KEYS.items(): + cur_layers = orig_ckpt[old_key] + converted_layers = [] + for (start, end) in indices: + if end != -1: + converted_layers.append(cur_layers[start:end]) + else: + converted_layers.append(cur_layers[start:]) + converted_layers = torch.cat(converted_layers, 0) + converted_ckpt[new_key] = converted_layers + if old_key in converted_ckpt.keys(): + converted_ckpt.pop(old_key) + + # Check the converted checkpoint by loading to the model + load_state_dict(model, converted_ckpt, strict=True) + checkpoint['state_dict'] = converted_ckpt + torch.save(checkpoint, args.out) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/model_converters/publish_model.py b/adzoo/vad/model_converters/publish_model.py new file mode 100644 index 0000000..318fd46 --- /dev/null +++ b/adzoo/vad/model_converters/publish_model.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import subprocess +import torch + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Process a checkpoint to be published') + parser.add_argument('in_file', help='input checkpoint filename') + parser.add_argument('out_file', help='output checkpoint filename') + args = parser.parse_args() + return args + + +def process_checkpoint(in_file, out_file): + checkpoint = torch.load(in_file, map_location='cpu') + # remove optimizer for smaller file size + if 'optimizer' in checkpoint: + del checkpoint['optimizer'] + # if it is necessary to remove some sensitive data in checkpoint['meta'], + # add the code here. 
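+ # For example (hypothetical meta keys; keep commented out unless needed):
+ # for key in ('env_info', 'hook_msgs'):
+ #     checkpoint.get('meta', {}).pop(key, None)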
+ torch.save(checkpoint, out_file) + sha = subprocess.check_output(['sha256sum', out_file]).decode() + final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) + subprocess.Popen(['mv', out_file, final_file]) + + +def main(): + args = parse_args() + process_checkpoint(args.in_file, args.out_file) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/model_converters/regnet2mmdet.py b/adzoo/vad/model_converters/regnet2mmdet.py new file mode 100644 index 0000000..9dee3c8 --- /dev/null +++ b/adzoo/vad/model_converters/regnet2mmdet.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import torch +from collections import OrderedDict + + +def convert_stem(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('stem.conv', 'conv1') + new_key = new_key.replace('stem.bn', 'bn1') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_head(model_key, model_weight, state_dict, converted_names): + new_key = model_key.replace('head.fc', 'fc') + state_dict[new_key] = model_weight + converted_names.add(model_key) + print(f'Convert {model_key} to {new_key}') + + +def convert_reslayer(model_key, model_weight, state_dict, converted_names): + split_keys = model_key.split('.') + layer, block, module = split_keys[:3] + block_id = int(block[1:]) + layer_name = f'layer{int(layer[1:])}' + block_name = f'{block_id - 1}' + + if block_id == 1 and module == 'bn': + new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}' + elif block_id == 1 and module == 'proj': + new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}' + elif module == 'f': + if split_keys[3] == 'a_bn': + module_name = 'bn1' + elif split_keys[3] == 'b_bn': + module_name = 'bn2' + elif split_keys[3] == 'c_bn': + module_name = 'bn3' + elif split_keys[3] == 'a': + module_name = 'conv1' + elif split_keys[3] == 'b': + module_name = 'conv2' + elif split_keys[3] == 'c': + module_name = 'conv3' + new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}' + else: + raise ValueError(f'Unsupported conversion of key {model_key}') + print(f'Convert {model_key} to {new_key}') + state_dict[new_key] = model_weight + converted_names.add(model_key) + + +def convert(src, dst): + """Convert keys in pycls pretrained RegNet models to mmdet style.""" + # load caffe model + regnet_model = torch.load(src) + blobs = regnet_model['model_state'] + # convert to pytorch style + state_dict = OrderedDict() + converted_names = set() + for key, weight in blobs.items(): + if 'stem' in key: + convert_stem(key, weight, state_dict, converted_names) + elif 'head' in key: + convert_head(key, weight, state_dict, converted_names) + elif key.startswith('s'): + convert_reslayer(key, weight, state_dict, converted_names) + + # check if all layers are converted + for key in blobs: + if key not in converted_names: + print(f'not converted: {key}') + # save checkpoint + checkpoint = dict() + checkpoint['state_dict'] = state_dict + torch.save(checkpoint, dst) + + +def main(): + parser = argparse.ArgumentParser(description='Convert model keys') + parser.add_argument('src', help='src detectron model path') + parser.add_argument('dst', help='save path') + args = parser.parse_args() + convert(args.src, args.dst) + + +if __name__ == '__main__': + main() diff --git a/adzoo/vad/test.py b/adzoo/vad/test.py new file mode 100644 index 0000000..1733443 --- /dev/null +++ b/adzoo/vad/test.py @@ -0,0 +1,277 @@ +# 
--------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- +import argparse +import os +import torch +import warnings +from mmcv.utils import get_dist_info, init_dist, wrap_fp16_model, set_random_seed, Config, DictAction, load_checkpoint +from mmcv.models import build_model, fuse_conv_bn +from torch.nn import DataParallel +from torch.nn.parallel.distributed import DistributedDataParallel + +from mmcv.datasets import build_dataset, build_dataloader, replace_ImageToTensor +import time +import os.path as osp +from adzoo.vad.apis.test import custom_multi_gpu_test, single_gpu_test + +import warnings +warnings.filterwarnings("ignore") + +def parse_args(): + parser = argparse.ArgumentParser( + description='MMDet test (and eval) a model') + parser.add_argument('config', help='test config file path') + parser.add_argument('checkpoint', help='checkpoint file') + parser.add_argument('--json_dir', help='json parent dir name file') # NOTE: json file parent folder name + parser.add_argument('--out', help='output result file in pickle format') + parser.add_argument( + '--fuse-conv-bn', + action='store_true', + help='Whether to fuse conv and bn, this will slightly increase' + 'the inference speed') + parser.add_argument( + '--format-only', + action='store_true', + help='Format the output results without perform evaluation. It is' + 'useful when you want to format the result to a specific format and ' + 'submit it to the test server') + parser.add_argument( + '--eval', + type=str, + nargs='+', + help='evaluation metrics, which depends on the dataset, e.g., "bbox",' + ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') + parser.add_argument('--show', action='store_true', help='show results') + parser.add_argument( + '--show-dir', help='directory where results will be saved') + parser.add_argument( + '--gpu-collect', + action='store_true', + help='whether to use gpu to collect results.') + parser.add_argument( + '--tmpdir', + help='tmp directory used for collecting results from multiple ' + 'workers, available when gpu-collect is not specified') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function (deprecate), ' + 'change to --eval-options instead.') + parser.add_argument( + '--eval-options', + nargs='+', + action=DictAction, + help='custom options for evaluation, the key-value pair in xxx=yyy ' + 'format will be kwargs for dataset.evaluate() function') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local-rank', type=int, default=0) + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.eval_options: + raise ValueError( + '--options and --eval-options cannot be both specified, ' + '--options is deprecated in favor of --eval-options') + if args.options: + warnings.warn('--options is deprecated in favor of --eval-options') + args.eval_options = args.options + return args + + +def main(): + args = parse_args() + + assert args.out or args.eval or args.format_only or args.show \ + or args.show_dir, \ + ('Please specify at least one operation (save/eval/format/show the ' + 'results / save the results) with the argument "--out", "--eval"' + ', "--format-only", "--show" or "--show-dir"') + + if args.eval and args.format_only: + raise ValueError('--eval and --format_only cannot be both specified') + + if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): + raise ValueError('The output file must be a pkl file.') + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # import modules from plguin/xx, registry will be updated + # if hasattr(cfg, 'plugin'): + # if cfg.plugin: + # import importlib + # if hasattr(cfg, 'plugin_dir'): + # plugin_dir = cfg.plugin_dir + # _module_dir = os.path.dirname(plugin_dir) + # _module_dir = _module_dir.split('/') + # _module_path = _module_dir[0] + + # for m in _module_dir[1:]: + # _module_path = _module_path + '.' + m + # print(_module_path) + # plg_lib = importlib.import_module(_module_path) + # else: + # # import dir is the dirpath for the config file + # _module_dir = os.path.dirname(args.config) + # _module_dir = _module_dir.split('/') + # _module_path = _module_dir[0] + # for m in _module_dir[1:]: + # _module_path = _module_path + '.' 
+ m + # print(_module_path) + # plg_lib = importlib.import_module(_module_path) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + + if cfg.get('close_tf32', False): + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + + cfg.model.pretrained = None + # in case the test dataset is concatenated + samples_per_gpu = 1 + if isinstance(cfg.data.test, dict): + cfg.data.test.test_mode = True + samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) + if samples_per_gpu > 1: + # Replace 'ImageToTensor' to 'DefaultFormatBundle' + cfg.data.test.pipeline = replace_ImageToTensor( + cfg.data.test.pipeline) + elif isinstance(cfg.data.test, list): + for ds_cfg in cfg.data.test: + ds_cfg.test_mode = True + samples_per_gpu = max( + [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) + if samples_per_gpu > 1: + for ds_cfg in cfg.data.test: + ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) + + # init distributed env first, since logger depends on the dist info. + if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + + # set random seeds + if args.seed is not None: + set_random_seed(args.seed, deterministic=args.deterministic) + + # build the dataloader + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader( + dataset, + samples_per_gpu=samples_per_gpu, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False, + nonshuffler_sampler=cfg.data.nonshuffler_sampler, + ) + + # build the model and load checkpoint + cfg.model.train_cfg = None + model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) + fp16_cfg = cfg.get('fp16', None) + if fp16_cfg is not None: + wrap_fp16_model(model) + checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') + if args.fuse_conv_bn: + model = fuse_conv_bn(model) + # old versions did not save class info in checkpoints, this walkaround is + # for backward compatibility + if 'CLASSES' in checkpoint.get('meta', {}): + model.CLASSES = checkpoint['meta']['CLASSES'] + else: + model.CLASSES = dataset.CLASSES + # palette for visualization in segmentation tasks + if 'PALETTE' in checkpoint.get('meta', {}): + model.PALETTE = checkpoint['meta']['PALETTE'] + elif hasattr(dataset, 'PALETTE'): + # segmentation dataset has `PALETTE` attribute + model.PALETTE = dataset.PALETTE + + if not distributed: + model = DataParallel(model, device_ids=[0]) + outputs = single_gpu_test(model, data_loader) + else: + model = DistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False) + outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, + args.gpu_collect) + + + + rank, _ = get_dist_info() + if rank == 0: + if args.out: + print(f'\nwriting results to {args.out}') + kwargs = {} if args.eval_options is None else args.eval_options + kwargs['jsonfile_prefix'] = osp.join('test', args.config.split( + '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) + if args.format_only: + dataset.format_results(outputs, **kwargs) + + if args.eval: + eval_kwargs = cfg.get('evaluation', {}).copy() + # hard-code way to remove EvalHook args + for key in [ + 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', + 'rule' + ]: + eval_kwargs.pop(key, None) + eval_kwargs.update(dict(metric=args.eval, **kwargs)) + + print(dataset.evaluate(outputs['bbox_results'], **eval_kwargs)) + + # # # NOTE: record 
to json + # json_path = args.json_dir + # if not os.path.exists(json_path): + # os.makedirs(json_path) + + # metric_all = [] + # for res in outputs['bbox_results']: + # for k in res['metric_results'].keys(): + # if type(res['metric_results'][k]) is np.ndarray: + # res['metric_results'][k] = res['metric_results'][k].tolist() + # metric_all.append(res['metric_results']) + + # print('start saving to json done') + # with open(json_path+'/metric_record.json', "w", encoding="utf-8") as f2: + # json.dump(metric_all, f2, indent=4) + # print('save to json done') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/adzoo/vad/train.py b/adzoo/vad/train.py new file mode 100644 index 0000000..d880d4f --- /dev/null +++ b/adzoo/vad/train.py @@ -0,0 +1,237 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from __future__ import division + +import argparse +import copy +import mmcv +import os +import time +import torch +import warnings +from mmcv import Config, DictAction +from mmcv.utils import get_dist_info, init_dist +from os import path as osp + + +from mmcv.datasets import build_dataset +from mmcv.models import build_model +from mmcv.utils import collect_env, get_root_logger +from mmcv.utils import set_random_seed + +from mmcv.utils import TORCH_VERSION, digit_version +from adzoo.bevformer.mmdet3d_plugin.bevformer.apis.train import custom_train_model + +import cv2 +cv2.setNumThreads(1) + +import sys +sys.path.append('') + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a detector') + parser.add_argument('config', help='train config file path') + parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume-from', help='the checkpoint file to resume from') + parser.add_argument( + '--no-validate', + action='store_true', + help='whether not to evaluate the checkpoint during training') + group_gpus = parser.add_mutually_exclusive_group() + group_gpus.add_argument( + '--gpus', + type=int, + help='number of gpus to use ' + '(only applicable to non-distributed training)') + group_gpus.add_argument( + '--gpu-ids', + type=int, + nargs='+', + help='ids of gpus to use ' + '(only applicable to non-distributed training)') + parser.add_argument('--seed', type=int, default=0, help='random seed') + parser.add_argument( + '--deterministic', + action='store_true', + help='whether to set deterministic options for CUDNN backend.') + parser.add_argument( + '--options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file (deprecate), ' + 'change to --cfg-options instead.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + parser.add_argument( + '--launcher', + choices=['none', 'pytorch', 'slurm', 'mpi'], + default='none', + help='job launcher') + parser.add_argument('--local-rank', type=int, default=0) + parser.add_argument( + '--autoscale-lr', + action='store_true', + help='automatically scale lr with the number of gpus') + args = parser.parse_args() + if 'LOCAL_RANK' not in os.environ: + os.environ['LOCAL_RANK'] = str(args.local_rank) + + if args.options and args.cfg_options: + raise ValueError( + '--options and --cfg-options cannot be both specified, ' + '--options is deprecated in favor of --cfg-options') + if args.options: + warnings.warn('--options is deprecated in favor of --cfg-options') + args.cfg_options = args.options + + return args + + +def main(): + args = parse_args() + + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + # import modules from string list. + if cfg.get('custom_imports', None): + from mmcv.utils import import_modules_from_strings + import_modules_from_strings(**cfg['custom_imports']) + + # set cudnn_benchmark + if cfg.get('cudnn_benchmark', False): + torch.backends.cudnn.benchmark = True + # set tf32 + if cfg.get('close_tf32', False): + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + # work_dir is determined in this priority: CLI > segment in file > filename + if args.work_dir is not None: + # update configs according to CLI args if args.work_dir is not None + cfg.work_dir = args.work_dir + elif cfg.get('work_dir', None) is None: + # use config filename as default work_dir if cfg.work_dir is None + cfg.work_dir = osp.join('./work_dirs', + osp.splitext(osp.basename(args.config))[0]) + # if args.resume_from is not None: + if args.resume_from is not None and osp.isfile(args.resume_from): + cfg.resume_from = args.resume_from + if args.gpu_ids is not None: + cfg.gpu_ids = args.gpu_ids + else: + cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) + if digit_version(TORCH_VERSION) == digit_version('1.8.1') and cfg.optimizer['type'] == 'AdamW': + cfg.optimizer['type'] = 'AdamW2' # fix bug in Adamw + if args.autoscale_lr: + # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) + cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 + + # init distributed env first, since logger depends on the dist info. 
+ if args.launcher == 'none': + distributed = False + else: + distributed = True + init_dist(args.launcher, **cfg.dist_params) + # re-set gpu_ids with distributed training mode + _, world_size = get_dist_info() + cfg.gpu_ids = range(world_size) + + # create work_dir + mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) + # dump config + cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) + # init the logger before other steps + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) + log_file = osp.join(cfg.work_dir, f'{timestamp}.log') + # specify logger name, if we still use 'mmdet', the output info will be + # filtered and won't be saved in the log_file + # TODO: ugly workaround to judge whether we are training det or seg model + if cfg.model.type in ['EncoderDecoder3D']: + logger_name = 'mmseg' + else: + logger_name = 'mmdet' + logger = get_root_logger( + log_file=log_file, log_level=cfg.log_level, name=logger_name) + + # init the meta dict to record some important information such as + # environment info and seed, which will be logged + meta = dict() + # log env info + env_info_dict = collect_env() + env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) + dash_line = '-' * 60 + '\n' + logger.info('Environment info:\n' + dash_line + env_info + '\n' + + dash_line) + meta['env_info'] = env_info + meta['config'] = cfg.pretty_text + + # log some basic info + logger.info(f'Distributed training: {distributed}') + logger.info(f'Config:\n{cfg.pretty_text}') + + # set random seeds + if args.seed is not None: + logger.info(f'Set random seed to {args.seed}, ' + f'deterministic: {args.deterministic}') + set_random_seed(args.seed, deterministic=args.deterministic) + cfg.seed = args.seed + meta['seed'] = args.seed + meta['exp_name'] = osp.basename(args.config) + + model = build_model( + cfg.model, + train_cfg=cfg.get('train_cfg'), + test_cfg=cfg.get('test_cfg')) + model.init_weights() + + logger.info(f'Model:\n{model}') + datasets = [build_dataset(cfg.data.train)] + if len(cfg.workflow) == 2: + val_dataset = copy.deepcopy(cfg.data.val) + # in case we use a dataset wrapper + if 'dataset' in cfg.data.train: + val_dataset.pipeline = cfg.data.train.dataset.pipeline + else: + val_dataset.pipeline = cfg.data.train.pipeline + # set test_mode=False here in deep copied config + # which do not affect AP/AR calculation later + # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa + val_dataset.test_mode = False + datasets.append(build_dataset(val_dataset)) + if cfg.checkpoint_config is not None: + # save mmdet version, config file content and class names in + # checkpoints as meta data + cfg.checkpoint_config.meta = dict( + config=cfg.pretty_text, + CLASSES=datasets[0].CLASSES, + PALETTE=datasets[0].PALETTE # for segmentors + if hasattr(datasets[0], 'PALETTE') else None) + # add an attribute for visualization convenience + model.CLASSES = datasets[0].CLASSES + custom_train_model( + model, + datasets, + cfg, + distributed=distributed, + validate=(not args.no_validate), + timestamp=timestamp, + meta=meta) + + +if __name__ == '__main__': + main() diff --git a/asserts/bench2drive.jpg b/asserts/bench2drive.jpg new file mode 100644 index 0000000..2a8aad2 Binary files /dev/null and b/asserts/bench2drive.jpg differ diff --git a/asserts/bench2drivezoo.png b/asserts/bench2drivezoo.png new file mode 100644 index 0000000..f8b1b2a Binary files /dev/null and b/asserts/bench2drivezoo.png differ diff --git a/clear.py b/clear.py new file 
mode 100644 index 0000000..c12b90f --- /dev/null +++ b/clear.py @@ -0,0 +1,55 @@ +import os +import ast +from concurrent.futures import ThreadPoolExecutor, as_completed + +def find_py_files(root_dir): + py_files = [] + for root, dirs, files in os.walk(root_dir): + for file in files: + if file.endswith('.py'): + py_files.append(os.path.join(root, file)) + return py_files + +def analyze_file(file_path): + with open(file_path, "r", encoding='utf-8') as file: + file_content = file.read() + tree = ast.parse(file_content) + + classes = [node.name for node in ast.walk(tree) if isinstance(node, ast.ClassDef)] + functions = [node.name for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)] + return file_path, classes, functions + +def analyze_projects(root_dir): + py_files = find_py_files(root_dir) + results = [] + + with ThreadPoolExecutor(max_workers=4) as executor: + future_to_file = {executor.submit(analyze_file, file_path): file_path for file_path in py_files} + for future in as_completed(future_to_file): + file_path = future_to_file[future] + try: + results.append(future.result()) + except Exception as exc: + print(f'{file_path} generated an exception: {exc}') + return results + +root_dir = "./" +results = analyze_projects(root_dir) + +result_dict = {} + +for file_path, classes, functions in results: + print(f"File: {file_path}") + print(f"Classes: {classes}") + print(f"Functions: {functions}") + result_dict[file_path] = {} + # store the discovered names under string keys plus a simple per-file count + result_dict[file_path]['classes'] = classes + result_dict[file_path]['functions'] = functions + result_dict[file_path]['count'] = len(classes) + len(functions) + +import json + +out_file = open("myfile.json", "w") +json.dump(result_dict, out_file, indent = 4) +out_file.close() \ No newline at end of file diff --git a/data/others/b2d_motion_anchor_infos_mode6.pkl b/data/others/b2d_motion_anchor_infos_mode6.pkl new file mode 100644 index 0000000..82465a7 Binary files /dev/null and b/data/others/b2d_motion_anchor_infos_mode6.pkl differ diff --git a/data/splits/bench2drive_base_train_val_split.json b/data/splits/bench2drive_base_train_val_split.json new file mode 100644 index 0000000..855d8ed --- /dev/null +++ b/data/splits/bench2drive_base_train_val_split.json @@ -0,0 +1 @@ +{"train": ["v1/PedestrianCrossing_Town13_Route638_Weather14", "v1/BlockedIntersection_Town13_Route616_Weather18", "v1/LaneChange_Town13_Route725_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route603_Weather15", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route507_Weather13", "v1/ParkedObstacleTwoWays_Town12_Route1166_Weather22", "v1/StaticCutIn_Town15_Route429_Weather13", "v1/ParkingCutIn_Town12_Route1301_Weather14", "v1/StaticCutIn_Town03_Route158_Weather2", "v1/YieldToEmergencyVehicle_Town04_Route207_Weather25", "v1/Accident_Town12_Route956_Weather20", "v1/ParkingCutIn_Town12_Route1313_Weather3", "v1/HighwayExit_Town12_Route1324_Weather0", "v1/CrossingBicycleFlow_Town12_Route1050_Weather10", "v1/ParkingCutIn_Town12_Route762_Weather8", "v1/ParkedObstacle_Town15_Route415_Weather25", "v1/BlockedIntersection_Town15_Route486_Weather18", "v1/TJunction_Town13_Route655_Weather5", "v1/ParkedObstacleTwoWays_Town12_Route1167_Weather23", "v1/ParkingCutIn_Town13_Route1349_Weather10", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route885_Weather14", "v1/OppositeVehicleRunningRedLight_Town12_Route809_Weather3", "v1/ParkedObstacle_Town05_Route262_Weather1", "v1/VehicleTurningRoutePedestrian_Town13_Route680_Weather10", "v1/HazardAtSideLaneTwoWays_Town12_Route1128_Weather10",
"v1/NonSignalizedJunctionLeftTurn_Town03_Route153_Weather26", "v1/SignalizedJunctionLeftTurn_Town10HD_Route380_Weather21", "v1/HighwayExit_Town06_Route292_Weather14", "v1/VanillaSignalizedTurnEncounterGreenLight_Town04_Route196_Weather14", "v1/HazardAtSideLane_Town12_Route1512_Weather7", "v1/ParkingCrossingPedestrian_Town12_Route758_Weather3", "v1/HazardAtSideLaneTwoWays_Town12_Route1140_Weather22", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route511_Weather9", "v1/HazardAtSideLaneTwoWays_Town12_Route1129_Weather11", "v1/VehicleTurningRoute_Town13_Route679_Weather3", "v1/InvadingTurn_Town13_Route575_Weather3", "v1/OppositeVehicleRunningRedLight_Town15_Route440_Weather23", "v1/NonSignalizedJunctionRightTurn_Town12_Route1024_Weather10", "v1/HazardAtSideLaneTwoWays_Town12_Route1133_Weather15", "v1/MergerIntoSlowTrafficV2_Town12_Route941_Weather5", "v1/VehicleTurningRoutePedestrian_Town12_Route826_Weather20", "v1/ConstructionObstacle_Town03_Route61_Weather9", "v1/ConstructionObstacleTwoWays_Town12_Route1098_Weather6", "v1/MergerIntoSlowTrafficV2_Town12_Route858_Weather0", "v1/HardBreakRoute_Town02_Route34_Weather8", "v1/LaneChange_Town13_Route743_Weather3", "v1/NonSignalizedJunctionLeftTurn_Town12_Route812_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route874_Weather8", "v1/YieldToEmergencyVehicle_Town13_Route560_Weather14", "v1/ParkingExit_Town13_Route567_Weather21", "v1/HazardAtSideLane_Town15_Route420_Weather3", "v1/ParkedObstacleTwoWays_Town13_Route1334_Weather26", "v1/HighwayExit_Town12_Route841_Weather9", "v1/TJunction_Town12_Route926_Weather8", "v1/HighwayExit_Town12_Route1000_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town10HD_Route388_Weather23", "v1/HighwayExit_Town13_Route705_Weather3", "v1/StaticCutIn_Town15_Route427_Weather11", "v1/StaticCutIn_Town13_Route563_Weather9", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route646_Weather22", "v1/AccidentTwoWays_Town12_Route1114_Weather22", "v1/OppositeVehicleRunningRedLight_Town05_Route268_Weather8", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route887_Weather3", "v1/HazardAtSideLane_Town12_Route1535_Weather10", "v1/ParkingCutIn_Town12_Route1304_Weather9", "v1/HighwayExit_Town12_Route937_Weather1", "v1/AccidentTwoWays_Town12_Route1110_Weather18", "v1/InterurbanAdvancedActorFlow_Town13_Route686_Weather10", "v1/HazardAtSideLane_Town03_Route105_Weather22", "v1/ParkingCrossingPedestrian_Town12_Route760_Weather6", "v1/OppositeVehicleTakingPriority_Town12_Route995_Weather7", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route661_Weather11", "v1/HardBreakRoute_Town06_Route46_Weather20", "v1/OppositeVehicleTakingPriority_Town04_Route189_Weather7", "v1/BlockedIntersection_Town07_Route352_Weather14", "v1/ConstructionObstacleTwoWays_Town12_Route1404_Weather26", "v1/AccidentTwoWays_Town12_Route1446_Weather2", "v1/ParkedObstacle_Town03_Route147_Weather0", "v1/HazardAtSideLaneTwoWays_Town12_Route1146_Weather2", "v1/VanillaSignalizedTurnEncounterGreenLight_Town04_Route197_Weather15", "v1/AccidentTwoWays_Town12_Route1456_Weather15", "v1/ParkingCutIn_Town12_Route955_Weather19", "v1/ParkedObstacle_Town13_Route553_Weather11", "v1/VanillaSignalizedTurnEncounterRedLight_Town03_Route141_Weather11", "v1/NonSignalizedJunctionRightTurn_Town12_Route817_Weather11", "v1/OppositeVehicleTakingPriority_Town03_Route128_Weather23", "v1/HighwayExit_Town13_Route749_Weather21", "v1/VanillaSignalizedTurnEncounterRedLight_Town03_Route142_Weather12", "v1/TJunction_Town07_Route364_Weather0", 
"v1/HardBreakRoute_Town07_Route47_Weather21", "v1/CrossingBicycleFlow_Town12_Route1062_Weather22", "v1/HazardAtSideLane_Town12_Route1527_Weather25", "v1/LaneChange_Town12_Route756_Weather2", "v1/VehicleTurningRoutePedestrian_Town13_Route703_Weather1", "v1/OppositeVehicleRunningRedLight_Town04_Route178_Weather22", "v1/OppositeVehicleTakingPriority_Town12_Route820_Weather14", "v1/Accident_Town12_Route769_Weather15", "v1/AccidentTwoWays_Town12_Route1469_Weather3", "v1/MergerIntoSlowTrafficV2_Town12_Route1010_Weather22", "v1/NonSignalizedJunctionLeftTurn_Town12_Route966_Weather3", "v1/TJunction_Town12_Route883_Weather25", "v1/OppositeVehicleRunningRedLight_Town12_Route807_Weather1", "v1/OppositeVehicleTakingPriority_Town12_Route994_Weather6", "v1/CrossingBicycleFlow_Town12_Route1077_Weather11", "v1/InvadingTurn_Town02_Route99_Weather21", "v1/YieldToEmergencyVehicle_Town12_Route917_Weather7", "v1/PedestrianCrossing_Town13_Route718_Weather8", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route640_Weather8", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route536_Weather8", "v1/HazardAtSideLane_Town15_Route421_Weather5", "v1/TJunction_Town12_Route1018_Weather3", "v1/ConstructionObstacle_Town05_Route69_Weather9", "v1/HardBreakRoute_Town13_Route1341_Weather26", "v1/NonSignalizedJunctionRightTurn_Town12_Route816_Weather15", "v1/LaneChange_Town12_Route984_Weather22", "v1/YieldToEmergencyVehicle_Town12_Route779_Weather25", "v1/Accident_Town03_Route102_Weather20", "v1/VehicleTurningRoute_Town15_Route480_Weather18", "v1/OppositeVehicleTakingPriority_Town04_Route188_Weather6", "v1/ParkingCutIn_Town13_Route1347_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route1019_Weather5", "v1/HardBreakRoute_Town15_Route59_Weather7", "v1/LaneChange_Town06_Route326_Weather3", "v1/EnterActorFlow_Town13_Route612_Weather14", "v1/DynamicObjectCrossing_Town12_Route21_Weather21", "v1/HardBreakRoute_Town10HD_Route49_Weather23", "v1/StaticCutIn_Town06_Route287_Weather1", "v1/HighwayExit_Town13_Route619_Weather21", "v1/InterurbanAdvancedActorFlow_Town12_Route1048_Weather8", "v1/SignalizedJunctionRightTurn_Town04_Route176_Weather20", "v1/EnterActorFlow_Town07_Route349_Weather11", "v1/CrossingBicycleFlow_Town12_Route860_Weather2", "v1/ConstructionObstacleTwoWays_Town12_Route1419_Weather26", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town03_Route143_Weather13", "v1/ParkedObstacle_Town03_Route103_Weather25", "v1/ParkingExit_Town13_Route569_Weather23", "v1/AccidentTwoWays_Town12_Route1454_Weather13", "v1/NonSignalizedJunctionRightTurn_Town13_Route595_Weather11", "v1/SignalizedJunctionRightTurn_Town07_Route339_Weather1", "v1/HazardAtSideLane_Town12_Route1506_Weather0", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route490_Weather22", "v1/ConstructionObstacle_Town13_Route82_Weather3", "v1/DynamicObjectCrossing_Town02_Route12_Weather12", "v1/HighwayExit_Town12_Route1051_Weather11", "v1/MergerIntoSlowTraffic_Town13_Route627_Weather3", "v1/YieldToEmergencyVehicle_Town05_Route225_Weather9", "v1/ControlLoss_Town15_Route430_Weather14", "v1/ParkingCutIn_Town13_Route546_Weather0", "v1/VanillaSignalizedTurnEncounterRedLight_Town15_Route452_Weather10", "v1/StaticCutIn_Town15_Route428_Weather12", "v1/ParkedObstacleTwoWays_Town12_Route1183_Weather7", "v1/PedestrianCrossing_Town15_Route448_Weather6", "v1/BlockedIntersection_Town04_Route193_Weather11", "v1/InterurbanAdvancedActorFlow_Town06_Route325_Weather13", "v1/ParkedObstacle_Town12_Route771_Weather9", "v1/AccidentTwoWays_Town12_Route1104_Weather12", 
"v1/VanillaSignalizedTurnEncounterRedLight_Town03_Route140_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route493_Weather25", "v1/TJunction_Town12_Route1201_Weather5", "v1/ParkedObstacle_Town04_Route162_Weather6", "v1/ParkingCutIn_Town13_Route670_Weather20", "v1/VehicleTurningRoute_Town13_Route605_Weather9", "v1/HazardAtSideLane_Town12_Route960_Weather20", "v1/StaticCutIn_Town04_Route168_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town10HD_Route393_Weather3", "v1/ParkingCutIn_Town12_Route764_Weather10", "v1/InterurbanAdvancedActorFlow_Town12_Route854_Weather22", "v1/HighwayExit_Town12_Route1028_Weather14", "v1/Accident_Town15_Route414_Weather23", "v1/ConstructionObstacleTwoWays_Town12_Route1425_Weather26", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route737_Weather23", "v1/ParkedObstacleTwoWays_Town12_Route1182_Weather12", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1356_Weather7", "v1/NonSignalizedJunctionLeftTurn_Town05_Route238_Weather26", "v1/HighwayCutIn_Town12_Route850_Weather18", "v1/HighwayCutIn_Town06_Route320_Weather8", "v1/HazardAtSideLaneTwoWays_Town12_Route1145_Weather1", "v1/ParkedObstacleTwoWays_Town12_Route1164_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route813_Weather26", "v1/HardBreakRoute_Town02_Route33_Weather7", "v1/ParkingExit_Town12_Route1305_Weather18", "v1/HighwayCutIn_Town12_Route974_Weather12", "v1/NonSignalizedJunctionLeftTurn_Town07_Route344_Weather6", "v1/ConstructionObstacleTwoWays_Town12_Route1084_Weather5", "v1/ParkingCutIn_Town13_Route671_Weather21", "v1/ConstructionObstacle_Town03_Route63_Weather11", "v1/EnterActorFlow_Town05_Route245_Weather9", "v1/VanillaSignalizedTurnEncounterRedLight_Town10HD_Route389_Weather25", "v1/SignalizedJunctionLeftTurn_Town15_Route471_Weather26", "v1/ConstructionObstacleTwoWays_Town12_Route1083_Weather9", "v1/HighwayCutIn_Town13_Route685_Weather9", "v1/HazardAtSideLaneTwoWays_Town12_Route1139_Weather21", "v1/CrossingBicycleFlow_Town12_Route1075_Weather9", "v1/HighwayCutIn_Town12_Route1006_Weather18", "v1/PedestrianCrossing_Town13_Route687_Weather11", "v1/ParkingExit_Town13_Route697_Weather21", "v1/ParkingExit_Town12_Route1309_Weather22", "v1/InterurbanAdvancedActorFlow_Town13_Route715_Weather13", "v1/ParkingCrossingPedestrian_Town12_Route896_Weather12", "v1/HazardAtSideLane_Town05_Route263_Weather3", "v1/TJunction_Town12_Route980_Weather18", "v1/OppositeVehicleTakingPriority_Town12_Route1025_Weather11", "v1/EnterActorFlow_Town13_Route614_Weather8", "v1/ParkingCrossingPedestrian_Town13_Route668_Weather18", "v1/ParkingExit_Town13_Route731_Weather3", "v1/ParkingCrossingPedestrian_Town13_Route669_Weather19", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route723_Weather21", "v1/HazardAtSideLane_Town12_Route1519_Weather8", "v1/HighwayCutIn_Town13_Route631_Weather7", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route719_Weather18", "v1/ConstructionObstacleTwoWays_Town12_Route1421_Weather26", "v1/VehicleTurningRoutePedestrian_Town12_Route999_Weather11", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route1022_Weather8", "v1/ControlLoss_Town15_Route432_Weather8", "v1/CrossingBicycleFlow_Town12_Route1063_Weather23", "v1/ParkingExit_Town12_Route1308_Weather21", "v1/Accident_Town13_Route552_Weather6", "v1/EnterActorFlow_Town13_Route613_Weather15", "v1/LaneChange_Town13_Route726_Weather21", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1633_Weather12", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1629_Weather6", 
"v1/NonSignalizedJunctionRightTurn_Town12_Route967_Weather25", "v1/CrossingBicycleFlow_Town12_Route1078_Weather12", "v1/SignalizedJunctionLeftTurn_Town13_Route581_Weather26", "v1/VehicleTurningRoute_Town12_Route1026_Weather12", "v1/ParkedObstacleTwoWays_Town12_Route1181_Weather11", "v1/VehicleTurningRoute_Town15_Route1370_Weather7", "v1/ParkingCrossingPedestrian_Town12_Route953_Weather9", "v1/VehicleTurningRoutePedestrian_Town13_Route702_Weather14", "v1/ParkingCutIn_Town13_Route547_Weather1", "v1/HardBreakRoute_Town07_Route48_Weather22", "v1/ConstructionObstacle_Town13_Route81_Weather3", "v1/YieldToEmergencyVehicle_Town12_Route778_Weather14", "v1/DynamicObjectCrossing_Town01_Route2_Weather2", "v1/HazardAtSideLaneTwoWays_Town12_Route1141_Weather23", "v1/HighwayCutIn_Town13_Route745_Weather9", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route658_Weather8", "v1/ConstructionObstacleTwoWays_Town12_Route1085_Weather6", "v1/DynamicObjectCrossing_Town15_Route29_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town04_Route202_Weather20", "v1/ParkingCutIn_Town12_Route1312_Weather2", "v1/ParkedObstacle_Town05_Route221_Weather13", "v1/ParkingExit_Town12_Route788_Weather8", "v1/HazardAtSideLane_Town12_Route1526_Weather23", "v1/AccidentTwoWays_Town12_Route1106_Weather14", "v1/BlockedIntersection_Town12_Route936_Weather0", "v1/AccidentTwoWays_Town12_Route1459_Weather18", "v1/ParkingCutIn_Town13_Route549_Weather3", "v1/HighwayCutIn_Town12_Route940_Weather3", "v1/HardBreakRoute_Town05_Route43_Weather9", "v1/InvadingTurn_Town13_Route578_Weather6", "v1/VehicleOpensDoorTwoWays_Town12_Route1197_Weather1", "v1/VehicleTurningRoutePedestrian_Town13_Route610_Weather12", "v1/HighwayCutIn_Town13_Route628_Weather3", "v1/ParkingExit_Town12_Route920_Weather10", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1627_Weather3", "v1/ParkingCrossingPedestrian_Town12_Route759_Weather5", "v1/HardBreakRoute_Town03_Route38_Weather12", "v1/SignalizedJunctionLeftTurn_Town15_Route437_Weather26", "v1/HazardAtSideLaneTwoWays_Town12_Route1152_Weather8", "v1/ConstructionObstacleTwoWays_Town12_Route1424_Weather26", "v1/ConstructionObstacleTwoWays_Town12_Route1095_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town03_Route145_Weather15", "v1/VehicleTurningRoute_Town15_Route1380_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town07_Route355_Weather9", "v1/ControlLoss_Town15_Route433_Weather9", "v1/NonSignalizedJunctionLeftTurn_Town13_Route594_Weather26", "v1/Accident_Town12_Route766_Weather12", "v1/SignalizedJunctionLeftTurn_Town13_Route580_Weather26", "v1/HighwayCutIn_Town06_Route300_Weather14", "v1/HazardAtSideLane_Town12_Route1515_Weather12", "v1/StaticCutIn_Town03_Route109_Weather1", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route1034_Weather20", "v1/HardBreakRoute_Town04_Route39_Weather13", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1636_Weather15", "v1/OppositeVehicleRunningRedLight_Town03_Route119_Weather12", "v1/OppositeVehicleRunningRedLight_Town12_Route991_Weather3", "v1/VehicleTurningRoute_Town13_Route700_Weather23", "v1/HazardAtSideLaneTwoWays_Town12_Route1135_Weather9", "v1/StaticCutIn_Town12_Route783_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route946_Weather10", "v1/AccidentTwoWays_Town12_Route1445_Weather1", "v1/BlockedIntersection_Town04_Route195_Weather13", "v1/ParkingCutIn_Town12_Route903_Weather20", "v1/HardBreakRoute_Town05_Route44_Weather18", "v1/VehicleTurningRoute_Town12_Route825_Weather21", "v1/HazardAtSideLane_Town12_Route775_Weather12", 
"v1/ParkingCutIn_Town13_Route548_Weather2", "v1/CrossingBicycleFlow_Town12_Route863_Weather5", "v1/HazardAtSideLane_Town06_Route283_Weather23", "v1/BlockedIntersection_Town05_Route248_Weather14", "v1/BlockedIntersection_Town07_Route351_Weather13", "v1/AccidentTwoWays_Town12_Route1444_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town05_Route240_Weather26", "v1/YieldToEmergencyVehicle_Town12_Route781_Weather8", "v1/HazardAtSideLane_Town13_Route558_Weather12", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route877_Weather19", "v1/NonSignalizedJunctionRightTurn_Town12_Route931_Weather21", "v1/ParkedObstacle_Town12_Route773_Weather19", "v1/HighwayCutIn_Town12_Route1042_Weather2", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route1020_Weather6", "v1/SignalizedJunctionRightTurn_Town12_Route804_Weather5", "v1/CrossingBicycleFlow_Town12_Route1066_Weather0", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route451_Weather9", "v1/ParkedObstacleTwoWays_Town12_Route1172_Weather2", "v1/StaticCutIn_Town05_Route227_Weather2", "v1/YieldToEmergencyVehicle_Town13_Route675_Weather25", "v1/DynamicObjectCrossing_Town12_Route22_Weather22", "v1/NonSignalizedJunctionRightTurn_Town13_Route596_Weather23", "v1/ParkedObstacle_Town10HD_Route371_Weather7", "v1/HazardAtSideLaneTwoWays_Town12_Route1155_Weather11", "v1/VehicleTurningRoute_Town15_Route519_Weather25", "v1/Accident_Town06_Route280_Weather11", "v1/AccidentTwoWays_Town12_Route1468_Weather2", "v1/InterurbanAdvancedActorFlow_Town06_Route330_Weather18", "v1/PedestrianCrossing_Town12_Route865_Weather7", "v1/ParkingCutIn_Town13_Route1348_Weather7", "v1/TJunction_Town13_Route654_Weather3", "v1/VehicleTurningRoutePedestrian_Town12_Route1040_Weather0", "v1/InterurbanAdvancedActorFlow_Town13_Route753_Weather25", "v1/HazardAtSideLane_Town12_Route1507_Weather1", "v1/OppositeVehicleRunningRedLight_Town03_Route120_Weather8", "v1/StaticCutIn_Town03_Route149_Weather19", "v1/OppositeVehicleRunningRedLight_Town13_Route587_Weather15", "v1/OppositeVehicleTakingPriority_Town12_Route932_Weather22", "v1/VanillaSignalizedTurnEncounterGreenLight_Town07_Route356_Weather18", "v1/PedestrianCrossing_Town13_Route637_Weather13", "v1/ParkedObstacle_Town13_Route555_Weather9", "v1/StaticCutIn_Town04_Route208_Weather0", "v1/LaneChange_Town06_Route277_Weather9", "v1/YieldToEmergencyVehicle_Town04_Route165_Weather7", "v1/EnterActorFlow_Town05_Route271_Weather11", "v1/ParkingCrossingPedestrian_Town13_Route728_Weather0", "v1/InvadingTurn_Town15_Route436_Weather20", "v1/AccidentTwoWays_Town12_Route1458_Weather9", "v1/ParkingExit_Town12_Route786_Weather6", "v1/CrossingBicycleFlow_Town12_Route1065_Weather25", "v1/ParkedObstacleTwoWays_Town12_Route1163_Weather19", "v1/OppositeVehicleRunningRedLight_Town15_Route475_Weather7", "v1/HighwayCutIn_Town06_Route321_Weather9", "v1/ParkingCrossingPedestrian_Town13_Route727_Weather25", "v1/TJunction_Town12_Route1017_Weather3", "v1/ParkingCutIn_Town12_Route1300_Weather13", "v1/HighwayExit_Town13_Route622_Weather23", "v1/Accident_Town12_Route1108_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town13_Route592_Weather26", "v1/DynamicObjectCrossing_Town01_Route8_Weather3", "v1/AccidentTwoWays_Town12_Route1119_Weather1", "v1/HardBreakRoute_Town12_Route53_Weather1", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route529_Weather9", "v1/VehicleTurningRoute_Town15_Route1376_Weather15", "v1/HazardAtSideLane_Town12_Route1537_Weather12", "v1/HazardAtSideLane_Town12_Route915_Weather7", "v1/ParkingExit_Town13_Route568_Weather22", 
"v1/DynamicObjectCrossing_Town15_Route28_Weather2", "v1/TJunction_Town02_Route97_Weather19", "v1/MergerIntoSlowTrafficV2_Town12_Route1009_Weather21", "v1/TJunction_Town01_Route90_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town04_Route201_Weather19", "v1/ParkingCrossingPedestrian_Town12_Route952_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1358_Weather11", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1631_Weather10", "v1/AccidentTwoWays_Town12_Route1121_Weather3", "v1/HardBreakRoute_Town05_Route42_Weather8", "v1/SignalizedJunctionLeftTurn_Town07_Route336_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town05_Route254_Weather20", "v1/ParkedObstacle_Town11_Route395_Weather5", "v1/ParkedObstacleTwoWays_Town12_Route1175_Weather5", "v1/InvadingTurn_Town12_Route796_Weather8", "v1/HazardAtSideLane_Town05_Route222_Weather0", "v1/MergerIntoSlowTrafficV2_Town12_Route1043_Weather3", "v1/EnterActorFlow_Town12_Route832_Weather0", "v1/AccidentTwoWays_Town12_Route1126_Weather8", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route531_Weather11", "v1/TJunction_Town13_Route691_Weather15", "v1/TJunction_Town01_Route91_Weather13", "v1/ParkedObstacleTwoWays_Town12_Route1161_Weather9", "v1/ParkedObstacle_Town06_Route328_Weather8", "v1/Accident_Town06_Route327_Weather15", "v1/ControlLoss_Town13_Route574_Weather19", "v1/HardBreakRoute_Town15_Route58_Weather6", "v1/HighwayExit_Town13_Route621_Weather23", "v1/NonSignalizedJunctionLeftTurn_Town05_Route239_Weather26", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town04_Route203_Weather21", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route530_Weather10", "v1/MergerIntoSlowTrafficV2_Town12_Route1055_Weather15", "v1/ConstructionObstacle_Town12_Route79_Weather1", "v1/BlockedIntersection_Town03_Route136_Weather6", "v1/ParkingExit_Town13_Route677_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town04_Route198_Weather8", "v1/TJunction_Town15_Route457_Weather15", "v1/OppositeVehicleTakingPriority_Town15_Route477_Weather9", "v1/VehicleTurningRoute_Town15_Route479_Weather11", "v1/ParkingCutIn_Town13_Route1346_Weather5", "v1/SignalizedJunctionLeftTurn_Town05_Route234_Weather7", "v1/AccidentTwoWays_Town12_Route1109_Weather9", "v1/ConstructionObstacleTwoWays_Town12_Route1094_Weather2", "v1/CrossingBicycleFlow_Town12_Route1072_Weather6", "v1/VanillaSignalizedTurnEncounterRedLight_Town15_Route492_Weather23", "v1/ConstructionObstacleTwoWays_Town12_Route1100_Weather14", "v1/NonSignalizedJunctionLeftTurn_Town04_Route182_Weather26", "v1/ParkingCrossingPedestrian_Town12_Route898_Weather14", "v1/ParkingExit_Town12_Route1316_Weather0", "v1/StaticCutIn_Town13_Route565_Weather8", "v1/SignalizedJunctionRightTurn_Town12_Route803_Weather23", "v1/HazardAtSideLane_Town12_Route1538_Weather13", "v1/MergerIntoSlowTraffic_Town13_Route626_Weather2", "v1/StaticCutIn_Town12_Route785_Weather5", "v1/HazardAtSideLaneTwoWays_Town12_Route1138_Weather20", "v1/HazardAtSideLane_Town12_Route1536_Weather11", "v1/OppositeVehicleTakingPriority_Town13_Route600_Weather2", "v1/HazardAtSideLane_Town12_Route1530_Weather2", "v1/OppositeVehicleTakingPriority_Town13_Route601_Weather3", "v1/HardBreakRoute_Town13_Route1337_Weather26", "v1/LaneChange_Town12_Route894_Weather10", "v1/HazardAtSideLane_Town06_Route329_Weather9", "v1/ParkingExit_Town13_Route732_Weather3", "v1/VehicleTurningRoute_Town15_Route1367_Weather3", "v1/HighwayExit_Town13_Route748_Weather20", "v1/Accident_Town03_Route101_Weather23", 
"v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route1054_Weather14", "v1/ParkingCutIn_Town12_Route1314_Weather5", "v1/HighwayCutIn_Town06_Route322_Weather10", "v1/StaticCutIn_Town13_Route566_Weather20", "v1/AccidentTwoWays_Town12_Route1463_Weather22", "v1/CrossingBicycleFlow_Town12_Route1011_Weather23", "v1/HazardAtSideLaneTwoWays_Town12_Route1153_Weather9", "v1/ControlLoss_Town07_Route333_Weather21", "v1/TJunction_Town12_Route880_Weather22", "v1/Accident_Town12_Route957_Weather21", "v1/VanillaSignalizedTurnEncounterGreenLight_Town10HD_Route386_Weather22", "v1/AccidentTwoWays_Town12_Route1120_Weather2", "v1/InvadingTurn_Town12_Route925_Weather15", "v1/VehicleTurningRoute_Town12_Route822_Weather18", "v1/VehicleTurningRoutePedestrian_Town13_Route607_Weather19", "v1/HighwayExit_Town06_Route313_Weather1", "v1/Accident_Town04_Route205_Weather23", "v1/HazardAtSideLane_Town12_Route1508_Weather2", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route1035_Weather21", "v1/HazardAtSideLane_Town12_Route1533_Weather6", "v1/NonSignalizedJunctionLeftTurn_Town03_Route122_Weather26", "v1/PedestrianCrossing_Town15_Route526_Weather6", "v1/VanillaSignalizedTurnEncounterRedLight_Town04_Route199_Weather9", "v1/MergerIntoSlowTraffic_Town12_Route1003_Weather8", "v1/VehicleTurningRoute_Town15_Route1374_Weather13", "v1/VehicleTurningRoutePedestrian_Town12_Route970_Weather8", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route888_Weather3", "v1/MergerIntoSlowTrafficV2_Town12_Route1053_Weather13", "v1/EnterActorFlow_Town12_Route830_Weather23", "v1/YieldToEmergencyVehicle_Town13_Route562_Weather15", "v1/AccidentTwoWays_Town12_Route1453_Weather12", "v1/OppositeVehicleRunningRedLight_Town07_Route368_Weather3", "v1/HardBreakRoute_Town06_Route45_Weather19", "v1/HighwayCutIn_Town13_Route630_Weather6", "v1/ParkingExit_Town12_Route923_Weather13", "v1/LaneChange_Town12_Route983_Weather5", "v1/ParkingExit_Town13_Route676_Weather0", "v1/StaticCutIn_Town05_Route275_Weather15", "v1/InvadingTurn_Town04_Route217_Weather9", "v1/AccidentTwoWays_Town12_Route1455_Weather14", "v1/HighwayExit_Town12_Route1327_Weather3", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route660_Weather10", "v1/TJunction_Town07_Route365_Weather1", "v1/SignalizedJunctionRightTurn_Town12_Route988_Weather0", "v1/AccidentTwoWays_Town12_Route1124_Weather18", "v1/HighwayCutIn_Town13_Route734_Weather6", "v1/VehicleTurningRoute_Town13_Route698_Weather22", "v1/OppositeVehicleTakingPriority_Town05_Route270_Weather6", "v1/TJunction_Town15_Route496_Weather2", "v1/EnterActorFlow_Town07_Route350_Weather12", "v1/ParkingCutIn_Town13_Route1345_Weather3", "v1/HazardAtSideLane_Town12_Route1532_Weather5", "v1/PedestrianCrossing_Town12_Route867_Weather9", "v1/ConstructionObstacle_Town06_Route73_Weather21", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route721_Weather19", "v1/TJunction_Town15_Route495_Weather1", "v1/HighwayExit_Town12_Route1041_Weather1", "v1/ParkedObstacleTwoWays_Town12_Route1178_Weather8", "v1/ParkedObstacleTwoWays_Town12_Route1159_Weather23", "v1/NonSignalizedJunctionLeftTurn_Town03_Route124_Weather26", "v1/HardBreakRoute_Town13_Route1338_Weather26", "v1/PedestrianCrossing_Town12_Route1045_Weather5", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route875_Weather9", "v1/HardBreakRoute_Town13_Route1339_Weather26", "v1/CrossingBicycleFlow_Town12_Route1071_Weather0", "v1/ParkingCrossingPedestrian_Town12_Route897_Weather13", "v1/PedestrianCrossing_Town12_Route943_Weather7", "v1/VehicleTurningRoutePedestrian_Town15_Route482_Weather20", 
"v1/AccidentTwoWays_Town12_Route1461_Weather20", "v1/HazardAtSideLaneTwoWays_Town12_Route1130_Weather12", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route535_Weather15", "v1/ConstructionObstacle_Town05_Route68_Weather8", "v1/StaticCutIn_Town04_Route216_Weather8", "v1/HazardAtSideLaneTwoWays_Town12_Route1154_Weather10", "v1/ParkingExit_Town12_Route1321_Weather6", "v1/HardBreakRoute_Town13_Route54_Weather2", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route512_Weather18", "v1/SignalizedJunctionLeftTurn_Town12_Route1470_Weather5", "v1/SignalizedJunctionRightTurn_Town15_Route473_Weather5", "v1/ConstructionObstacleTwoWays_Town12_Route1414_Weather26", "v1/ParkingExit_Town13_Route678_Weather2", "v1/VehicleTurningRoute_Town15_Route1368_Weather5", "v1/NonSignalizedJunctionRightTurn_Town04_Route185_Weather9", "v1/HighwayCutIn_Town12_Route1005_Weather9", "v1/HardBreakRoute_Town13_Route1340_Weather26", "v1/ParkingCrossingPedestrian_Town12_Route899_Weather15", "v1/InvadingTurn_Town15_Route434_Weather18", "v1/VehicleTurningRoute_Town15_Route443_Weather1", "v1/DynamicObjectCrossing_Town15_Route27_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town10HD_Route392_Weather2", "v1/HardBreakRoute_Town15_Route57_Weather5", "v1/ParkingExit_Town12_Route789_Weather9", "v1/BlockedIntersection_Town13_Route617_Weather19", "v1/TJunction_Town05_Route259_Weather0", "v1/AccidentTwoWays_Town12_Route1112_Weather20", "v1/SignalizedJunctionRightTurn_Town04_Route211_Weather3", "v1/ParkedObstacleTwoWays_Town12_Route1171_Weather1", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route722_Weather20", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route643_Weather19", "v1/HazardAtSideLane_Town12_Route1516_Weather13", "v1/YieldToEmergencyVehicle_Town12_Route918_Weather8", "v1/HazardAtSideLaneTwoWays_Town12_Route1131_Weather13", "v1/VanillaSignalizedTurnEncounterRedLight_Town07_Route358_Weather20", "v1/InvadingTurn_Town05_Route230_Weather22", "v1/ParkingCutIn_Town12_Route765_Weather11", "v1/Accident_Town05_Route219_Weather11", "v1/ParkingCutIn_Town12_Route954_Weather18", "v1/DynamicObjectCrossing_Town01_Route7_Weather7", "v1/OppositeVehicleTakingPriority_Town12_Route1038_Weather23", "v1/PedestrianCrossing_Town12_Route1013_Weather25", "v1/CrossingBicycleFlow_Town12_Route1061_Weather21", "v1/OppositeVehicleRunningRedLight_Town03_Route121_Weather13", "v1/VanillaSignalizedTurnEncounterGreenLight_Town03_Route139_Weather9", "v1/HardBreakRoute_Town04_Route41_Weather15", "v1/HardBreakRoute_Town04_Route40_Weather14", "v1/ParkingCrossingPedestrian_Town15_Route462_Weather20", "v1/ConstructionObstacleTwoWays_Town12_Route1089_Weather23", "v1/TJunction_Town06_Route304_Weather18", "v1/MergerIntoSlowTrafficV2_Town12_Route1060_Weather20", "v1/StaticCutIn_Town15_Route426_Weather10", "v1/HazardAtSideLaneTwoWays_Town12_Route1144_Weather0", "v1/TJunction_Town12_Route881_Weather23", "v1/ParkingCutIn_Town12_Route1303_Weather8", "v1/SignalizedJunctionLeftTurn_Town03_Route114_Weather6", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route690_Weather14", "v1/MergerIntoSlowTraffic_Town12_Route844_Weather12", "v1/OppositeVehicleRunningRedLight_Town13_Route588_Weather8", "v1/HazardAtSideLane_Town12_Route774_Weather20", "v1/BlockedIntersection_Town15_Route485_Weather9", "v1/SignalizedJunctionLeftTurn_Town03_Route150_Weather26", "v1/ConstructionObstacle_Town12_Route76_Weather23", "v1/StaticCutIn_Town06_Route288_Weather2", "v1/MergerIntoSlowTraffic_Town12_Route1004_Weather8", "v1/StaticCutIn_Town05_Route265_Weather5", 
"v1/HardBreakRoute_Town03_Route37_Weather11", "v1/DynamicObjectCrossing_Town01_Route6_Weather6", "v1/CrossingBicycleFlow_Town12_Route1032_Weather18", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route455_Weather13", "v1/ConstructionObstacleTwoWays_Town12_Route1410_Weather26", "v1/ParkedObstacle_Town13_Route554_Weather12", "v1/SignalizedJunctionLeftTurn_Town07_Route366_Weather2", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route689_Weather13", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route724_Weather22", "v1/ParkingCutIn_Town12_Route1311_Weather1", "v1/OppositeVehicleRunningRedLight_Town05_Route235_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town05_Route252_Weather21", "v1/LaneChange_Town12_Route757_Weather10", "v1/VehicleTurningRoute_Town15_Route1377_Weather8", "v1/ParkingCrossingPedestrian_Town15_Route514_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town03_Route123_Weather26", "v1/CrossingBicycleFlow_Town12_Route1076_Weather10", "v1/ParkingCrossingPedestrian_Town15_Route513_Weather19", "v1/ParkedObstacle_Town12_Route772_Weather11", "v1/OppositeVehicleTakingPriority_Town04_Route187_Weather5", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route651_Weather1", "v1/SignalizedJunctionRightTurn_Town07_Route338_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town07_Route362_Weather23", "v1/Accident_Town04_Route159_Weather3", "v1/NonSignalizedJunctionLeftTurn_Town12_Route810_Weather10", "v1/MergerIntoSlowTraffic_Town12_Route845_Weather13", "v1/Accident_Town06_Route279_Weather19", "v1/ParkingCutIn_Town13_Route1344_Weather2", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route644_Weather20", "v1/OppositeVehicleTakingPriority_Town05_Route242_Weather15", "v1/DynamicObjectCrossing_Town01_Route5_Weather2", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1351_Weather1", "v1/HazardAtSideLane_Town12_Route1521_Weather18", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town03_Route144_Weather14", "v1/SignalizedJunctionLeftTurn_Town05_Route267_Weather3", "v1/VehicleOpensDoorTwoWays_Town12_Route1196_Weather0", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route494_Weather0", "v1/ConstructionObstacle_Town13_Route80_Weather2", "v1/ConstructionObstacle_Town03_Route62_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town07_Route361_Weather23", "v1/OppositeVehicleRunningRedLight_Town13_Route590_Weather18", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route533_Weather13", "v1/PedestrianCrossing_Town12_Route1033_Weather20", "v1/Accident_Town13_Route550_Weather3", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route948_Weather9", "v1/LaneChange_Town12_Route892_Weather8", "v1/HardBreakRoute_Town13_Route56_Weather3", "v1/DynamicObjectCrossing_Town13_Route24_Weather23", "v1/DynamicObjectCrossing_Town01_Route1_Weather1", "v1/MergerIntoSlowTrafficV2_Town12_Route859_Weather1", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route645_Weather21", "v1/TJunction_Town12_Route947_Weather11", "v1/NonSignalizedJunctionLeftTurn_Town04_Route181_Weather15", "v1/ConstructionObstacle_Town05_Route70_Weather18", "v1/HazardAtSideLaneTwoWays_Town12_Route1134_Weather8", "v1/SignalizedJunctionLeftTurn_Town04_Route172_Weather8", "v1/ConstructionObstacleTwoWays_Town12_Route1086_Weather20", "v1/ParkedObstacleTwoWays_Town13_Route1336_Weather26", "v1/HardBreakRoute_Town01_Route30_Weather3", "v1/InterurbanAdvancedActorFlow_Town06_Route301_Weather15", "v1/HazardAtSideLane_Town12_Route1524_Weather21", "v1/HighwayExit_Town13_Route683_Weather7", 
"v1/ParkedObstacle_Town12_Route958_Weather22", "v1/HazardAtSideLaneTwoWays_Town12_Route1136_Weather18", "v1/BlockedIntersection_Town05_Route272_Weather12", "v1/VehicleTurningRoutePedestrian_Town15_Route1387_Weather1", "v1/OppositeVehicleTakingPriority_Town05_Route243_Weather9", "v1/HazardAtSideLane_Town12_Route1523_Weather20", "v1/NonSignalizedJunctionRightTurn_Town13_Route598_Weather0", "v1/HighwayCutIn_Town13_Route750_Weather22", "v1/HardBreakRoute_Town11_Route50_Weather23", "v1/ControlLoss_Town07_Route332_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1352_Weather2", "v1/VanillaSignalizedTurnEncounterRedLight_Town04_Route200_Weather18", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route872_Weather14", "v1/HighwayCutIn_Town06_Route299_Weather13", "v1/Accident_Town12_Route768_Weather14", "v1/InvadingTurn_Town05_Route231_Weather23", "v1/DynamicObjectCrossing_Town10HD_Route18_Weather18", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route537_Weather9", "v1/ConstructionObstacleTwoWays_Town12_Route1415_Weather26", "v1/SignalizedJunctionRightTurn_Town13_Route583_Weather11", "v1/BlockedIntersection_Town07_Route353_Weather15", "v1/ParkedObstacle_Town15_Route417_Weather1", "v1/ParkingExit_Town12_Route1318_Weather2", "v1/InvadingTurn_Town05_Route266_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route1630_Weather7", "v1/ConstructionObstacleTwoWays_Town12_Route1091_Weather12", "v1/HazardAtSideLaneTwoWays_Town12_Route1132_Weather14", "v1/BlockedIntersection_Town12_Route834_Weather2", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town05_Route257_Weather23", "v1/VehicleTurningRoute_Town15_Route1379_Weather18", "v1/HazardAtSideLane_Town12_Route1520_Weather9", "v1/DynamicObjectCrossing_Town12_Route23_Weather23", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route460_Weather18", "v1/PedestrianCrossing_Town15_Route506_Weather12", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route969_Weather7", "v1/StaticCutIn_Town03_Route110_Weather6", "v1/ParkingExit_Town12_Route787_Weather7", "v1/VanillaSignalizedTurnEncounterRedLight_Town12_Route945_Weather9", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route499_Weather5", "v1/ParkedObstacle_Town06_Route281_Weather21", "v1/VanillaSignalizedTurnEncounterRedLight_Town05_Route255_Weather21", "v1/TJunction_Town12_Route927_Weather9", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route641_Weather9", "v1/TJunction_Town06_Route305_Weather19", "v1/AccidentTwoWays_Town12_Route1117_Weather25", "v1/InterurbanAdvancedActorFlow_Town06_Route302_Weather21", "v1/ControlLoss_Town11_Route402_Weather12", "v1/EnterActorFlow_Town13_Route681_Weather5", "v1/HazardAtSideLaneTwoWays_Town12_Route1143_Weather25", "v1/ConstructionObstacleTwoWays_Town12_Route1406_Weather26", "v1/ParkedObstacle_Town12_Route959_Weather23", "v1/VehicleTurningRoute_Town15_Route1369_Weather6", "v1/NonSignalizedJunctionRightTurn_Town07_Route345_Weather14", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route868_Weather10", "v1/NonSignalizedJunctionLeftTurn_Town07_Route369_Weather18", "v1/TJunction_Town15_Route510_Weather8", "v1/VehicleTurningRoute_Town13_Route606_Weather18", "v1/DynamicObjectCrossing_Town02_Route14_Weather14", "v1/DynamicObjectCrossing_Town10HD_Route19_Weather19", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route879_Weather21", "v1/SignalizedJunctionRightTurn_Town04_Route177_Weather13", "v1/OppositeVehicleTakingPriority_Town12_Route818_Weather12", "v1/EnterActorFlow_Town12_Route831_Weather25", 
"v1/AccidentTwoWays_Town12_Route1105_Weather13", "v1/HazardAtSideLane_Town03_Route106_Weather23", "v1/HighwayExit_Town12_Route1326_Weather2", "v1/CrossingBicycleFlow_Town12_Route1067_Weather1", "v1/SignalizedJunctionRightTurn_Town12_Route964_Weather2", "v1/SignalizedJunctionRightTurn_Town12_Route805_Weather25", "v1/PedestrianCrossing_Town13_Route736_Weather8", "v1/AccidentTwoWays_Town12_Route1113_Weather21", "v1/ConstructionObstacleTwoWays_Town12_Route1418_Weather26", "v1/HighwayExit_Town13_Route620_Weather22", "v1/Accident_Town15_Route411_Weather21", "v1/HazardAtSideLane_Town12_Route961_Weather25", "v1/LaneChange_Town13_Route664_Weather14", "v1/HazardAtSideLaneTwoWays_Town12_Route1137_Weather19", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route650_Weather0", "v1/HighwayCutIn_Town12_Route1052_Weather12", "v1/HighwayCutIn_Town13_Route751_Weather23", "v1/HighwayCutIn_Town12_Route851_Weather19", "v1/HazardAtSideLaneTwoWays_Town12_Route1147_Weather3", "v1/BlockedIntersection_Town05_Route247_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route944_Weather8", "v1/SignalizedJunctionRightTurn_Town03_Route151_Weather2", "v1/HardBreakRoute_Town12_Route52_Weather0", "v1/HardBreakRoute_Town03_Route36_Weather10", "v1/NonSignalizedJunctionLeftTurn_Town13_Route593_Weather26", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1350_Weather0", "v1/HazardAtSideLane_Town12_Route1518_Weather15", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route659_Weather3", "v1/OppositeVehicleRunningRedLight_Town05_Route236_Weather10", "v1/EnterActorFlow_Town07_Route348_Weather10", "v1/MergerIntoSlowTrafficV2_Town12_Route1031_Weather9", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route454_Weather12", "v1/ConstructionObstacle_Town04_Route67_Weather15", "v1/ParkedObstacle_Town03_Route157_Weather1", "v1/CrossingBicycleFlow_Town12_Route1074_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1353_Weather3", "v1/HazardAtSideLane_Town04_Route164_Weather8", "v1/HighwayExit_Town13_Route706_Weather3", "v1/ParkedObstacle_Town12_Route770_Weather8", "v1/VehicleTurningRoute_Town12_Route933_Weather23", "v1/YieldToEmergencyVehicle_Town15_Route423_Weather7", "v1/Accident_Town13_Route551_Weather5", "v1/MergerIntoSlowTrafficV2_Town12_Route1057_Weather9", "v1/InterurbanAdvancedActorFlow_Town13_Route735_Weather7", "v1/OppositeVehicleRunningRedLight_Town12_Route929_Weather19", "v1/MergerIntoSlowTraffic_Town12_Route973_Weather11", "v1/ParkingExit_Town13_Route570_Weather23", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town13_Route738_Weather10", "v1/SignalizedJunctionLeftTurn_Town10HD_Route381_Weather22", "v1/YieldToEmergencyVehicle_Town13_Route561_Weather14", "v1/ConstructionObstacleTwoWays_Town12_Route1099_Weather7", "v1/DynamicObjectCrossing_Town02_Route13_Weather6", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route982_Weather20", "v1/LaneChange_Town13_Route739_Weather25", "v1/ConstructionObstacle_Town12_Route75_Weather23", "v1/AccidentTwoWays_Town12_Route1102_Weather10", "v1/ParkingCrossingPedestrian_Town15_Route403_Weather13", "v1/ControlLoss_Town04_Route169_Weather13", "v1/HazardAtSideLaneTwoWays_Town12_Route1142_Weather23", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town13_Route648_Weather23", "v1/HardBreakRoute_Town02_Route35_Weather9", "v1/HazardAtSideLaneTwoWays_Town12_Route1157_Weather13", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route878_Weather20", "v1/CrossingBicycleFlow_Town12_Route1068_Weather2", "v1/ConstructionObstacleTwoWays_Town12_Route1080_Weather14", 
"v1/AccidentTwoWays_Town12_Route1127_Weather9", "v1/HighwayExit_Town12_Route1046_Weather6", "v1/ConstructionObstacleTwoWays_Town12_Route1413_Weather26", "v1/ParkedObstacleTwoWays_Town12_Route1168_Weather23", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route891_Weather7", "v1/HighwayCutIn_Town12_Route849_Weather9", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1361_Weather14", "v1/HighwayExit_Town06_Route291_Weather5", "v1/DynamicObjectCrossing_Town13_Route25_Weather25", "v1/InterurbanAdvancedActorFlow_Town06_Route331_Weather19", "v1/TJunction_Town15_Route456_Weather14", "v1/VehicleTurningRoutePedestrian_Town12_Route1027_Weather13", "v1/VehicleTurningRoutePedestrian_Town12_Route827_Weather22", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route870_Weather12", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town15_Route532_Weather12", "v1/VanillaSignalizedTurnEncounterGreenLight_Town05_Route250_Weather8", "v1/VehicleTurningRoute_Town15_Route1378_Weather9", "v1/SignalizedJunctionLeftTurn_Town03_Route113_Weather26", "v1/AccidentTwoWays_Town12_Route1111_Weather19", "v1/MergerIntoSlowTrafficV2_Town12_Route1058_Weather18", "v1/ConstructionObstacle_Town04_Route66_Weather14", "v1/TJunction_Town12_Route882_Weather23", "v1/ParkedObstacleTwoWays_Town12_Route1158_Weather14", "v1/OppositeVehicleRunningRedLight_Town03_Route152_Weather22", "v1/HighwayCutIn_Town06_Route298_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1359_Weather12", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route981_Weather19", "v1/Accident_Town03_Route156_Weather0", "v1/NonSignalizedJunctionRightTurn_Town04_Route184_Weather2", "v1/ParkedObstacle_Town06_Route309_Weather23", "v1/MergerIntoSlowTrafficV2_Town12_Route1056_Weather8", "v1/PedestrianCrossing_Town12_Route1014_Weather0", "v1/ConstructionObstacleTwoWays_Town12_Route1097_Weather5", "v1/CrossingBicycleFlow_Town12_Route1073_Weather7", "v1/CrossingBicycleFlow_Town12_Route1064_Weather23", "v1/InterurbanAdvancedActorFlow_Town06_Route303_Weather22", "v1/OppositeVehicleRunningRedLight_Town04_Route179_Weather14", "v1/ParkingExit_Town12_Route1319_Weather3", "v1/ParkingCutIn_Town12_Route901_Weather9", "v1/VehicleTurningRoute_Town13_Route699_Weather23", "v1/AccidentTwoWays_Town12_Route1448_Weather5", "v1/HardBreakRoute_Town13_Route55_Weather3", "v1/HazardAtSideLaneTwoWays_Town12_Route1156_Weather12", "v1/SignalizedJunctionLeftTurn_Town05_Route233_Weather6", "v1/ConstructionObstacleTwoWays_Town12_Route1422_Weather26", "v1/CrossingBicycleFlow_Town12_Route1044_Weather3", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town15_Route509_Weather15", "v1/ParkingExit_Town12_Route1320_Weather5", "v1/VehicleTurningRoute_Town12_Route997_Weather9", "v1/ConstructionObstacle_Town15_Route85_Weather7", "v1/DynamicObjectCrossing_Town13_Route26_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1357_Weather10", "v1/VehicleTurningRoutePedestrian_Town12_Route829_Weather25", "v1/HardBreakRoute_Town01_Route31_Weather5", "v1/AccidentTwoWays_Town12_Route1116_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town13_Route647_Weather23", "v1/BlockedIntersection_Town04_Route194_Weather12", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route871_Weather13", "v1/HazardAtSideLane_Town12_Route1528_Weather0", "v1/HazardAtSideLane_Town12_Route777_Weather23", "v1/TJunction_Town07_Route363_Weather25", "v1/DynamicObjectCrossing_Town11_Route20_Weather20", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1355_Weather6", "v1/VanillaSignalizedTurnEncounterGreenLight_Town05_Route251_Weather9", 
"v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route692_Weather8", "v1/ParkedObstacleTwoWays_Town12_Route1169_Weather25", "v1/EnterActorFlow_Town04_Route192_Weather10", "v1/ParkingCutIn_Town13_Route1342_Weather0", "v1/DynamicObjectCrossing_Town01_Route4_Weather3", "v1/SignalizedJunctionLeftTurn_Town07_Route334_Weather26", "v1/BlockedIntersection_Town13_Route615_Weather9", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town07_Route360_Weather22", "v1/HazardAtSideLane_Town12_Route1525_Weather22", "v1/ControlLoss_Town15_Route431_Weather15", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town05_Route258_Weather25", "v1/BlockedIntersection_Town03_Route134_Weather3", "v1/NonSignalizedJunctionLeftTurn_Town04_Route212_Weather26", "v1/SignalizedJunctionLeftTurnEnterFlow_Town12_Route884_Weather0", "v1/ParkingCutIn_Town12_Route1315_Weather6", "v1/ParkingCutIn_Town13_Route672_Weather22", "v1/VanillaSignalizedTurnEncounterRedLight_Town07_Route357_Weather19", "v1/ParkedObstacle_Town05_Route273_Weather7", "v1/YieldToEmergencyVehicle_Town15_Route425_Weather9", "v1/MergerIntoSlowTrafficV2_Town12_Route856_Weather23", "v1/ParkedObstacleTwoWays_Town12_Route1173_Weather3", "v1/SignalizedJunctionLeftTurn_Town04_Route174_Weather18", "v1/ConstructionObstacle_Town15_Route84_Weather6", "v1/InterurbanActorFlow_Town12_Route1296_Weather7", "v1/HighwayExit_Town06_Route311_Weather25", "v1/LaneChange_Town13_Route740_Weather0", "v1/HazardAtSideLane_Town12_Route1534_Weather7", "v1/ParkingCutIn_Town12_Route763_Weather9", "v1/ParkingCutIn_Town12_Route1310_Weather0", "v1/ParkingCutIn_Town13_Route729_Weather1", "v1/VanillaSignalizedTurnEncounterGreenLight_Town13_Route642_Weather18", "v1/PedestrianCrossing_Town13_Route716_Weather14", "v1/PedestrianCrossing_Town12_Route864_Weather6", "v1/ConstructionObstacle_Town03_Route60_Weather8", "v1/HazardAtSideLane_Town05_Route223_Weather15", "v1/Accident_Town15_Route412_Weather22", "v1/ConstructionObstacle_Town04_Route65_Weather13", "v1/ParkedObstacleTwoWays_Town13_Route1335_Weather26", "v1/PedestrianCrossing_Town12_Route866_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route930_Weather20", "v1/VanillaSignalizedTurnEncounterGreenLight_Town03_Route137_Weather7", "v1/HighwayCutIn_Town12_Route1047_Weather7", "v1/ParkingCutIn_Town13_Route696_Weather20", "v1/ParkedObstacleTwoWays_Town12_Route1165_Weather21", "v1/HighwayCutIn_Town13_Route629_Weather5", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route1015_Weather1", "v1/HardBreakRoute_Town12_Route51_Weather25", "v1/YieldToEmergencyVehicle_Town12_Route919_Weather11", "v1/InvadingTurn_Town12_Route924_Weather14", "v1/ConstructionObstacle_Town13_Route83_Weather5", "v1/HighwayExit_Town13_Route744_Weather8", "v1/Accident_Town12_Route767_Weather13", "v1/ParkedObstacleTwoWays_Town12_Route1174_Weather3", "v1/Accident_Town03_Route146_Weather8", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1363_Weather8", "v1/DynamicObjectCrossing_Town02_Route15_Weather15", "v1/NonSignalizedJunctionLeftTurn_Town04_Route183_Weather1", "v1/SignalizedJunctionLeftTurn_Town05_Route232_Weather23", "v1/ParkingCrossingPedestrian_Town15_Route405_Weather15", "v1/VehicleTurningRoutePedestrian_Town13_Route609_Weather21", "v1/HighwayExit_Town13_Route704_Weather2", "v1/BlockedIntersection_Town10HD_Route391_Weather1", "v1/HighwayExit_Town12_Route1331_Weather10", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route876_Weather18", "v1/YieldToEmergencyVehicle_Town13_Route673_Weather23", "v1/OppositeVehicleRunningRedLight_Town12_Route990_Weather2", 
"v1/TJunction_Town05_Route261_Weather1", "v1/StaticCutIn_Town12_Route782_Weather2", "v1/HighwayExit_Town12_Route838_Weather6", "v1/VanillaSignalizedTurnEncounterRedLight_Town05_Route253_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town10HD_Route387_Weather23", "v1/ConstructionObstacle_Town15_Route87_Weather9", "v1/TJunction_Town13_Route652_Weather2", "v1/ParkingCutIn_Town12_Route1302_Weather15", "v1/SignalizedJunctionLeftTurnEnterFlow_Town15_Route498_Weather23", "v1/YieldToEmergencyVehicle_Town15_Route424_Weather8", "v1/CrossingBicycleFlow_Town12_Route1070_Weather3", "v1/ConstructionObstacle_Town04_Route64_Weather12", "v1/ParkingCutIn_Town12_Route902_Weather18", "v1/MergerIntoSlowTrafficV2_Town12_Route1059_Weather19", "v1/MergerIntoSlowTrafficV2_Town12_Route976_Weather14", "v1/SignalizedJunctionLeftTurn_Town12_Route799_Weather0", "v1/InterurbanAdvancedActorFlow_Town12_Route1030_Weather8", "v1/YieldToEmergencyVehicle_Town03_Route148_Weather18", "v1/ConstructionObstacle_Town06_Route72_Weather20", "v1/PedestrianCrossing_Town13_Route636_Weather12", "v1/SignalizedJunctionRightTurn_Town15_Route438_Weather7", "v1/Accident_Town04_Route160_Weather3", "v1/Accident_Town12_Route1122_Weather3", "v1/Accident_Town06_Route308_Weather22", "v1/NonSignalizedJunctionLeftTurn_Town12_Route1362_Weather15", "v1/CrossingBicycleFlow_Town12_Route1012_Weather23", "v1/OppositeVehicleTakingPriority_Town05_Route241_Weather7", "v1/InterurbanAdvancedActorFlow_Town13_Route634_Weather10", "v1/DynamicObjectCrossing_Town02_Route9_Weather9", "v1/Accident_Town15_Route413_Weather23", "v1/ParkedObstacle_Town13_Route556_Weather10", "v1/OppositeVehicleTakingPriority_Town12_Route968_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route720_Weather19", "v1/NonSignalizedJunctionRightTurn_Town07_Route346_Weather15", "v1/BlockedIntersection_Town13_Route618_Weather20", "v1/HighwayCutIn_Town13_Route713_Weather11", "v1/AccidentTwoWays_Town12_Route1103_Weather11", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route1016_Weather2", "v1/ParkedObstacle_Town15_Route418_Weather2", "v1/ParkedObstacle_Town05_Route220_Weather12", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town05_Route256_Weather22", "v1/VehicleTurningRoutePedestrian_Town15_Route523_Weather2", "v1/ParkingCrossingPedestrian_Town12_Route761_Weather7", "v1/EnterActorFlow_Town13_Route611_Weather13", "v1/MergerIntoSlowTrafficV2_Town15_Route525_Weather5", "v1/HazardAtSideLane_Town12_Route1517_Weather14", "v1/ParkingExit_Town12_Route1322_Weather7", "v1/ConstructionObstacle_Town12_Route77_Weather25", "v1/ConstructionObstacle_Town05_Route71_Weather19", "v1/StaticCutIn_Town13_Route564_Weather18", "v1/ConstructionObstacleTwoWays_Town12_Route1096_Weather3", "v1/BlockedIntersection_Town12_Route835_Weather3", "v1/HighwayExit_Town06_Route293_Weather15", "v1/NonSignalizedJunctionRightTurn_Town12_Route815_Weather9", "v1/VanillaSignalizedTurnEncounterGreenLight_Town15_Route489_Weather21", "v1/BlockedIntersection_Town12_Route836_Weather3", "v1/ParkingCutIn_Town12_Route900_Weather19", "v1/CrossingBicycleFlow_Town12_Route862_Weather3", "v1/OppositeVehicleTakingPriority_Town03_Route155_Weather25", "v1/VanillaSignalizedTurnEncounterGreenLight_Town12_Route869_Weather11", "v1/ParkingExit_Town12_Route1307_Weather20", "v1/ControlLoss_Town11_Route401_Weather11", "v1/AccidentTwoWays_Town12_Route1107_Weather15", "v1/InterurbanActorFlow_Town13_Route708_Weather6", "v1/VehicleTurningRoute_Town12_Route824_Weather20", "v1/ControlLoss_Town10HD_Route377_Weather13"], "val": 
["v1/ParkingCrossingPedestrian_Town13_Route545_Weather25", "v1/OppositeVehicleTakingPriority_Town04_Route214_Weather6", "v1/DynamicObjectCrossing_Town02_Route11_Weather11", "v1/AccidentTwoWays_Town12_Route1115_Weather23", "v1/VehicleTurningRoute_Town15_Route504_Weather10", "v1/ParkingExit_Town12_Route922_Weather12", "v1/SignalizedJunctionLeftTurn_Town04_Route173_Weather26", "v1/EnterActorFlow_Town03_Route132_Weather2", "v1/HighwayExit_Town06_Route312_Weather0", "v1/VanillaSignalizedTurnEncounterRedLight_Town15_Route491_Weather23", "v1/CrossingBicycleFlow_Town12_Route977_Weather15", "v1/OppositeVehicleRunningRedLight_Town04_Route180_Weather23", "v1/VanillaSignalizedTurnEncounterRedLight_Town07_Route359_Weather21", "v1/ParkingCutIn_Town13_Route1343_Weather1", "v1/ParkedObstacle_Town06_Route282_Weather22", "v1/TJunction_Town06_Route306_Weather20", "v1/PedestrianCrossing_Town13_Route747_Weather19", "v1/VehicleTurningRoutePedestrian_Town15_Route445_Weather11", "v1/ConstructionObstacle_Town12_Route78_Weather0", "v1/HazardAtSideLaneTwoWays_Town12_Route1151_Weather7", "v1/ControlLoss_Town04_Route170_Weather14", "v1/MergerIntoSlowTrafficV2_Town12_Route857_Weather25", "v1/DynamicObjectCrossing_Town01_Route3_Weather3", "v1/SignalizedJunctionRightTurn_Town03_Route118_Weather14", "v1/BlockedIntersection_Town03_Route135_Weather5", "v1/MergerIntoSlowTraffic_Town06_Route317_Weather5", "v1/NonSignalizedJunctionRightTurn_Town03_Route126_Weather18", "v1/ParkedObstacleTwoWays_Town13_Route1333_Weather26", "v1/ConstructionObstacleTwoWays_Town12_Route1093_Weather1", "v1/TJunction_Town05_Route260_Weather0", "v1/NonSignalizedJunctionLeftTurn_Town07_Route342_Weather3", "v1/HighwayCutIn_Town12_Route1029_Weather15", "v1/HazardAtSideLane_Town10HD_Route373_Weather9", "v1/YieldToEmergencyVehicle_Town04_Route166_Weather10", "v1/HardBreakRoute_Town01_Route32_Weather6", "v1/SignalizedJunctionLeftTurnEnterFlow_Town13_Route657_Weather2", "v1/ConstructionObstacle_Town10HD_Route74_Weather22", "v1/ControlLoss_Town10HD_Route378_Weather14", "v1/Accident_Town05_Route218_Weather10", "v1/InterurbanActorFlow_Town12_Route1291_Weather1", "v1/LaneChange_Town06_Route307_Weather21", "v1/InvadingTurn_Town02_Route95_Weather9", "v1/VanillaNonSignalizedTurnEncounterStopsign_Town12_Route979_Weather9", "v1/StaticCutIn_Town05_Route226_Weather18", "v1/VehicleOpensDoorTwoWays_Town12_Route1203_Weather7", "v1/VehicleTurningRoutePedestrian_Town15_Route481_Weather19", "v1/VanillaSignalizedTurnEncounterGreenLight_Town07_Route354_Weather8", "v1/NonSignalizedJunctionLeftTurnEnterFlow_Town12_Route949_Weather13", "v1/InterurbanAdvancedActorFlow_Town06_Route324_Weather2", "v1/ParkedObstacle_Town10HD_Route372_Weather8"]} \ No newline at end of file diff --git a/docs/CONVERT_GUIDE.md b/docs/CONVERT_GUIDE.md new file mode 100644 index 0000000..5b4c988 --- /dev/null +++ b/docs/CONVERT_GUIDE.md @@ -0,0 +1,29 @@ +# Code Convert Guide + +This document outlines important considerations for migrating code based on nuscenes or other datasets to bench2drive. + +## Models + +We integrated several MMCV dependencies into the `mmcv` directory and no longer install the original libraries. You can refer to our existing methods to utilize these modules and place your own models and utils in `mmcv` directory and register them. Please make sure the mmcv directory contains all the modules you need; if not, you will need to add them. + +## Scripts and configs + +You can place the configs and scripts for each method in the `adzoo` . 
Utils for each method can also be placed here for easier management. + +## Details of configs + +To create a config for the bench2drive dataset, note the following: + +- We have included the bench2drive name-to-class mapping and evaluation settings directly in the config. You can use our settings or modify them as needed. +- Unlike the 10 classes in nuscenes, we use 9 classes in bench2drive. +- Methods like UniAD and VAD use 3 commands on nuscenes, while bench2drive uses 6 commands obtained from Carla. + +## Dataset + +- The reference frame of the Bench2Drive data differs significantly from the coordinate system used by Nuscenes (see [here](https://github.com/Thinklab-SJTU/Bench2Drive/blob/main/docs/anno.md) for details). In `mmcv/datasets/prepare_B2D.py`, we convert the world coordinate system, ego coordinate system, and sensor coordinate system to match the Nuscenes reference frame, including the vehicle coordinates, bounding box coordinates, and sensor extrinsics. You can refer to our code for data alignment. +- In Nuscenes, keyframes are at 2Hz, while Bench2Drive runs at 10Hz with annotations for each frame. For reproducing UniAD and VAD, we set the window length (time interval between adjacent points in past and future trajectories) to 0.5s and the window shift to 0.1s (any frame can be selected as the current frame). This fully utilizes Bench2Drive's data and aligns the trajectories with Nuscenes. +- For the map, Bench2Drive stores vectorized maps. You can refer to our code to use the map, such as extracting map elements within a certain range. + +## Team agent + +To perform closed-loop evaluation in Carla, set up sensors to gather data from Carla. Use this data to compute all necessary model inputs, then convert the model outputs into a `carla.VehicleControl` object. \ No newline at end of file diff --git a/docs/DATA_PREP.md b/docs/DATA_PREP.md new file mode 100644 index 0000000..066338d --- /dev/null +++ b/docs/DATA_PREP.md @@ -0,0 +1,81 @@ +# Prepare Bench2Drive Dataset + +## Download Bench2Drive + +Download our dataset from (LINK) and make sure the data structure is as follows: + +``` + Bench2DriveZoo + ├── ... + ├── data/ + | ├── bench2drive/ + | | ├── v1/ # Bench2Drive base + | | | ├── Accident_Town03_Route101_Weather23/ + | | | ├── Accident_Town03_Route102_Weather20/ + | | | └── ... + | | └── maps/ # maps of Towns + | | ├── Town01_HD_map.npz + | | ├── Town02_HD_map.npz + | | └── ... + | ├── others + | | └── b2d_motion_anchor_infos_mode6.pkl # motion anchors for UniAD + | └── splits + | └── bench2drive_base_train_val_split.json # trainval_split of Bench2Drive base + +``` + +## Prepare Bench2Drive data info + +Run the following command: + +``` +cd mmcv/datasets +python prepare_B2D.py --workers 16 # workers used to prepare data +``` + +The command will generate `b2d_infos_train.pkl`, `b2d_infos_val.pkl`, `b2d_map_infos.pkl` under `data/infos`. +*Note: It will take about 1 hour to generate all the data with 16 workers.* + + +## Structure of code + + +After installation and data preparation, the structure of our code will be as follows: + +``` + Bench2DriveZoo + ├── adzoo/ + | ├── bevformer/ + | ├── uniad/ + | └── vad/ + ├── ckpts/ + | ├── r101_dcn_fcos3d_pretrain.pth # pretrain weights for bevformer + | ├── resnet50-19c8e357.pth # image backbone pretrain weights for vad + | ├── bevformer_base_b2d.pth # download weights you need + | ├── uniad_base_b2d.pth # download weights you need + | └── ...
+ ├── data/ + | ├── bench2drive/ + | | ├── v1/ # Bench2Drive base + | | | ├── Accident_Town03_Route101_Weather23/ + | | | ├── Accident_Town03_Route102_Weather20/ + | | | └── ... + | | └── maps/ # maps of Towns + | | ├── Town01_HD_map.npz + | | ├── Town02_HD_map.npz + | | └── ... + | ├── infos/ + | | ├── b2d_infos_train.pkl + | | ├── b2d_infos_val.pkl + | | └── b2d_map_infos.pkl + | ├── others + | | └── b2d_motion_anchor_infos_mode6.pkl # motion anchors for UniAD + | └── splits + | └── bench2drive_base_train_val_split.json # trainval_split of Bench2Drive base + ├── docs/ + ├── mmcv/ + ├── team_code/ # for Closed-loop Evaluation in Carla +``` + + + diff --git a/docs/EVAL_IN_CARLA.md b/docs/EVAL_IN_CARLA.md new file mode 100644 index 0000000..0d06c57 --- /dev/null +++ b/docs/EVAL_IN_CARLA.md @@ -0,0 +1,26 @@ +# Closed Loop Evaluation + +Please follow these steps to evaluate UniAD and VAD in Carla: + +## Preparations + +- Install this repo following the installation [doc](docs/INSTALL.md). +- Install Bench2Drive from [here](https://github.com/Thinklab-SJTU/Bench2Drive). + + +## Link this repo to Bench2Drive + +```bash +# Add your agent code +cd Bench2Drive/leaderboard +mkdir team_code +cd Bench2Drive/leaderboard/team_code +ln -s YOUR_TEAM_AGENT ./ # link your agent code +cd Bench2Drive/ +ln -s Bench2DriveZoo/team_code/* ./ # link the team_code files of this repo into Bench2Drive +``` + +## Run evaluation + +Follow [this](https://github.com/Thinklab-SJTU/Bench2Drive?tab=readme-ov-file#eval-tools) to use the evaluation tools of Bench2Drive. + diff --git a/docs/INSTALL.md b/docs/INSTALL.md new file mode 100644 index 0000000..2e69655 --- /dev/null +++ b/docs/INSTALL.md @@ -0,0 +1,52 @@ +## Follow these steps to install the environment +- **STEP 1: Create environment** + ``` + conda create -n uniad python=3.8 + conda activate uniad + ``` +- **STEP 2: Install cudatoolkit** + ``` + conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit + ``` +- **STEP 3: Install torch** + ``` + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + ``` +- **STEP 4: Set environment variables** + ``` + export PATH=YOUR_GCC_PATH/bin:$PATH + export CUDA_HOME=YOUR_CUDA_PATH/ + ``` +- **STEP 5: Install ninja and packaging** + ``` + pip install ninja packaging + ``` +- **STEP 6: Install our repo** + ``` + pip install -v -e . + ``` + +- **STEP 7: Prepare pretrained weights.** + Create the `ckpts` directory: + + ``` + mkdir ckpts + ``` + Download `resnet50-19c8e357.pth` from [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/resnet50-19c8e357.pth) or [Baidu Cloud](https://pan.baidu.com/s/1LlSrbYvghnv3lOlX1uLU5g?pwd=1234) + Download `r101_dcn_fcos3d_pretrain.pth` from [Hugging Face](https://huggingface.co/rethinklab/Bench2DriveZoo/blob/main/r101_dcn_fcos3d_pretrain.pth) or [Baidu Cloud](https://pan.baidu.com/s/1o7owaQ5G66xqq2S0TldwXQ?pwd=1234) + + +- **STEP 8: Install Carla for closed-loop evaluation.** + + + ``` + mkdir carla + cd carla + wget https://carla-releases.s3.us-east-005.backblazeb2.com/Linux/CARLA_0.9.15.tar.gz + tar -xvf CARLA_0.9.15.tar.gz + cd Import && wget https://carla-releases.s3.us-east-005.backblazeb2.com/Linux/AdditionalMaps_0.9.15.tar.gz + cd ..
&& bash ImportAssets.sh + export CARLA_ROOT=YOUR_CARLA_PATH + echo "$CARLA_ROOT/PythonAPI/carla/dist/carla-0.9.15-py3.7-linux-x86_64.egg" >> YOUR_CONDA_PATH/envs/YOUR_CONDA_ENV_NAME/lib/python3.7/site-packages/carla.pth # python 3.8 also works well, please set YOUR_CONDA_PATH and YOUR_CONDA_ENV_NAME + + ``` \ No newline at end of file diff --git a/docs/TRAIN_EVAL.md b/docs/TRAIN_EVAL.md new file mode 100644 index 0000000..ce66ff0 --- /dev/null +++ b/docs/TRAIN_EVAL.md @@ -0,0 +1,68 @@ +# Train/Eval models + +You can use the following commands to train and evaluate [BEVFormer](https://github.com/fundamentalvision/BEVFormer), [UniAD](https://github.com/OpenDriveLab/UniAD) and [VAD](https://github.com/hustvl/VAD). + +## BEVFormer + +### Train + +```bash +#train BEVFormer base +./adzoo/bevformer/dist_train.sh ./adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py 4 #N_GPUS +#train BEVFormer tiny +./adzoo/bevformer/dist_train.sh ./adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py 4 #N_GPUS +``` +### Open loop eval + +```bash +#eval BEVFormer base +./adzoo/bevformer/dist_test.sh ./adzoo/bevformer/configs/bevformer/bevformer_base_b2d.py ./ckpts/bevformer_base_b2d.pth 1 +#eval BEVFormer tiny +./adzoo/bevformer/dist_test.sh ./adzoo/bevformer/configs/bevformer/bevformer_tiny_b2d.py ./ckpts/bevformer_tiny_b2d.pth 1 +``` + + +## UniAD + +### Train stage1 +```bash +#train UniAD base +./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage1_track_map/base_track_map_b2d.py 4 +#train UniAD tiny +./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage1_track_map/tiny_track_map_b2d.py 4 +``` + +### Train stage2 +```bash +#train UniAD base +./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py 1 +#train UniAD tiny +./adzoo/uniad/uniad_dist_train.sh ./adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py 1 +``` + + +### Open loop eval + +```bash +#eval UniAD base +./adzoo/uniad/uniad_dist_eval.sh ./adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py ./ckpts/uniad_base_b2d.pth 1 +#eval UniAD tiny +./adzoo/uniad/uniad_dist_eval.sh ./adzoo/uniad/configs/stage2_e2e/tiny_e2e_b2d.py ./ckpts/uniad_tiny_b2d.pth 1 +``` + + +## VAD + +### Train + +```bash +./adzoo/vad/dist_train.sh ./adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py 1 #N_GPUS +``` + +### Open loop eval + +```bash +./adzoo/vad/dist_test.sh ./adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py ./ckpts/vad_b2d_base.pth 1 +``` + +**NOTE**: UniAD and VAD use different definitions to calculate Planning L2. UniAD calculates L2 at each time step (0.5s, 1.0s, 1.5s, ...), while VAD calculates the average over each time period (0s-0.5s, 0s-1.0s, 0s-1.5s, ...). We retain the original calculation logic in the code, but report UniAD's Planning L2 converted to VAD's definition. \ No newline at end of file diff --git a/mmcv/__init__.py b/mmcv/__init__.py new file mode 100644 index 0000000..29f79b1 --- /dev/null +++ b/mmcv/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved.
+# flake8: noqa +__version__ = '0.0.1' + +from .fileio import * +from .image import * +from .utils import * +from .core.bbox.coder.nms_free_coder import NMSFreeCoder +from .core.bbox.match_costs import BBox3DL1Cost, DiceCost +from .core.evaluation.eval_hooks import CustomDistEvalHook +from .models.utils import * +from .models.opt.adamw import AdamW2 +from .losses import * +from .structures import Instances, BoxMode, Boxes +from .layers import cat, Conv2d, batched_nms, get_norm \ No newline at end of file diff --git a/mmcv/core/__init__.py b/mmcv/core/__init__.py new file mode 100644 index 0000000..a401238 --- /dev/null +++ b/mmcv/core/__init__.py @@ -0,0 +1,10 @@ +from .anchor import * # noqa: F401, F403 +from .bbox import * # noqa: F401, F403 +from .evaluation import * # noqa: F401, F403 +from .points import * # noqa: F401, F403 +from .mask import * # noqa: F401, F403 +from .post_processing import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 +# from .seg import * # noqa: F401, F403 +from .visualizer import * # noqa: F401, F403 +from .voxel import * # noqa: F401, F403 diff --git a/mmcv/core/anchor/__init__.py b/mmcv/core/anchor/__init__.py new file mode 100644 index 0000000..e3262a7 --- /dev/null +++ b/mmcv/core/anchor/__init__.py @@ -0,0 +1,18 @@ +from .anchor_generator import (AnchorGenerator, LegacyAnchorGenerator, + YOLOAnchorGenerator) +from .builder import (ANCHOR_GENERATORS, PRIOR_GENERATORS, + build_anchor_generator, build_prior_generator) +from .point_generator import MlvlPointGenerator, PointGenerator +from .utils import anchor_inside_flags, calc_region, images_to_levels +from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator, + AlignedAnchor3DRangeGeneratorPerCls, + Anchor3DRangeGenerator) + +__all__ = [ + 'AnchorGenerator', 'LegacyAnchorGenerator', 'anchor_inside_flags', + 'PointGenerator', 'images_to_levels', 'calc_region', + 'build_anchor_generator', 'ANCHOR_GENERATORS', 'YOLOAnchorGenerator', + 'build_prior_generator', 'PRIOR_GENERATORS', 'MlvlPointGenerator', + 'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator', + 'AlignedAnchor3DRangeGeneratorPerCls' +] diff --git a/mmcv/core/anchor/anchor_3d_generator.py b/mmcv/core/anchor/anchor_3d_generator.py new file mode 100644 index 0000000..118f6ea --- /dev/null +++ b/mmcv/core/anchor/anchor_3d_generator.py @@ -0,0 +1,404 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmcv.core.anchor import ANCHOR_GENERATORS +from mmcv.utils import is_list_of + + +@ANCHOR_GENERATORS.register_module() +class Anchor3DRangeGenerator(object): + """3D Anchor Generator by range. + + This anchor generator generates anchors by the given range in different + feature levels. + Due the convention in 3D detection, different anchor sizes are related to + different ranges for different categories. However we find this setting + does not effect the performance much in some datasets, e.g., nuScenes. + + Args: + ranges (list[list[float]]): Ranges of different anchors. + The ranges are the same across different feature levels. But may + vary for different anchor sizes if size_per_range is True. + sizes (list[list[float]]): 3D sizes of anchors. + scales (list[int]): Scales of anchors in different feature levels. + rotations (list[float]): Rotations of anchors in a feature grid. + custom_values (tuple[float]): Customized values of that anchor. For + example, in nuScenes the anchors have velocities. + reshape_out (bool): Whether to reshape the output into (N x 4). 
+ size_per_range: Whether to use separate ranges for different sizes. + If size_per_range is True, the ranges should have the same length + as the sizes, if not, it will be duplicated. + """ + + def __init__(self, + ranges, + sizes=[[1.6, 3.9, 1.56]], + scales=[1], + rotations=[0, 1.5707963], + custom_values=(), + reshape_out=True, + size_per_range=True): + assert is_list_of(ranges, list) + if size_per_range: + if len(sizes) != len(ranges): + assert len(ranges) == 1 + ranges = ranges * len(sizes) + assert len(ranges) == len(sizes) + else: + assert len(ranges) == 1 + assert is_list_of(sizes, list) + assert isinstance(scales, list) + + self.sizes = sizes + self.scales = scales + self.ranges = ranges + self.rotations = rotations + self.custom_values = custom_values + self.cached_anchors = None + self.reshape_out = reshape_out + self.size_per_range = size_per_range + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'anchor_range={self.ranges},\n' + s += f'scales={self.scales},\n' + s += f'sizes={self.sizes},\n' + s += f'rotations={self.rotations},\n' + s += f'reshape_out={self.reshape_out},\n' + s += f'size_per_range={self.size_per_range})' + return s + + @property + def num_base_anchors(self): + """list[int]: Total number of base anchors in a feature grid.""" + num_rot = len(self.rotations) + num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0) + return num_rot * num_size + + @property + def num_levels(self): + """int: Number of feature levels that the generator is applied to.""" + return len(self.scales) + + def grid_anchors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str): Device where the anchors will be put on. + + Returns: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature lavel, \ + num_base_anchors is the number of anchors for that level. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_anchors( + featmap_sizes[i], self.scales[i], device=device) + if self.reshape_out: + anchors = anchors.reshape(-1, anchors.size(-1)) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_anchors(self, featmap_size, scale, device='cuda'): + """Generate grid anchors of a single level feature map. + + This function is usually called by method ``self.grid_anchors``. + + Args: + featmap_size (tuple[int]): Size of the feature map. + scale (float): Scale factor of the anchors in the current level. + device (str, optional): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature map. 
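+
+        Example (an illustrative sketch of the returned layout; the anchor
+            range below is an arbitrary placeholder, not a tuned setting):
+            >>> gen = Anchor3DRangeGenerator(
+            ...     ranges=[[0., -40., -3., 70.4, 40., 1.]])
+            >>> anchors = gen.single_level_grid_anchors(
+            ...     (2, 2), scale=1, device='cpu')
+            >>> anchors.shape  # (z, y, x, num_sizes, num_rots, box_dim)
+            torch.Size([1, 2, 2, 1, 2, 7])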
+ """ + # We reimplement the anchor generator using torch in cuda + # torch: 0.6975 s for 1000 times + # numpy: 4.3345 s for 1000 times + # which is ~5 times faster than the numpy implementation + if not self.size_per_range: + return self.anchors_single_range( + featmap_size, + self.ranges[0], + scale, + self.sizes, + self.rotations, + device=device) + + mr_anchors = [] + for anchor_range, anchor_size in zip(self.ranges, self.sizes): + mr_anchors.append( + self.anchors_single_range( + featmap_size, + anchor_range, + scale, + anchor_size, + self.rotations, + device=device)) + mr_anchors = torch.cat(mr_anchors, dim=-3) + return mr_anchors + + def anchors_single_range(self, + feature_size, + anchor_range, + scale=1, + sizes=[[1.6, 3.9, 1.56]], + rotations=[0, 1.5707963], + device='cuda'): + """Generate anchors in a single range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + scale (float | int, optional): The scale factor of anchors. + sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with + shape [N, 3], in order of x, y, z. + rotations (list[float] | np.ndarray | torch.Tensor): Rotations of + anchors in a single feature grid. + device (str): Devices that the anchors will be put on. + + Returns: + torch.Tensor: Anchors with shape \ + [*feature_size, num_sizes, num_rots, 7]. + """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], anchor_range[5], feature_size[0], device=device) + y_centers = torch.linspace( + anchor_range[1], anchor_range[4], feature_size[1], device=device) + x_centers = torch.linspace( + anchor_range[0], anchor_range[3], feature_size[2], device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale + rotations = torch.tensor(rotations, device=device) + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations) + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + # [1, 200, 176, N, 2, 7] for kitti after permute + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + # [1, 200, 176, N, 2, 9] for nus dataset after permute + return ret + + +@ANCHOR_GENERATORS.register_module() +class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator): + """Aligned 3D Anchor Generator by range. + + This anchor generator uses a different manner to generate the positions + of anchors' centers from :class:`Anchor3DRangeGenerator`. + + Note: + The `align` means that the anchor's center is aligned with the voxel + grid, which is also the feature grid. 
The previous implementation of + :class:`Anchor3DRangeGenerator` does not generate the anchors' center + according to the voxel grid. Rather, it generates the center by + uniformly distributing the anchors inside the minimum and maximum + anchor ranges according to the feature map sizes. + However, this means the anchor centers do not match the feature grid. + The :class:`AlignedAnchor3DRangeGenerator` adds 1 when using the + feature map sizes to obtain the corners of the voxel grid. Then it + shifts the coordinates to the center of the voxel grid and uses the left + up corner to distribute anchors. + + Args: + align_corner (bool): Whether to align with the corner of the voxel + grid. By default it is False and the anchor's center will be + the same as the corresponding voxel's center, which is also the + center of the corresponding feature grid. + """ + + def __init__(self, align_corner=False, **kwargs): + super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs) + self.align_corner = align_corner + + def anchors_single_range(self, + feature_size, + anchor_range, + scale, + sizes=[[1.6, 3.9, 1.56]], + rotations=[0, 1.5707963], + device='cuda'): + """Generate anchors in a single range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list or a tuple of [D, H, W] (in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + scale (float | int, optional): The scale factor of anchors. + sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with + shape [N, 3], in order of x, y, z. + rotations (list[float] | np.ndarray | torch.Tensor): Rotations of + anchors in a single feature grid. + device (str): Devices that the anchors will be put on. + + Returns: + torch.Tensor: Anchors with shape \ + [*feature_size, num_sizes, num_rots, 7].
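+
+        Example (an illustrative sketch; the tiny 2 m voxel grid below is an
+            arbitrary placeholder used to show that anchor centers land on
+            voxel centers):
+            >>> gen = AlignedAnchor3DRangeGenerator(
+            ...     ranges=[[0., 0., 0., 4., 4., 2.]])
+            >>> ret = gen.anchors_single_range(
+            ...     (1, 2, 2), [0., 0., 0., 4., 4., 2.], 1, device='cpu')
+            >>> ret.shape
+            torch.Size([1, 2, 2, 1, 2, 7])
+            >>> ret[0, 0, 0, 0, 0, :3]  # center of the first voxel
+            tensor([1., 1., 1.])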
+ """ + if len(feature_size) == 2: + feature_size = [1, feature_size[0], feature_size[1]] + anchor_range = torch.tensor(anchor_range, device=device) + z_centers = torch.linspace( + anchor_range[2], + anchor_range[5], + feature_size[0] + 1, + device=device) + y_centers = torch.linspace( + anchor_range[1], + anchor_range[4], + feature_size[1] + 1, + device=device) + x_centers = torch.linspace( + anchor_range[0], + anchor_range[3], + feature_size[2] + 1, + device=device) + sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale + rotations = torch.tensor(rotations, device=device) + + # shift the anchor center + if not self.align_corner: + z_shift = (z_centers[1] - z_centers[0]) / 2 + y_shift = (y_centers[1] - y_centers[0]) / 2 + x_shift = (x_centers[1] - x_centers[0]) / 2 + z_centers += z_shift + y_centers += y_shift + x_centers += x_shift + + # torch.meshgrid default behavior is 'id', np's default is 'xy' + rets = torch.meshgrid(x_centers[:feature_size[2]], + y_centers[:feature_size[1]], + z_centers[:feature_size[0]], rotations) + + # torch.meshgrid returns a tuple rather than list + rets = list(rets) + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) + + sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = sizes.repeat(tile_size_shape) + rets.insert(3, sizes) + + ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) + + if len(self.custom_values) > 0: + custom_ndim = len(self.custom_values) + custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) + # TODO: check the support of custom values + # custom[:] = self.custom_values + ret = torch.cat([ret, custom], dim=-1) + return ret + + +@ANCHOR_GENERATORS.register_module() +class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator): + """3D Anchor Generator by range for per class. + + This anchor generator generates anchors by the given range for per class. + Note that feature maps of different classes may be different. + + Args: + kwargs (dict): Arguments are the same as those in \ + :class:`AlignedAnchor3DRangeGenerator`. + """ + + def __init__(self, **kwargs): + super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs) + assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \ + ' not supported currently in this kind of anchor generator.' + + def grid_anchors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes for \ + different classes in a single feature level. + device (str): Device where the anchors will be put on. + + Returns: + list[list[torch.Tensor]]: Anchors in multiple feature levels. \ + Note that in this anchor generator, we currently only \ + support single feature level. The sizes of each tensor \ + should be [num_sizes/ranges*num_rots*featmap_size, \ + box_code_size]. + """ + multi_level_anchors = [] + anchors = self.multi_cls_grid_anchors( + featmap_sizes, self.scales[0], device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'): + """Generate grid anchors of a single level feature map for multi-class + with different feature map sizes. + + This function is usually called by method ``self.grid_anchors``. 
+ + Args: + featmap_sizes (list[tuple]): List of feature map sizes for \ + different classes in a single feature level. + scale (float): Scale factor of the anchors in the current level. + device (str, optional): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature map. + """ + assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \ + 'The number of different feature map sizes anchor sizes and ' + \ + 'ranges should be the same.' + + multi_cls_anchors = [] + for i in range(len(featmap_sizes)): + anchors = self.anchors_single_range( + featmap_sizes[i], + self.ranges[i], + scale, + self.sizes[i], + self.rotations, + device=device) + # [*featmap_size, num_sizes/ranges, num_rots, box_code_size] + ndim = len(featmap_sizes[i]) + anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1)) + # [*featmap_size, num_sizes/ranges*num_rots, box_code_size] + anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1) + # [num_sizes/ranges*num_rots, *featmap_size, box_code_size] + multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1))) + # [num_sizes/ranges*num_rots*featmap_size, box_code_size] + return multi_cls_anchors diff --git a/mmcv/core/anchor/anchor_generator.py b/mmcv/core/anchor/anchor_generator.py new file mode 100644 index 0000000..2b8c7d8 --- /dev/null +++ b/mmcv/core/anchor/anchor_generator.py @@ -0,0 +1,838 @@ +import warnings + +from mmcv.utils import is_tuple_of +import numpy as np +import torch +from torch.nn.modules.utils import _pair + +from .builder import PRIOR_GENERATORS + + +@PRIOR_GENERATORS.register_module() +class AnchorGenerator: + """Standard anchor generator for 2D anchor-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int] | None): The basic sizes + of anchors in multiple levels. + If None is given, strides will be used as base_sizes. + (If strides are non square, the shortest stride is taken.) + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. If a list of tuple of + float is given, they will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0 in V2.0. 
+ + Examples: + >>> from mmcv.core import AnchorGenerator + >>> self = AnchorGenerator([16], [1.], [1.], [9]) + >>> all_anchors = self.grid_anchors([(2, 2)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]])] + >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18]) + >>> all_anchors = self.grid_anchors([(2, 2), (1, 1)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]]), \ + tensor([[-9., -9., 9., 9.]])] + """ + + def __init__(self, + strides, + ratios, + scales=None, + base_sizes=None, + scale_major=True, + octave_base_scale=None, + scales_per_octave=None, + centers=None, + center_offset=0.): + # check center and center_offset + if center_offset != 0: + assert centers is None, 'center cannot be set when center_offset' \ + f'!=0, {centers} is given.' + if not (0 <= center_offset <= 1): + raise ValueError('center_offset should be in range [0, 1], ' + f'{center_offset} is given.') + if centers is not None: + assert len(centers) == len(strides), \ + 'The number of strides should be the same as centers, got ' \ + f'{strides} and {centers}' + + # calculate base sizes of anchors + self.strides = [_pair(stride) for stride in strides] + self.base_sizes = [min(stride) for stride in self.strides + ] if base_sizes is None else base_sizes + assert len(self.base_sizes) == len(self.strides), \ + 'The number of strides should be the same as base sizes, got ' \ + f'{self.strides} and {self.base_sizes}' + + # calculate scales of anchors + assert ((octave_base_scale is not None + and scales_per_octave is not None) ^ (scales is not None)), \ + 'scales and octave_base_scale with scales_per_octave cannot' \ + ' be set at the same time' + if scales is not None: + self.scales = torch.Tensor(scales) + elif octave_base_scale is not None and scales_per_octave is not None: + octave_scales = np.array( + [2**(i / scales_per_octave) for i in range(scales_per_octave)]) + scales = octave_scales * octave_base_scale + self.scales = torch.Tensor(scales) + else: + raise ValueError('Either scales or octave_base_scale with ' + 'scales_per_octave should be set') + + self.octave_base_scale = octave_base_scale + self.scales_per_octave = scales_per_octave + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.centers = centers + self.center_offset = center_offset + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + """list[int]: total number of base anchors in a feature grid""" + return self.num_base_priors + + @property + def num_base_priors(self): + """list[int]: The number of priors (anchors) at a point + on the feature grid""" + return [base_anchors.size(0) for base_anchors in self.base_anchors] + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
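+
+        Example (an illustrative sketch reusing the toy single-level setup
+            from the class docstring above):
+            >>> self = AnchorGenerator([16], [1.], [1.], [9])
+            >>> self.gen_base_anchors()
+            [tensor([[-4.5000, -4.5000,  4.5000,  4.5000]])]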
+ """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors( + base_size, + scales=self.scales, + ratios=self.ratios, + center=center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, + base_size, + scales, + ratios, + center=None): + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws, + y_center + 0.5 * hs + ] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool, optional): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + # use shape instead of len to keep tracing while exporting to onnx + xx = x.repeat(y.shape[0]) + yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_priors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str): The device where the anchors will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_priors( + featmap_sizes[i], level_idx=i, device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_priors(self, featmap_size, level_idx, device='cuda'): + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps. + level_idx (int): The index of corresponding feature map level. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. 
+ """ + + base_anchors = self.base_anchors[level_idx].to(device) + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = torch.arange(0, feat_w, device=device) * stride_w + shift_y = torch.arange(0, feat_h, device=device) * stride_h + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def sparse_priors(self, + prior_idxs, + featmap_size, + level_idx, + dtype=torch.float32, + device='cuda'): + """Generate sparse anchors according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (h, w). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (obj:`torch.device`): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 4), N should be equal to + the length of ``prior_idxs``. + """ + + height, width = featmap_size + num_base_anchors = self.num_base_anchors[level_idx] + base_anchor_id = prior_idxs % num_base_anchors + x = (prior_idxs // + num_base_anchors) % width * self.strides[level_idx][0] + y = (prior_idxs // width // + num_base_anchors) % height * self.strides[level_idx][1] + priors = torch.stack([x, y, x, y], 1).to(dtype).to(device) + \ + self.base_anchors[level_idx][base_anchor_id, :].to(device) + + return priors + + def grid_anchors(self, featmap_sizes, device='cuda'): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + device (str): Device where the anchors will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + warnings.warn('``grid_anchors`` would be deprecated soon. ' + 'Please use ``grid_priors`` ') + + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_anchors( + self.base_anchors[i].to(device), + featmap_sizes[i], + self.strides[i], + device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_anchors(self, + base_anchors, + featmap_size, + stride=(16, 16), + device='cuda'): + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_anchors``. + + Args: + base_anchors (torch.Tensor): The base anchors of a feature grid. + featmap_size (tuple[int]): Size of the feature maps. + stride (tuple[int], optional): Stride of the feature map in order + (w, h). Defaults to (16, 16). + device (str, optional): Device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. 
+ """ + + warnings.warn( + '``single_level_grid_anchors`` would be deprecated soon. ' + 'Please use ``single_level_grid_priors`` ') + + # keep featmap_size as Tensor instead of int, so that we + # can covert to ONNX correctly + feat_h, feat_w = featmap_size + shift_x = torch.arange(0, feat_w, device=device) * stride[0] + shift_y = torch.arange(0, feat_h, device=device) * stride[1] + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + shifts = shifts.type_as(base_anchors) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, featmap_sizes, pad_shape, device='cuda'): + """Generate valid flags of anchors in multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels. + pad_shape (tuple): The padded shape of the image. + device (str): Device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of anchors in multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + self.num_base_anchors[i], + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size, + valid_size, + num_base_anchors, + device='cuda'): + """Generate the valid flags of anchor in a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + num_base_anchors (int): The number of base anchors. + device (str, optional): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. 
+ """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand(valid.size(0), + num_base_anchors).contiguous().view(-1) + return valid + + def __repr__(self): + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}octave_base_scale=' + repr_str += f'{self.octave_base_scale},\n' + repr_str += f'{indent_str}scales_per_octave=' + repr_str += f'{self.scales_per_octave},\n' + repr_str += f'{indent_str}num_levels={self.num_levels}\n' + repr_str += f'{indent_str}centers={self.centers},\n' + repr_str += f'{indent_str}center_offset={self.center_offset})' + return repr_str + + +@PRIOR_GENERATORS.register_module() +class SSDAnchorGenerator(AnchorGenerator): + """Anchor generator for SSD. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + basesize_ratio_range (tuple(float)): Ratio range of anchors. + input_size (int): Size of feature map, 300 for SSD300, + 512 for SSD512. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. It is always set to be False in SSD. + """ + + def __init__(self, + strides, + ratios, + basesize_ratio_range, + input_size=300, + scale_major=True): + assert len(strides) == len(ratios) + assert is_tuple_of(basesize_ratio_range, float) + + self.strides = [_pair(stride) for stride in strides] + self.input_size = input_size + self.centers = [(stride[0] / 2., stride[1] / 2.) 
+ for stride in self.strides] + self.basesize_ratio_range = basesize_ratio_range + + # calculate anchor ratios and sizes + min_ratio, max_ratio = basesize_ratio_range + min_ratio = int(min_ratio * 100) + max_ratio = int(max_ratio * 100) + step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2)) + min_sizes = [] + max_sizes = [] + for ratio in range(int(min_ratio), int(max_ratio) + 1, step): + min_sizes.append(int(self.input_size * ratio / 100)) + max_sizes.append(int(self.input_size * (ratio + step) / 100)) + if self.input_size == 300: + if basesize_ratio_range[0] == 0.15: # SSD300 COCO + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + elif basesize_ratio_range[0] == 0.2: # SSD300 VOC + min_sizes.insert(0, int(self.input_size * 10 / 100)) + max_sizes.insert(0, int(self.input_size * 20 / 100)) + else: + raise ValueError( + 'basesize_ratio_range[0] should be either 0.15' + 'or 0.2 when input_size is 300, got ' + f'{basesize_ratio_range[0]}.') + elif self.input_size == 512: + if basesize_ratio_range[0] == 0.1: # SSD512 COCO + min_sizes.insert(0, int(self.input_size * 4 / 100)) + max_sizes.insert(0, int(self.input_size * 10 / 100)) + elif basesize_ratio_range[0] == 0.15: # SSD512 VOC + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + else: + raise ValueError('basesize_ratio_range[0] should be either 0.1' + 'or 0.15 when input_size is 512, got' + f' {basesize_ratio_range[0]}.') + else: + raise ValueError('Only support 300 or 512 in SSDAnchorGenerator' + f', got {self.input_size}.') + + anchor_ratios = [] + anchor_scales = [] + for k in range(len(self.strides)): + scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])] + anchor_ratio = [1.] + for r in ratios[k]: + anchor_ratio += [1 / r, r] # 4 or 6 ratio + anchor_ratios.append(torch.Tensor(anchor_ratio)) + anchor_scales.append(torch.Tensor(scales)) + + self.base_sizes = min_sizes + self.scales = anchor_scales + self.ratios = anchor_ratios + self.scale_major = scale_major + self.center_offset = 0 + self.base_anchors = self.gen_base_anchors() + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
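+
+        Example (an illustrative sketch with an SSD300-like configuration;
+            the strides/ratios below are placeholders, not a tuned setting):
+            >>> gen = SSDAnchorGenerator(
+            ...     strides=[8, 16, 32, 64, 100, 300],
+            ...     ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]],
+            ...     basesize_ratio_range=(0.15, 0.9))
+            >>> [base.size(0) for base in gen.gen_base_anchors()]
+            [4, 6, 6, 6, 4, 4]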
+ """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + base_anchors = self.gen_single_level_base_anchors( + base_size, + scales=self.scales[i], + ratios=self.ratios[i], + center=self.centers[i]) + indices = list(range(len(self.ratios[i]))) + indices.insert(1, len(indices)) + base_anchors = torch.index_select(base_anchors, 0, + torch.LongTensor(indices)) + multi_level_base_anchors.append(base_anchors) + return multi_level_base_anchors + + def __repr__(self): + """str: a string that describes the module""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}strides={self.strides},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}scale_major={self.scale_major},\n' + repr_str += f'{indent_str}input_size={self.input_size},\n' + repr_str += f'{indent_str}scales={self.scales},\n' + repr_str += f'{indent_str}ratios={self.ratios},\n' + repr_str += f'{indent_str}num_levels={self.num_levels},\n' + repr_str += f'{indent_str}base_sizes={self.base_sizes},\n' + repr_str += f'{indent_str}basesize_ratio_range=' + repr_str += f'{self.basesize_ratio_range})' + return repr_str + + +@PRIOR_GENERATORS.register_module() +class LegacyAnchorGenerator(AnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + Note: + Difference to the V2.0 anchor generator: + + 1. The center offset of V1.x anchors are set to be 0.5 rather than 0. + 2. The width/height are minused by 1 when calculating the anchors' \ + centers and corners to meet the V1.x coordinate system. + 3. The anchors' corners are quantized. + + Args: + strides (list[int] | list[tuple[int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int]): The basic sizes of anchors in multiple levels. + If None is given, strides will be used to generate base_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. It a list of float + is given, this list will be used to shift the centers of anchors. + center_offset (float): The offset of center in propotion to anchors' + width and height. By default it is 0.5 in V2.0 but it should be 0.5 + in v1.x models. + + Examples: + >>> from mmcv.core import LegacyAnchorGenerator + >>> self = LegacyAnchorGenerator( + >>> [16], [1.], [1.], [9], center_offset=0.5) + >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu') + >>> print(all_anchors) + [tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]])] + """ + + def gen_single_level_base_anchors(self, + base_size, + scales, + ratios, + center=None): + """Generate base anchors of a single level. 
+ + Note: + The width/height of anchors are minused by 1 when calculating \ + the centers and corners to meet the V1.x coordinate system. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height. + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature map. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * (w - 1) + y_center = self.center_offset * (h - 1) + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * (ws - 1), y_center - 0.5 * (hs - 1), + x_center + 0.5 * (ws - 1), y_center + 0.5 * (hs - 1) + ] + base_anchors = torch.stack(base_anchors, dim=-1).round() + + return base_anchors + + +@PRIOR_GENERATORS.register_module() +class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator` + can be found in `LegacyAnchorGenerator`. + """ + + def __init__(self, + strides, + ratios, + basesize_ratio_range, + input_size=300, + scale_major=True): + super(LegacySSDAnchorGenerator, + self).__init__(strides, ratios, basesize_ratio_range, input_size, + scale_major) + self.centers = [((stride - 1) / 2., (stride - 1) / 2.) + for stride in strides] + self.base_anchors = self.gen_base_anchors() + + +@PRIOR_GENERATORS.register_module() +class YOLOAnchorGenerator(AnchorGenerator): + """Anchor generator for YOLO. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + base_sizes (list[list[tuple[int, int]]]): The basic sizes + of anchors in multiple levels. + """ + + def __init__(self, strides, base_sizes): + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2., stride[1] / 2.) + for stride in self.strides] + self.base_sizes = [] + num_anchor_per_level = len(base_sizes[0]) + for base_sizes_per_level in base_sizes: + assert num_anchor_per_level == len(base_sizes_per_level) + self.base_sizes.append( + [_pair(base_size) for base_size in base_sizes_per_level]) + self.base_anchors = self.gen_base_anchors() + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.base_sizes) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_sizes_per_level in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors(base_sizes_per_level, + center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, base_sizes_per_level, center=None): + """Generate base anchors of a single level. 
+ + Args: + base_sizes_per_level (list[tuple[int, int]]): Basic sizes of + anchors. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + x_center, y_center = center + base_anchors = [] + for base_size in base_sizes_per_level: + w, h = base_size + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchor = torch.Tensor([ + x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w, + y_center + 0.5 * h + ]) + base_anchors.append(base_anchor) + base_anchors = torch.stack(base_anchors, dim=0) + + return base_anchors + + def responsible_flags(self, featmap_sizes, gt_bboxes, device='cuda'): + """Generate responsible anchor flags of grid cells in multiple scales. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in multiple + feature levels. + gt_bboxes (Tensor): Ground truth boxes, shape (n, 4). + device (str): Device where the anchors will be put on. + + Return: + list(torch.Tensor): responsible flags of anchors in multiple level + """ + assert self.num_levels == len(featmap_sizes) + multi_level_responsible_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + flags = self.single_level_responsible_flags( + featmap_sizes[i], + gt_bboxes, + anchor_stride, + self.num_base_anchors[i], + device=device) + multi_level_responsible_flags.append(flags) + return multi_level_responsible_flags + + def single_level_responsible_flags(self, + featmap_size, + gt_bboxes, + stride, + num_base_anchors, + device='cuda'): + """Generate the responsible flags of anchor in a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps. + gt_bboxes (Tensor): Ground truth boxes, shape (n, 4). + stride (tuple(int)): stride of current level + num_base_anchors (int): The number of base anchors. + device (str, optional): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. 
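+
+        Example:
+            >>> # Illustrative only (not from the original docstring): one gt
+            >>> # box centred at (20, 20) on a 2x2 grid with stride (16, 16)
+            >>> # marks only the bottom-right cell and its 3 base anchors.
+            >>> gt_bboxes = torch.Tensor([[10., 10., 30., 30.]])
+            >>> flags = self.single_level_responsible_flags(
+            >>>     (2, 2), gt_bboxes, (16, 16), 3, device='cpu')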
+ """ + feat_h, feat_w = featmap_size + gt_bboxes_cx = ((gt_bboxes[:, 0] + gt_bboxes[:, 2]) * 0.5).to(device) + gt_bboxes_cy = ((gt_bboxes[:, 1] + gt_bboxes[:, 3]) * 0.5).to(device) + gt_bboxes_grid_x = torch.floor(gt_bboxes_cx / stride[0]).long() + gt_bboxes_grid_y = torch.floor(gt_bboxes_cy / stride[1]).long() + + # row major indexing + gt_bboxes_grid_idx = gt_bboxes_grid_y * feat_w + gt_bboxes_grid_x + + responsible_grid = torch.zeros( + feat_h * feat_w, dtype=torch.uint8, device=device) + responsible_grid[gt_bboxes_grid_idx] = 1 + + responsible_grid = responsible_grid[:, None].expand( + responsible_grid.size(0), num_base_anchors).contiguous().view(-1) + return responsible_grid diff --git a/mmcv/core/anchor/builder.py b/mmcv/core/anchor/builder.py new file mode 100644 index 0000000..d53a624 --- /dev/null +++ b/mmcv/core/anchor/builder.py @@ -0,0 +1,18 @@ +import warnings + +from mmcv.utils import Registry, build_from_cfg + +PRIOR_GENERATORS = Registry('Generator for anchors and points') + +ANCHOR_GENERATORS = PRIOR_GENERATORS + + +def build_prior_generator(cfg, default_args=None): + return build_from_cfg(cfg, PRIOR_GENERATORS, default_args) + + +def build_anchor_generator(cfg, default_args=None): + warnings.warn( + '``build_anchor_generator`` would be deprecated soon, please use ' + '``build_prior_generator`` ') + return build_prior_generator(cfg, default_args=default_args) diff --git a/mmcv/core/anchor/point_generator.py b/mmcv/core/anchor/point_generator.py new file mode 100644 index 0000000..7b11a85 --- /dev/null +++ b/mmcv/core/anchor/point_generator.py @@ -0,0 +1,241 @@ +import numpy as np +import torch +from torch.nn.modules.utils import _pair + +from .builder import PRIOR_GENERATORS + + +@PRIOR_GENERATORS.register_module() +class PointGenerator: + + def _meshgrid(self, x, y, row_major=True): + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_points(self, featmap_size, stride=16, device='cuda'): + feat_h, feat_w = featmap_size + shift_x = torch.arange(0., feat_w, device=device) * stride + shift_y = torch.arange(0., feat_h, device=device) * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + stride = shift_x.new_full((shift_xx.shape[0], ), stride) + shifts = torch.stack([shift_xx, shift_yy, stride], dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, featmap_size, valid_size, device='cuda'): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + +@PRIOR_GENERATORS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. 
+ """ + + def __init__(self, strides, offset=0.5): + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self): + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, x, y, row_major=True): + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_priors(self, featmap_sizes, device='cuda', with_stride=False): + """Generate grid points of multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + device (str): The device where the anchors will be put on. + with_stride (bool): Whether to concatenate the stride to + the last dimension of points. + + Return: + list[torch.Tensor]: Points of multiple feature levels. + The sizes of each tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + assert self.num_levels == len(featmap_sizes) + multi_level_priors = [] + for i in range(self.num_levels): + priors = self.single_level_grid_priors( + featmap_sizes[i], + level_idx=i, + device=device, + with_stride=with_stride) + multi_level_priors.append(priors) + return multi_level_priors + + def single_level_grid_priors(self, + featmap_size, + level_idx, + device='cuda', + with_stride=False): + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0., feat_w, device=device) + + self.offset) * stride_w + shift_y = (torch.arange(0., feat_h, device=device) + + self.offset) * stride_h + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + stride_w = shift_xx.new_full((len(shift_xx), ), stride_w) + stride_h = shift_xx.new_full((len(shift_yy), ), stride_h) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], + dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, featmap_sizes, pad_shape, device='cuda'): + """Generate valid flags of points of multiple feature levels. 
+ + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str): The device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), + (valid_feat_h, valid_feat_w), + device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, + featmap_size, + valid_size, + device='cuda'): + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str, optional): The device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + def sparse_priors(self, + prior_idxs, + featmap_size, + level_idx, + dtype=torch.float32, + device='cuda'): + """Generate sparse points according to the ``prior_idxs``. + + Args: + prior_idxs (Tensor): The index of corresponding anchors + in the feature map. + featmap_size (tuple[int]): feature map size arrange as (w, h). + level_idx (int): The level index of corresponding feature + map. + dtype (obj:`torch.dtype`): Date type of points. Defaults to + ``torch.float32``. + device (obj:`torch.device`): The device where the points is + located. + Returns: + Tensor: Anchor with shape (N, 2), N should be equal to + the length of ``prior_idxs``. And last dimension + 2 represent (coord_x, coord_y). + """ + height, width = featmap_size + x = (prior_idxs % width + self.offset) * self.strides[level_idx][0] + y = ((prior_idxs // width) % height + + self.offset) * self.strides[level_idx][1] + prioris = torch.stack([x, y], 1).to(dtype) + prioris = prioris.to(device) + return prioris diff --git a/mmcv/core/anchor/utils.py b/mmcv/core/anchor/utils.py new file mode 100644 index 0000000..ab9b53f --- /dev/null +++ b/mmcv/core/anchor/utils.py @@ -0,0 +1,71 @@ +import torch + + +def images_to_levels(target, num_levels): + """Convert targets by image to targets by feature level. + + [target_img0, target_img1] -> [target_level0, target_level1, ...] + """ + target = torch.stack(target, 0) + level_targets = [] + start = 0 + for n in num_levels: + end = start + n + # level_targets.append(target[:, start:end].squeeze(0)) + level_targets.append(target[:, start:end]) + start = end + return level_targets + + +def anchor_inside_flags(flat_anchors, + valid_flags, + img_shape, + allowed_border=0): + """Check whether the anchors are inside the border. 
+ + Args: + flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4). + valid_flags (torch.Tensor): An existing valid flags of anchors. + img_shape (tuple(int)): Shape of current image. + allowed_border (int, optional): The border to allow the valid anchor. + Defaults to 0. + + Returns: + torch.Tensor: Flags indicating whether the anchors are inside a \ + valid range. + """ + img_h, img_w = img_shape[:2] + if allowed_border >= 0: + inside_flags = valid_flags & \ + (flat_anchors[:, 0] >= -allowed_border) & \ + (flat_anchors[:, 1] >= -allowed_border) & \ + (flat_anchors[:, 2] < img_w + allowed_border) & \ + (flat_anchors[:, 3] < img_h + allowed_border) + else: + inside_flags = valid_flags + return inside_flags + + +def calc_region(bbox, ratio, featmap_size=None): + """Calculate a proportional bbox region. + + The bbox center are fixed and the new h' and w' is h * ratio and w * ratio. + + Args: + bbox (Tensor): Bboxes to calculate regions, shape (n, 4). + ratio (float): Ratio of the output region. + featmap_size (tuple): Feature map size used for clipping the boundary. + + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) diff --git a/mmcv/core/bbox/__init__.py b/mmcv/core/bbox/__init__.py new file mode 100644 index 0000000..3399260 --- /dev/null +++ b/mmcv/core/bbox/__init__.py @@ -0,0 +1,13 @@ +from .builder import build_assigner, build_bbox_coder, build_sampler +from .samplers import (PseudoSampler) +from .structures import (get_box_type, limit_period, + mono_cam_box2vis, points_cam2img, xywhr2xyxyr) +from .transforms import (bbox2distance, bbox2result, bbox2roi, + bbox_cxcywh_to_xyxy, bbox_flip, bbox_mapping, + bbox_mapping_back, bbox_rescale, bbox_xyxy_to_cxcywh, + distance2bbox, roi2bbox, + bbox3d2result, bbox3d2roi, bbox3d_mapping_back) +from .iou_calculators import (BboxOverlaps2D, bbox_overlaps, AxisAlignedBboxOverlaps3D, + BboxOverlaps3D, BboxOverlapsNearest3D, + axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) \ No newline at end of file diff --git a/mmcv/core/bbox/assigners/__init__.py b/mmcv/core/bbox/assigners/__init__.py new file mode 100644 index 0000000..9c6d438 --- /dev/null +++ b/mmcv/core/bbox/assigners/__init__.py @@ -0,0 +1,10 @@ +from .hungarian_assigner import HungarianAssigner +from .hungarian_assigner_3d import HungarianAssigner3D +from .hungarian_assigner_3d_track import HungarianAssigner3DTrack +from .base_assigner import BaseAssigner +from .map_hungarian_assigner_3d import MapHungarianAssigner3D + +# __all__ = [ +# 'HungarianAssigner', + +# ] diff --git a/mmcv/core/bbox/assigners/assign_result.py b/mmcv/core/bbox/assigners/assign_result.py new file mode 100644 index 0000000..f3b9543 --- /dev/null +++ b/mmcv/core/bbox/assigners/assign_result.py @@ -0,0 +1,204 @@ +import torch + +from mmcv.utils import util_mixins + + +class AssignResult(util_mixins.NiceRepr): + """Stores assignments between predicted and truth boxes. 
+ + Attributes: + num_gts (int): the number of truth boxes considered when computing this + assignment + + gt_inds (LongTensor): for each predicted box indicates the 1-based + index of the assigned truth box. 0 means unassigned and -1 means + ignore. + + max_overlaps (FloatTensor): the iou between the predicted box and its + assigned truth box. + + labels (None | LongTensor): If specified, for each predicted box + indicates the category label of the assigned truth box. + + Example: + >>> # An assign result between 4 predicted boxes and 9 true boxes + >>> # where only two boxes were assigned. + >>> num_gts = 9 + >>> max_overlaps = torch.LongTensor([0, .5, .9, 0]) + >>> gt_inds = torch.LongTensor([-1, 1, 2, 0]) + >>> labels = torch.LongTensor([0, 3, 4, 0]) + >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + >>> # Force addition of gt labels (when adding gt as proposals) + >>> new_labels = torch.LongTensor([3, 4, 5]) + >>> self.add_gt_(new_labels) + >>> print(str(self)) # xdoctest: +IGNORE_WANT + + """ + + def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): + self.num_gts = num_gts + self.gt_inds = gt_inds + self.max_overlaps = max_overlaps + self.labels = labels + # Interface for possible user-defined properties + self._extra_properties = {} + + @property + def num_preds(self): + """int: the number of predictions in this assignment""" + return len(self.gt_inds) + + def set_extra_property(self, key, value): + """Set user-defined new property.""" + assert key not in self.info + self._extra_properties[key] = value + + def get_extra_property(self, key): + """Get user-defined property.""" + return self._extra_properties.get(key, None) + + @property + def info(self): + """dict: a dictionary of info about the object""" + basic_info = { + 'num_gts': self.num_gts, + 'num_preds': self.num_preds, + 'gt_inds': self.gt_inds, + 'max_overlaps': self.max_overlaps, + 'labels': self.labels, + } + basic_info.update(self._extra_properties) + return basic_info + + def __nice__(self): + """str: a "nice" summary string describing this assign result""" + parts = [] + parts.append(f'num_gts={self.num_gts!r}') + if self.gt_inds is None: + parts.append(f'gt_inds={self.gt_inds!r}') + else: + parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}') + if self.max_overlaps is None: + parts.append(f'max_overlaps={self.max_overlaps!r}') + else: + parts.append('max_overlaps.shape=' + f'{tuple(self.max_overlaps.shape)!r}') + if self.labels is None: + parts.append(f'labels={self.labels!r}') + else: + parts.append(f'labels.shape={tuple(self.labels.shape)!r}') + return ', '.join(parts) + + @classmethod + def random(cls, **kwargs): + """Create random AssignResult for tests or debugging. + + Args: + num_preds: number of predicted boxes + num_gts: number of true boxes + p_ignore (float): probability of a predicted box assigned to an + ignored truth + p_assigned (float): probability of a predicted box not being + assigned + p_use_label (float | bool): with labels or not + rng (None | int | numpy.random.RandomState): seed or state + + Returns: + :obj:`AssignResult`: Randomly generated assign results. 
+ + Example: + >>> from mmcv.core.bbox.assigners.assign_result import * # NOQA + >>> self = AssignResult.random() + >>> print(self.info) + """ + from mmcv.core.bbox import demodata + rng = demodata.ensure_rng(kwargs.get('rng', None)) + + num_gts = kwargs.get('num_gts', None) + num_preds = kwargs.get('num_preds', None) + p_ignore = kwargs.get('p_ignore', 0.3) + p_assigned = kwargs.get('p_assigned', 0.7) + p_use_label = kwargs.get('p_use_label', 0.5) + num_classes = kwargs.get('p_use_label', 3) + + if num_gts is None: + num_gts = rng.randint(0, 8) + if num_preds is None: + num_preds = rng.randint(0, 16) + + if num_gts == 0: + max_overlaps = torch.zeros(num_preds, dtype=torch.float32) + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + if p_use_label is True or p_use_label < rng.rand(): + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = None + else: + import numpy as np + # Create an overlap for each predicted box + max_overlaps = torch.from_numpy(rng.rand(num_preds)) + + # Construct gt_inds for each predicted box + is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned) + # maximum number of assignments constraints + n_assigned = min(num_preds, min(num_gts, is_assigned.sum())) + + assigned_idxs = np.where(is_assigned)[0] + rng.shuffle(assigned_idxs) + assigned_idxs = assigned_idxs[0:n_assigned] + assigned_idxs.sort() + + is_assigned[:] = 0 + is_assigned[assigned_idxs] = True + + is_ignore = torch.from_numpy( + rng.rand(num_preds) < p_ignore) & is_assigned + + gt_inds = torch.zeros(num_preds, dtype=torch.int64) + + true_idxs = np.arange(num_gts) + rng.shuffle(true_idxs) + true_idxs = torch.from_numpy(true_idxs) + gt_inds[is_assigned] = true_idxs[:n_assigned] + + gt_inds = torch.from_numpy( + rng.randint(1, num_gts + 1, size=num_preds)) + gt_inds[is_ignore] = -1 + gt_inds[~is_assigned] = 0 + max_overlaps[~is_assigned] = 0 + + if p_use_label is True or p_use_label < rng.rand(): + if num_classes == 0: + labels = torch.zeros(num_preds, dtype=torch.int64) + else: + labels = torch.from_numpy( + # remind that we set FG labels to [0, num_class-1] + # since mmcv v2.0 + # BG cat_id: num_class + rng.randint(0, num_classes, size=num_preds)) + labels[~is_assigned] = 0 + else: + labels = None + + self = cls(num_gts, gt_inds, max_overlaps, labels) + return self + + def add_gt_(self, gt_labels): + """Add ground truth as assigned results. 
+ + Args: + gt_labels (torch.Tensor): Labels of gt boxes + """ + self_inds = torch.arange( + 1, len(gt_labels) + 1, dtype=torch.long, device=gt_labels.device) + self.gt_inds = torch.cat([self_inds, self.gt_inds]) + + self.max_overlaps = torch.cat( + [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps]) + + if self.labels is not None: + self.labels = torch.cat([gt_labels, self.labels]) diff --git a/mmcv/core/bbox/assigners/base_assigner.py b/mmcv/core/bbox/assigners/base_assigner.py new file mode 100644 index 0000000..1ff0160 --- /dev/null +++ b/mmcv/core/bbox/assigners/base_assigner.py @@ -0,0 +1,9 @@ +from abc import ABCMeta, abstractmethod + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" + + @abstractmethod + def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None): + """Assign boxes to either a ground truth boxes or a negative boxes.""" diff --git a/mmcv/core/bbox/assigners/hungarian_assigner.py b/mmcv/core/bbox/assigners/hungarian_assigner.py new file mode 100644 index 0000000..e10cc14 --- /dev/null +++ b/mmcv/core/bbox/assigners/hungarian_assigner.py @@ -0,0 +1,145 @@ +import torch + +from ..builder import BBOX_ASSIGNERS +from ..match_costs import build_match_cost +from ..transforms import bbox_cxcywh_to_xyxy +from .assign_result import AssignResult +from .base_assigner import BaseAssigner + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + img_meta, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. 
The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + img_meta (dict): Meta information for current image. + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + img_h, img_w, _ = img_meta['img_shape'] + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + + # 2. compute the weighted costs + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + normalize_gt_bboxes = gt_bboxes / factor + reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes) + # regression iou cost, defaultly giou is used in official DETR. + bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor + iou_cost = self.iou_cost(bboxes, gt_bboxes) + # weighted sum of above three costs + cost = cls_cost + reg_cost + iou_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. 
assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) diff --git a/mmcv/core/bbox/assigners/hungarian_assigner_3d.py b/mmcv/core/bbox/assigners/hungarian_assigner_3d.py new file mode 100755 index 0000000..86d6cf2 --- /dev/null +++ b/mmcv/core/bbox/assigners/hungarian_assigner_3d.py @@ -0,0 +1,136 @@ +import torch + +from mmcv.core.bbox.builder import BBOX_ASSIGNERS +from .assign_result import AssignResult +from .base_assigner import BaseAssigner +from mmcv.core.bbox.match_costs import build_match_cost +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.core.bbox.util import normalize_bbox + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner3D(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', weight=0.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. 
+ Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + + # 2. compute the weighted costs + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + + normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) + + reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) + + # weighted sum of above two costs + cost = cls_cost + reg_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) \ No newline at end of file diff --git a/mmcv/core/bbox/assigners/hungarian_assigner_3d_track.py b/mmcv/core/bbox/assigners/hungarian_assigner_3d_track.py new file mode 100644 index 0000000..792d0f9 --- /dev/null +++ b/mmcv/core/bbox/assigners/hungarian_assigner_3d_track.py @@ -0,0 +1,122 @@ +import numpy as np +import torch + +from mmcv.core.bbox.builder import BBOX_ASSIGNERS +from mmcv.core.bbox.assigners.base_assigner import BaseAssigner +from mmcv.core.bbox.match_costs import build_match_cost +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner3DTrack(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. 
After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return (None, None) + # 2. compute the weighted costs + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + reg_cost = self.reg_cost(bbox_pred[:, :8], gt_bboxes[:, :8]) + # weighted sum of above three costs + cost = cls_cost + reg_cost + + cost = torch.nan_to_num(cost) + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + cost = np.nan_to_num(cost) + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. 
assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + + return (matched_row_inds, matched_col_inds) + diff --git a/mmcv/core/bbox/assigners/map_hungarian_assigner_3d.py b/mmcv/core/bbox/assigners/map_hungarian_assigner_3d.py new file mode 100644 index 0000000..2bfc278 --- /dev/null +++ b/mmcv/core/bbox/assigners/map_hungarian_assigner_3d.py @@ -0,0 +1,162 @@ +import torch +import torch.nn.functional as F + +from mmcv.core.bbox.builder import BBOX_ASSIGNERS +from mmcv.core.bbox.assigners.assign_result import AssignResult +from mmcv.core.bbox.assigners.base_assigner import BaseAssigner +from mmcv.core.bbox.match_costs import build_match_cost +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.core.bbox.util import normalize_bbox +from mmcv.models.vad_utils.map_utils import ( + normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox +) + +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + +@BBOX_ASSIGNERS.register_module() +class MapHungarianAssigner3D(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. + Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', weight=0.0), + pts_cost=dict(type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0), + pc_range=None): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.pts_cost = build_match_cost(pts_cost) + self.pc_range = pc_range + + def assign(self, + bbox_pred, + cls_pred, + pts_pred, + gt_bboxes, + gt_labels, + gt_pts, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + 1. assign every prediction to -1 + 2. 
compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + assert bbox_pred.shape[-1] == 4, \ + 'Only support bbox pred shape is 4 dims' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels), None + + # 2. compute the weighted costs + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + + normalized_gt_bboxes = normalize_2d_bbox(gt_bboxes, self.pc_range) + # normalized_gt_bboxes = gt_bboxes + # import pdb;pdb.set_trace() + reg_cost = self.reg_cost(bbox_pred[:, :4], normalized_gt_bboxes[:, :4]) + + _, num_orders, num_pts_per_gtline, num_coords = gt_pts.shape + normalized_gt_pts = normalize_2d_pts(gt_pts, self.pc_range) + num_pts_per_predline = pts_pred.size(1) + if num_pts_per_predline != num_pts_per_gtline: + pts_pred_interpolated = F.interpolate(pts_pred.permute(0,2,1),size=(num_pts_per_gtline), + mode='linear', align_corners=True) + pts_pred_interpolated = pts_pred_interpolated.permute(0,2,1).contiguous() + else: + pts_pred_interpolated = pts_pred + # num_q, num_pts, 2 <-> num_gt, num_pts, 2 + pts_cost_ordered = self.pts_cost(pts_pred_interpolated, normalized_gt_pts) + pts_cost_ordered = pts_cost_ordered.view(num_bboxes, num_gts, num_orders) + pts_cost, order_index = torch.min(pts_cost_ordered, 2) + + bboxes = denormalize_2d_bbox(bbox_pred, self.pc_range) + iou_cost = self.iou_cost(bboxes, gt_bboxes) + # weighted sum of above three costs + cost = cls_cost + reg_cost + iou_cost + pts_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + + # 4. 
assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels), order_index \ No newline at end of file diff --git a/mmcv/core/bbox/box_np_ops.py b/mmcv/core/bbox/box_np_ops.py new file mode 100644 index 0000000..6740e4e --- /dev/null +++ b/mmcv/core/bbox/box_np_ops.py @@ -0,0 +1,896 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# TODO: clean the functions in this file and move the APIs into box structures +# in the future + +import numba +import numpy as np + + +def camera_to_lidar(points, r_rect, velo2cam): + """Convert points in camera coordinate to lidar coordinate. + + Args: + points (np.ndarray, shape=[N, 3]): Points in camera coordinate. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Points in lidar coordinate. + """ + points_shape = list(points.shape[0:-1]) + if points.shape[-1] == 3: + points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) + lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) + return lidar_points[..., :3] + + +def box_camera_to_lidar(data, r_rect, velo2cam): + """Covert boxes in camera coordinate to lidar coordinate. + + Args: + data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. + """ + xyz = data[:, 0:3] + l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6] + r = data[:, 6:7] + xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) + return np.concatenate([xyz_lidar, w, l, h, r], axis=1) + + +def corners_nd(dims, origin=0.5): + """Generate relative box corners based on length per dim and origin point. + + Args: + dims (np.ndarray, shape=[N, ndim]): Array of length per dim + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5 + + Returns: + np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. + point layout example: (2d) x0y0, x0y1, x1y0, x1y1; + (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + where x0 < x1, y0 < y1, z0 < z1. + """ + ndim = int(dims.shape[1]) + corners_norm = np.stack( + np.unravel_index(np.arange(2**ndim), [2] * ndim), + axis=1).astype(dims.dtype) + # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 + # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 + # so need to convert to a format which is convenient to do other computing. + # for 2d boxes, format is clockwise start with minimum point + # for 3d boxes, please draw lines by your hand. 
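+    # e.g. (illustrative, not an original comment) for ndim=2, origin=0.5 and
+    # dims=[[1., 1.]], the reordering below gives the clockwise corners
+    # [[-.5, -.5], [-.5, .5], [.5, .5], [.5, -.5]] for that single box.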
+ if ndim == 2: + # generate clockwise box corners + corners_norm = corners_norm[[0, 1, 3, 2]] + elif ndim == 3: + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) + corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( + [1, 2**ndim, ndim]) + return corners + + +def rotation_2d(points, angles): + """Rotation 2d points based on origin point clockwise when angle positive. + + Args: + points (np.ndarray): Points to be rotated with shape \ + (N, point_size, 2). + angles (np.ndarray): Rotation angle with shape (N). + + Returns: + np.ndarray: Same shape as points. + """ + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) + return np.einsum('aij,jka->aik', points, rot_mat_T) + + +def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): + """Convert kitti locations, dimensions and angles to corners. + format: center(xy), dims(xy), angles(clockwise when positive) + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 2). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): origin point relate to + smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Corners with the shape of (N, 4, 2). + """ + # 'length' in kitti format is in x axis. + # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 4, 2] + if angles is not None: + corners = rotation_2d(corners, angles) + corners += centers.reshape([-1, 1, 2]) + return corners + + +@numba.jit(nopython=True) +def depth_to_points(depth, trunc_pixel): + """Convert depth map to points. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + + Returns: + np.ndarray: Points in camera coordinates. + """ + num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) + points = np.zeros((num_pts, 3), dtype=depth.dtype) + x = np.array([0, 0, 1], dtype=depth.dtype) + k = 0 + for i in range(trunc_pixel, depth.shape[0]): + for j in range(depth.shape[1]): + if depth[i, j] > 0.1: + x = np.array([j, i, 1], dtype=depth.dtype) + points[k] = x * depth[i, j] + k += 1 + return points + + +def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): + """Convert depth map to points in lidar coordinate. + + Args: + depth (np.array, shape=[H, W]): Depth map which + the row of [0~`trunc_pixel`] are truncated. + trunc_pixel (int): The number of truncated row. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + + Returns: + np.ndarray: Points in lidar coordinates. + """ + pts = depth_to_points(depth, trunc_pixel) + points_shape = list(pts.shape[0:-1]) + points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) + points = points @ np.linalg.inv(P2.T) + lidar_points = camera_to_lidar(points, r_rect, velo2cam) + return lidar_points + + +def rotation_3d_in_axis(points, angles, axis=0): + """Rotate points in specific axis. 
+ + Args: + points (np.ndarray, shape=[N, point_size, 3]]): + angles (np.ndarray, shape=[N]]): + axis (int, optional): Axis to rotate at. Defaults to 0. + + Returns: + np.ndarray: Rotated points. + """ + # points: [N, point_size, 3] + rot_sin = np.sin(angles) + rot_cos = np.cos(angles) + ones = np.ones_like(rot_cos) + zeros = np.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros], + [rot_sin, zeros, rot_cos]]) + elif axis == 2 or axis == -1: + rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros], + [rot_sin, rot_cos, zeros], [zeros, zeros, ones]]) + elif axis == 0: + rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin], + [zeros, rot_sin, rot_cos], [ones, zeros, zeros]]) + else: + raise ValueError('axis should in range') + + return np.einsum('aij,jka->aik', points, rot_mat_T) + + +def center_to_corner_box3d(centers, + dims, + angles=None, + origin=(0.5, 1.0, 0.5), + axis=1): + """Convert kitti locations, dimensions and angles to corners. + + Args: + centers (np.ndarray): Locations in kitti label file with shape (N, 3). + dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). + angles (np.ndarray, optional): Rotation_y in kitti label file with + shape (N). Defaults to None. + origin (list or array or float, optional): Origin point relate to + smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) + in lidar. Defaults to (0.5, 1.0, 0.5). + axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. + Defaults to 1. + + Returns: + np.ndarray: Corners with the shape of (N, 8, 3). + """ + # 'length' in kitti format is in x axis. + # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) + # center in kitti format is [0.5, 1.0, 0.5] in xyz. + corners = corners_nd(dims, origin=origin) + # corners: [N, 8, 3] + if angles is not None: + corners = rotation_3d_in_axis(corners, angles, axis=axis) + corners += centers.reshape([-1, 1, 3]) + return corners + + +@numba.jit(nopython=True) +def box2d_to_corner_jit(boxes): + """Convert box2d to corner. + + Args: + boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. + + Returns: + box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. + """ + num_box = boxes.shape[0] + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( + 1, 4, 2) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) + for i in range(num_box): + rot_sin = np.sin(boxes[i, -1]) + rot_cos = np.cos(boxes[i, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] + return box_corners + + +@numba.njit +def corner_to_standup_nd_jit(boxes_corner): + """Convert boxes_corner to aligned (min-max) boxes. + + Args: + boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. + + Returns: + np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. 
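+
+    Example:
+        >>> # Illustrative only (not from the original docstring).
+        >>> corners = np.array([[[0., 0.], [0., 2.], [2., 2.], [2., 0.]]])
+        >>> corner_to_standup_nd_jit(corners)
+        array([[0., 0., 2., 2.]])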
+ """ + num_boxes = boxes_corner.shape[0] + ndim = boxes_corner.shape[-1] + result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) + for i in range(num_boxes): + for j in range(ndim): + result[i, j] = np.min(boxes_corner[i, :, j]) + for j in range(ndim): + result[i, j + ndim] = np.max(boxes_corner[i, :, j]) + return result + + +@numba.jit(nopython=True) +def corner_to_surfaces_3d_jit(corners): + """Convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). + """ + # box_corners: [N, 8, 3], must from corner functions in this module + num_boxes = corners.shape[0] + surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) + corner_idxes = np.array([ + 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 + ]).reshape(6, 4) + for i in range(num_boxes): + for j in range(6): + for k in range(4): + surfaces[i, j, k] = corners[i, corner_idxes[j, k]] + return surfaces + + +def rotation_points_single_angle(points, angle, axis=0): + """Rotate points with a single angle. + + Args: + points (np.ndarray, shape=[N, 3]]): + angle (np.ndarray, shape=[1]]): + axis (int, optional): Axis to rotate at. Defaults to 0. + + Returns: + np.ndarray: Rotated points. + """ + # points: [N, 3] + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + if axis == 1: + rot_mat_T = np.array( + [[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]], + dtype=points.dtype) + elif axis == 2 or axis == -1: + rot_mat_T = np.array( + [[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]], + dtype=points.dtype) + elif axis == 0: + rot_mat_T = np.array( + [[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]], + dtype=points.dtype) + else: + raise ValueError('axis should in range') + + return points @ rot_mat_T, rot_mat_T + + +def points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points in camera coordinates to image coordinates. + + Args: + points_3d (np.ndarray): Points in shape (N, 3) + proj_mat (np.ndarray): Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + np.ndarray: Points in image coordinates with shape [N, 2]. + """ + points_shape = list(points_3d.shape) + points_shape[-1] = 1 + + assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ + f' matrix should be 2 instead of {len(proj_mat.shape)}.' + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( + d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ + f' ({d1}*{d2}) is not supported.' + if d1 == 3: + proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype) + proj_mat_expanded[:d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1) + point_2d = points_4 @ proj_mat.T + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + + if with_depth: + points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]], + axis=-1) + return points_2d_depth + + return point_2d_res + + +def box3d_to_bbox(box3d, P2): + """Convert box3d in camera coordinates to bbox in image coordinates. + + Args: + box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. + P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. + + Returns: + np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. 
+ """ + box_corners = center_to_corner_box3d( + box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) + box_corners_in_image = points_cam2img(box_corners, P2) + # box_corners_in_image: [N, 8, 2] + minxy = np.min(box_corners_in_image, axis=1) + maxxy = np.max(box_corners_in_image, axis=1) + bbox = np.concatenate([minxy, maxxy], axis=1) + return bbox + + +def corner_to_surfaces_3d(corners): + """convert 3d box corners from corner function above to surfaces that + normal vectors all direct to internal. + + Args: + corners (np.ndarray): 3D box corners with shape of (N, 8, 3). + + Returns: + np.ndarray: Surfaces with the shape of (N, 6, 4, 3). + """ + # box_corners: [N, 8, 3], must from corner functions in this module + surfaces = np.array([ + [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], + [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], + [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], + [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], + [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], + [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], + ]).transpose([2, 0, 1, 3]) + return surfaces + + +def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): + """Check points in rotated bbox and return indicces. + + Args: + points (np.ndarray, shape=[N, 3+dim]): Points to query. + rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. + z_axis (int, optional): Indicate which axis is height. + Defaults to 2. + origin (tuple[int], optional): Indicate the position of + box center. Defaults to (0.5, 0.5, 0). + + Returns: + np.ndarray, shape=[N, M]: Indices of points in each box. + """ + # TODO: this function is different from PointCloud3D, be careful + # when start to use nuscene, check the input + rbbox_corners = center_to_corner_box3d( + rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) + surfaces = corner_to_surfaces_3d(rbbox_corners) + indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) + return indices + + +def minmax_to_corner_2d(minmax_box): + """Convert minmax box to corners2d. + + Args: + minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. + + Returns: + np.ndarray: 2d corners of boxes + """ + ndim = minmax_box.shape[-1] // 2 + center = minmax_box[..., :ndim] + dims = minmax_box[..., ndim:] - center + return center_to_corner_box2d(center, dims, origin=0.0) + + +def limit_period(val, offset=0.5, period=np.pi): + """Limit the value into a period for periodic function. + + Args: + val (np.ndarray): The value to be converted. + offset (float, optional): Offset to set the value range. \ + Defaults to 0.5. + period (float, optional): Period of the value. Defaults to np.pi. + + Returns: + torch.Tensor: Value in the range of \ + [-offset * period, (1-offset) * period] + """ + return val - np.floor(val / period + offset) * period + + +def create_anchors_3d_range(feature_size, + anchor_range, + sizes=((1.6, 3.9, 1.56), ), + rotations=(0, np.pi / 2), + dtype=np.float32): + """Create anchors 3d by range. + + Args: + feature_size (list[float] | tuple[float]): Feature map size. It is + either a list of a tuple of [D, H, W](in order of z, y, and x). + anchor_range (torch.Tensor | list[float]): Range of anchors with + shape [6]. The order is consistent with that of anchors, i.e., + (x_min, y_min, z_min, x_max, y_max, z_max). + sizes (list[list] | np.ndarray | torch.Tensor, optional): + Anchor size with shape [N, 3], in order of x, y, z. + Defaults to ((1.6, 3.9, 1.56), ). 
+ rotations (list[float] | np.ndarray | torch.Tensor, optional): + Rotations of anchors in a single feature grid. + Defaults to (0, np.pi / 2). + dtype (type, optional): Data type. Default to np.float32. + + Returns: + np.ndarray: Range based anchors with shape of \ + (*feature_size, num_sizes, num_rots, 7). + """ + anchor_range = np.array(anchor_range, dtype) + z_centers = np.linspace( + anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) + y_centers = np.linspace( + anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) + x_centers = np.linspace( + anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) + sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) + rotations = np.array(rotations, dtype=dtype) + rets = np.meshgrid( + x_centers, y_centers, z_centers, rotations, indexing='ij') + tile_shape = [1] * 5 + tile_shape[-2] = int(sizes.shape[0]) + for i in range(len(rets)): + rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) + rets[i] = rets[i][..., np.newaxis] # for concat + sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) + tile_size_shape = list(rets[0].shape) + tile_size_shape[3] = 1 + sizes = np.tile(sizes, tile_size_shape) + rets.insert(3, sizes) + ret = np.concatenate(rets, axis=-1) + return np.transpose(ret, [2, 1, 0, 3, 4, 5]) + + +def center_to_minmax_2d(centers, dims, origin=0.5): + """Center to minmax. + + Args: + centers (np.ndarray): Center points. + dims (np.ndarray): Dimensions. + origin (list or array or float, optional): Origin point relate + to smallest point. Defaults to 0.5. + + Returns: + np.ndarray: Minmax points. + """ + if origin == 0.5: + return np.concatenate([centers - dims / 2, centers + dims / 2], + axis=-1) + corners = center_to_corner_box2d(centers, dims, origin=origin) + return corners[:, [0, 2]].reshape([-1, 4]) + + +def rbbox2d_to_near_bbox(rbboxes): + """convert rotated bbox to nearest 'standing' or 'lying' bbox. + + Args: + rbboxes (np.ndarray): Rotated bboxes with shape of \ + (N, 5(x, y, xdim, ydim, rad)). + + Returns: + np.ndarray: Bounding boxes with the shpae of + (N, 4(xmin, ymin, xmax, ymax)). + """ + rots = rbboxes[..., -1] + rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) + cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] + bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) + bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) + return bboxes + + +@numba.jit(nopython=True) +def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): + """Calculate box iou. Note that jit version runs ~10x faster than the + box_overlaps function in mmdet3d.core.evaluation. + + Args: + boxes (np.ndarray): Input bounding boxes with shape of (N, 4). + query_boxes (np.ndarray): Query boxes with shape of (K, 4). + mode (str, optional): IoU mode. Defaults to 'iou'. + eps (float, optional): Value added to denominator. Defaults to 0. + + Returns: + np.ndarray: Overlap between boxes and query_boxes + with the shape of [N, K]. 
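+
+    Example:
+        A small sketch with assumed axis-aligned (x1, y1, x2, y2) boxes;
+        only the output shape is shown here:
+
+        >>> boxes = np.array([[0., 0., 2., 2.], [1., 1., 3., 3.]])
+        >>> query_boxes = np.array([[0., 0., 1., 1.]])
+        >>> iou_jit(boxes, query_boxes).shape
+        (2, 1)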
+ """ + N = boxes.shape[0] + K = query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * + (query_boxes[k, 3] - query_boxes[k, 1] + eps)) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0]) + eps) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1]) + eps) + if ih > 0: + if mode == 'iou': + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps) + box_area - + iw * ih) + else: + ua = ((boxes[n, 2] - boxes[n, 0] + eps) * + (boxes[n, 3] - boxes[n, 1] + eps)) + overlaps[n, k] = iw * ih / ua + return overlaps + + +def projection_matrix_to_CRT_kitti(proj): + """Split projection matrix of kitti. + + P = C @ [R|T] + C is upper triangular matrix, so we need to inverse CR and use QR + stable for all kitti camera projection matrix. + + Args: + proj (p.array, shape=[4, 4]): Intrinsics of camera. + + Returns: + tuple[np.ndarray]: Splited matrix of C, R and T. + """ + + CR = proj[0:3, 0:3] + CT = proj[0:3, 3] + RinvCinv = np.linalg.inv(CR) + Rinv, Cinv = np.linalg.qr(RinvCinv) + C = np.linalg.inv(Cinv) + R = np.linalg.inv(Rinv) + T = Cinv @ CT + return C, R, T + + +def remove_outside_points(points, rect, Trv2c, P2, image_shape): + """Remove points which are outside of image. + + Args: + points (np.ndarray, shape=[N, 3+dims]): Total points. + rect (np.ndarray, shape=[4, 4]): Matrix to project points in + specific camera coordinate (e.g. CAM2) to CAM0. + Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in + camera coordinate to lidar coordinate. + P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. + image_shape (list[int]): Shape of image. + + Returns: + np.ndarray, shape=[N, 3+dims]: Filtered points. + """ + # 5x faster than remove_outside_points_v1(2ms vs 10ms) + C, R, T = projection_matrix_to_CRT_kitti(P2) + image_bbox = [0, 0, image_shape[1], image_shape[0]] + frustum = get_frustum(image_bbox, C) + frustum -= T + frustum = np.linalg.inv(R) @ frustum.T + frustum = camera_to_lidar(frustum.T, rect, Trv2c) + frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) + indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) + points = points[indices.reshape([-1])] + return points + + +def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): + """Get frustum corners in camera coordinates. + + Args: + bbox_image (list[int]): box in image coordinates. + C (np.ndarray): Intrinsics. + near_clip (float, optional): Nearest distance of frustum. + Defaults to 0.001. + far_clip (float, optional): Farthest distance of frustum. + Defaults to 100. + + Returns: + np.ndarray, shape=[8, 3]: coordinates of frustum corners. 
+ """ + fku = C[0, 0] + fkv = -C[1, 1] + u0v0 = C[0:2, 2] + z_points = np.array( + [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] + b = bbox_image + box_corners = np.array( + [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], + dtype=C.dtype) + near_box_corners = (box_corners - u0v0) / np.array( + [fku / near_clip, -fkv / near_clip], dtype=C.dtype) + far_box_corners = (box_corners - u0v0) / np.array( + [fku / far_clip, -fkv / far_clip], dtype=C.dtype) + ret_xy = np.concatenate([near_box_corners, far_box_corners], + axis=0) # [8, 2] + ret_xyz = np.concatenate([ret_xy, z_points], axis=1) + return ret_xyz + + +def surface_equ_3d(polygon_surfaces): + """ + + Args: + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + + Returns: + tuple: normal vector and its direction. + """ + # return [a, b, c], d in ax+by+cz+d=0 + # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] + surface_vec = polygon_surfaces[:, :, :2, :] - \ + polygon_surfaces[:, :, 1:3, :] + # normal_vec: [..., 3] + normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) + # print(normal_vec.shape, points[..., 0, :].shape) + # d = -np.inner(normal_vec, points[..., 0, :]) + d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) + return normal_vec, -d + + +@numba.njit +def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, + num_surfaces): + """ + Args: + points (np.ndarray): Input points with shape of (num_points, 3). + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + normal_vec (np.ndarray): Normal vector of polygon_surfaces. + d (int): Directions of normal vector. + num_surfaces (np.ndarray): Number of surfaces a polygon contains + shape of (num_polygon). + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. + """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + ret = np.ones((num_points, num_polygons), dtype=np.bool_) + sign = 0.0 + for i in range(num_points): + for j in range(num_polygons): + for k in range(max_num_surfaces): + if k > num_surfaces[j]: + break + sign = ( + points[i, 0] * normal_vec[j, k, 0] + + points[i, 1] * normal_vec[j, k, 1] + + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) + if sign >= 0: + ret[i, j] = False + break + return ret + + +def points_in_convex_polygon_3d_jit(points, + polygon_surfaces, + num_surfaces=None): + """Check points is in 3d convex polygons. + + Args: + points (np.ndarray): Input points with shape of (num_points, 3). + polygon_surfaces (np.ndarray): Polygon surfaces with shape of + (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). + All surfaces' normal vector must direct to internal. + Max_num_points_of_surface must at least 3. + num_surfaces (np.ndarray, optional): Number of surfaces a polygon + contains shape of (num_polygon). Defaults to None. + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. 
+ """ + max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] + # num_points = points.shape[0] + num_polygons = polygon_surfaces.shape[0] + if num_surfaces is None: + num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) + normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) + # normal_vec: [num_polygon, max_num_surfaces, 3] + # d: [num_polygon, max_num_surfaces] + return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, + normal_vec, d, num_surfaces) + + +@numba.jit +def points_in_convex_polygon_jit(points, polygon, clockwise=True): + """Check points is in 2d convex polygons. True when point in polygon. + + Args: + points (np.ndarray): Input points with the shape of [num_points, 2]. + polygon (np.ndarray): Input polygon with the shape of + [num_polygon, num_points_of_polygon, 2]. + clockwise (bool, optional): Indicate polygon is clockwise. Defaults + to True. + + Returns: + np.ndarray: Result matrix with the shape of [num_points, num_polygon]. + """ + # first convert polygon to directed lines + num_points_of_polygon = polygon.shape[1] + num_points = points.shape[0] + num_polygons = polygon.shape[0] + # if clockwise: + # vec1 = polygon - polygon[:, [num_points_of_polygon - 1] + + # list(range(num_points_of_polygon - 1)), :] + # else: + # vec1 = polygon[:, [num_points_of_polygon - 1] + + # list(range(num_points_of_polygon - 1)), :] - polygon + # vec1: [num_polygon, num_points_of_polygon, 2] + vec1 = np.zeros((2), dtype=polygon.dtype) + ret = np.zeros((num_points, num_polygons), dtype=np.bool_) + success = True + cross = 0.0 + for i in range(num_points): + for j in range(num_polygons): + success = True + for k in range(num_points_of_polygon): + if clockwise: + vec1 = polygon[j, k] - polygon[j, k - 1] + else: + vec1 = polygon[j, k - 1] - polygon[j, k] + cross = vec1[1] * (polygon[j, k, 0] - points[i, 0]) + cross -= vec1[0] * (polygon[j, k, 1] - points[i, 1]) + if cross >= 0: + success = False + break + ret[i, j] = success + return ret + + +def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): + """Convert kitti center boxes to corners. + + 7 -------- 4 + /| /| + 6 -------- 5 . + | | | | + . 3 -------- 0 + |/ |/ + 2 -------- 1 + + Args: + boxes3d (np.ndarray): Boxes with shape of (N, 7) + [x, y, z, w, l, h, ry] in LiDAR coords, see the definition of ry + in KITTI dataset. + bottom_center (bool, optional): Whether z is on the bottom center + of object. Defaults to True. + + Returns: + np.ndarray: Box corners with the shape of [N, 8, 3]. + """ + boxes_num = boxes3d.shape[0] + w, l, h = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] + x_corners = np.array( + [w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2., w / 2.], + dtype=np.float32).T + y_corners = np.array( + [-l / 2., -l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2.], + dtype=np.float32).T + if bottom_center: + z_corners = np.zeros((boxes_num, 8), dtype=np.float32) + z_corners[:, 4:8] = h.reshape(boxes_num, 1).repeat(4, axis=1) # (N, 8) + else: + z_corners = np.array([ + -h / 2., -h / 2., -h / 2., -h / 2., h / 2., h / 2., h / 2., h / 2. 
+ ], + dtype=np.float32).T + + ry = boxes3d[:, 6] + zeros, ones = np.zeros( + ry.size, dtype=np.float32), np.ones( + ry.size, dtype=np.float32) + rot_list = np.array([[np.cos(ry), -np.sin(ry), zeros], + [np.sin(ry), np.cos(ry), zeros], [zeros, zeros, + ones]]) # (3, 3, N) + R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) + + temp_corners = np.concatenate((x_corners.reshape( + -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)), + axis=2) # (N, 8, 3) + rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) + x_corners = rotated_corners[:, :, 0] + y_corners = rotated_corners[:, :, 1] + z_corners = rotated_corners[:, :, 2] + + x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] + + x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8) + y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) + z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) + + corners = np.concatenate( + (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), + axis=2) + + return corners.astype(np.float32) diff --git a/mmcv/core/bbox/builder.py b/mmcv/core/bbox/builder.py new file mode 100644 index 0000000..682683b --- /dev/null +++ b/mmcv/core/bbox/builder.py @@ -0,0 +1,20 @@ +from mmcv.utils import Registry, build_from_cfg + +BBOX_ASSIGNERS = Registry('bbox_assigner') +BBOX_SAMPLERS = Registry('bbox_sampler') +BBOX_CODERS = Registry('bbox_coder') + + +def build_assigner(cfg, **default_args): + """Builder of box assigner.""" + return build_from_cfg(cfg, BBOX_ASSIGNERS, default_args) + + +def build_sampler(cfg, **default_args): + """Builder of box sampler.""" + return build_from_cfg(cfg, BBOX_SAMPLERS, default_args) + + +def build_bbox_coder(cfg, **default_args): + """Builder of box coder.""" + return build_from_cfg(cfg, BBOX_CODERS, default_args) diff --git a/mmcv/core/bbox/coder/__init__.py b/mmcv/core/bbox/coder/__init__.py new file mode 100644 index 0000000..ab2e6be --- /dev/null +++ b/mmcv/core/bbox/coder/__init__.py @@ -0,0 +1,11 @@ +from .nms_free_coder import NMSFreeCoder +from .detr3d_track_coder import DETRTrack3DCoder +from mmcv.core.bbox import build_bbox_coder +from .fut_nms_free_coder import CustomNMSFreeCoder +from .map_nms_free_coder import MapNMSFreeCoder + +__all__ = [ + 'build_bbox_coder', + 'NMSFreeCoder', 'DETRTrack3DCoder', + 'CustomNMSFreeCoder','MapNMSFreeCoder' +] diff --git a/mmcv/core/bbox/coder/base_bbox_coder.py b/mmcv/core/bbox/coder/base_bbox_coder.py new file mode 100644 index 0000000..cf0b34c --- /dev/null +++ b/mmcv/core/bbox/coder/base_bbox_coder.py @@ -0,0 +1,17 @@ +from abc import ABCMeta, abstractmethod + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base bounding box coder.""" + + def __init__(self, **kwargs): + pass + + @abstractmethod + def encode(self, bboxes, gt_bboxes): + """Encode deltas between bboxes and ground truth boxes.""" + + @abstractmethod + def decode(self, bboxes, bboxes_pred): + """Decode the predicted bboxes according to prediction and base + boxes.""" diff --git a/mmcv/core/bbox/coder/detr3d_track_coder.py b/mmcv/core/bbox/coder/detr3d_track_coder.py new file mode 100755 index 0000000..1c0e017 --- /dev/null +++ b/mmcv/core/bbox/coder/detr3d_track_coder.py @@ -0,0 +1,156 @@ +import torch + +from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder +from mmcv.core.bbox.builder import BBOX_CODERS +from ..util import normalize_bbox, denormalize_bbox +from ..structures.utils import xywhr2xyxyr +from mmcv.ops.iou3d import nms_bev + +@BBOX_CODERS.register_module() +class DETRTrack3DCoder(BaseBBoxCoder): + """Bbox 
coder for DETR3D. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + post_center_range=None, + max_num=100, + score_threshold=0.2, + num_classes=7, + with_nms=False, + iou_thres=0.3): + + self.pc_range = pc_range + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + self.with_nms = with_nms + self.nms_iou_thres = iou_thres + + def encode(self): + pass + + def decode_single(self, cls_scores, bbox_preds, + track_scores, obj_idxes, with_mask=True, img_metas=None): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + + Returns: + list[dict]: Decoded boxes. + """ + max_num = self.max_num + max_num = min(cls_scores.size(0), self.max_num) + + cls_scores = cls_scores.sigmoid() + _, indexs = cls_scores.max(dim=-1) + labels = indexs % self.num_classes + + _, bbox_index = track_scores.topk(max_num) + + labels = labels[bbox_index] + bbox_preds = bbox_preds[bbox_index] + track_scores = track_scores[bbox_index] + obj_idxes = obj_idxes[bbox_index] + + scores = track_scores + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = track_scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + + if self.with_nms: + boxes_for_nms = xywhr2xyxyr(img_metas[0]['box_type_3d'](final_box_preds[:, :], 9).bev) + nms_mask = boxes_for_nms.new_zeros(boxes_for_nms.shape[0]) > 0 + # print(self.nms_iou_thres) + try: + selected = nms_bev( + boxes_for_nms, + final_scores, + thresh=self.nms_iou_thres) + nms_mask[selected] = True + except: + print('Error', boxes_for_nms, final_scores) + nms_mask = boxes_for_nms.new_ones(boxes_for_nms.shape[0]) > 0 + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + if not with_mask: + mask = torch.ones_like(mask) > 0 + if self.with_nms: + mask &= nms_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + labels = final_preds[mask] + track_scores = track_scores[mask] + obj_idxes = obj_idxes[mask] + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels, + 'track_scores': track_scores, + 'obj_idxes': obj_idxes, + 'bbox_index': bbox_index, + 'mask': mask + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts, with_mask=True, img_metas=None): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. 
Note \ + cls_out_channels should includes background. + Note: before sigmoid! + bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + + Returns: + list[dict]: Decoded boxes. + """ + all_cls_scores = preds_dicts['cls_scores'] + all_bbox_preds = preds_dicts['bbox_preds'] + track_scores = preds_dicts['track_scores'] + obj_idxes = preds_dicts['obj_idxes'] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + # bs size = 1 + predictions_list.append(self.decode_single( + all_cls_scores, all_bbox_preds, + track_scores, obj_idxes, with_mask, img_metas)) + #for i in range(batch_size): + # predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) + return predictions_list diff --git a/mmcv/core/bbox/coder/fut_nms_free_coder.py b/mmcv/core/bbox/coder/fut_nms_free_coder.py new file mode 100644 index 0000000..b8a8a95 --- /dev/null +++ b/mmcv/core/bbox/coder/fut_nms_free_coder.py @@ -0,0 +1,127 @@ +import torch + +from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder +from mmcv.core.bbox.builder import BBOX_CODERS +from mmcv.core.bbox.util import denormalize_bbox +import numpy as np + + +@BBOX_CODERS.register_module() +class CustomNMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds, traj_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
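+
+        Example:
+            Shape sketch with assumed sizes (300 queries, 10 classes, 6
+            future steps); the point-cloud ranges are illustrative only:
+
+            >>> import torch
+            >>> self = CustomNMSFreeCoder(
+            ...     pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
+            ...     post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            ...     max_num=100, num_classes=10)
+            >>> out = self.decode_single(
+            ...     torch.rand(300, 10), torch.rand(300, 10), torch.rand(300, 6, 2))
+            >>> out['trajs'].shape
+            torch.Size([100, 6, 2])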
+ """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + traj_preds = traj_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + final_traj_preds = traj_preds + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + labels = final_preds[mask] + trajs = final_traj_preds[mask] + + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels, + 'trajs': trajs + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. + """ + all_cls_scores = preds_dicts['all_cls_scores'][-1] + all_bbox_preds = preds_dicts['all_bbox_preds'][-1] + all_traj_preds = preds_dicts['all_traj_preds'][-1] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i], all_traj_preds[i])) + return predictions_list + diff --git a/mmcv/core/bbox/coder/map_nms_free_coder.py b/mmcv/core/bbox/coder/map_nms_free_coder.py new file mode 100644 index 0000000..f20d300 --- /dev/null +++ b/mmcv/core/bbox/coder/map_nms_free_coder.py @@ -0,0 +1,126 @@ +import torch + +from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder +from mmcv.core.bbox.builder import BBOX_CODERS +from mmcv.models.vad_utils.map_utils import ( + denormalize_2d_pts, denormalize_2d_bbox +) + + +@BBOX_CODERS.register_module() +class MapNMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. 
Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds, pts_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + pts_preds (Tensor): + Shape [num_query, fixed_num_pts, 2] + Returns: + list[dict]: Decoded boxes. + """ + max_num = self.max_num + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + pts_preds = pts_preds[bbox_index] + + final_box_preds = denormalize_2d_bbox(bbox_preds, self.pc_range) + final_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) #num_q,num_p,2 + # final_box_preds = bbox_preds + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :4] >= self.post_center_range[:4]).all(1) + mask &= (final_box_preds[..., :4] <= self.post_center_range[4:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + pts = final_pts_preds[mask] + labels = final_preds[mask] + predictions_dict = { + 'map_bboxes': boxes3d, + 'map_scores': scores, + 'map_labels': labels, + 'map_pts': pts, + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
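+
+        Note:
+            ``preds_dicts`` is expected to provide 'map_all_cls_scores',
+            'map_all_bbox_preds' and 'map_all_pts_preds'; only the last
+            decoder layer ([-1]) is decoded, and each returned dict holds
+            'map_bboxes', 'map_scores', 'map_labels' and 'map_pts'.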
+ """ + all_cls_scores = preds_dicts['map_all_cls_scores'][-1] + all_bbox_preds = preds_dicts['map_all_bbox_preds'][-1] + all_pts_preds = preds_dicts['map_all_pts_preds'][-1] + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i],all_pts_preds[i])) + return predictions_list \ No newline at end of file diff --git a/mmcv/core/bbox/coder/nms_free_coder.py b/mmcv/core/bbox/coder/nms_free_coder.py new file mode 100755 index 0000000..95430bc --- /dev/null +++ b/mmcv/core/bbox/coder/nms_free_coder.py @@ -0,0 +1,124 @@ +import torch + +from mmcv.core.bbox.coder.base_bbox_coder import BaseBBoxCoder +from mmcv.core.bbox.builder import BBOX_CODERS +from mmcv.core.bbox.util import denormalize_bbox +import numpy as np + + +@BBOX_CODERS.register_module() +class NMSFreeCoder(BaseBBoxCoder): + """Bbox coder for NMS-free detector. + Args: + pc_range (list[float]): Range of point cloud. + post_center_range (list[float]): Limit of the center. + Default: None. + max_num (int): Max number to be kept. Default: 100. + score_threshold (float): Threshold to filter boxes based on score. + Default: None. + code_size (int): Code size of bboxes. Default: 9 + """ + + def __init__(self, + pc_range, + voxel_size=None, + post_center_range=None, + max_num=100, + score_threshold=None, + num_classes=10): + self.pc_range = pc_range + self.voxel_size = voxel_size + self.post_center_range = post_center_range + self.max_num = max_num + self.score_threshold = score_threshold + self.num_classes = num_classes + + def encode(self): + + pass + + def decode_single(self, cls_scores, bbox_preds): + """Decode bboxes. + Args: + cls_scores (Tensor): Outputs from the classification head, \ + shape [num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + bbox_preds (Tensor): Outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
+ """ + max_num = min(self.max_num, cls_scores.shape[0]) + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % self.num_classes + bbox_index = indexs // self.num_classes + bbox_preds = bbox_preds[bbox_index] + + final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) + final_scores = scores + final_preds = labels + + # use score threshold + if self.score_threshold is not None: + thresh_mask = final_scores > self.score_threshold + tmp_score = self.score_threshold + while thresh_mask.sum() == 0: + tmp_score *= 0.9 + if tmp_score < 0.01: + thresh_mask = final_scores > -1 + break + thresh_mask = final_scores >= tmp_score + + if self.post_center_range is not None: + self.post_center_range = torch.tensor( + self.post_center_range, device=scores.device) + mask = (final_box_preds[..., :3] >= + self.post_center_range[:3]).all(1) + mask &= (final_box_preds[..., :3] <= + self.post_center_range[3:]).all(1) + + if self.score_threshold: + mask &= thresh_mask + + boxes3d = final_box_preds[mask] + scores = final_scores[mask] + + labels = final_preds[mask] + predictions_dict = { + 'bboxes': boxes3d, + 'scores': scores, + 'labels': labels, + 'mask': mask, + 'bbox_index': bbox_index + } + + else: + raise NotImplementedError( + 'Need to reorganize output as a batch, only ' + 'support post_center_range is not None for now!') + return predictions_dict + + def decode(self, preds_dicts): + """Decode bboxes. + Args: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + Returns: + list[dict]: Decoded boxes. 
+ """ + all_cls_scores = preds_dicts['all_cls_scores'][-1] + all_bbox_preds = preds_dicts['all_bbox_preds'][-1] + + batch_size = all_cls_scores.size()[0] + predictions_list = [] + for i in range(batch_size): + predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) + return predictions_list + diff --git a/mmcv/core/bbox/iou_calculators/__init__.py b/mmcv/core/bbox/iou_calculators/__init__.py new file mode 100644 index 0000000..3c13f41 --- /dev/null +++ b/mmcv/core/bbox/iou_calculators/__init__.py @@ -0,0 +1,11 @@ +from .builder import build_iou_calculator +from .iou2d_calculator import BboxOverlaps2D, bbox_overlaps +from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, + BboxOverlapsNearest3D, + axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, + bbox_overlaps_nearest_3d) + +__all__ = ['build_iou_calculator', 'BboxOverlaps2D', 'bbox_overlaps', + 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', + 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', + 'axis_aligned_bbox_overlaps_3d'] diff --git a/mmcv/core/bbox/iou_calculators/builder.py b/mmcv/core/bbox/iou_calculators/builder.py new file mode 100644 index 0000000..09094d7 --- /dev/null +++ b/mmcv/core/bbox/iou_calculators/builder.py @@ -0,0 +1,8 @@ +from mmcv.utils import Registry, build_from_cfg + +IOU_CALCULATORS = Registry('IoU calculator') + + +def build_iou_calculator(cfg, default_args=None): + """Builder of IoU calculator.""" + return build_from_cfg(cfg, IOU_CALCULATORS, default_args) diff --git a/mmcv/core/bbox/iou_calculators/iou2d_calculator.py b/mmcv/core/bbox/iou_calculators/iou2d_calculator.py new file mode 100644 index 0000000..25f2b46 --- /dev/null +++ b/mmcv/core/bbox/iou_calculators/iou2d_calculator.py @@ -0,0 +1,260 @@ +import torch + +from .builder import IOU_CALCULATORS + + +def cast_tensor_type(x, scale=1., dtype=None): + if dtype == 'fp16': + # scale is for preventing overflows + x = (x / scale).half() + return x + + +def fp16_clamp(x, min=None, max=None): + if not x.is_cuda and x.dtype == torch.float16: + # clamp for cpu float16, tensor fp16 has no clamp implementation + return x.float().clamp(min, max).half() + + return x.clamp(min, max) + + +@IOU_CALCULATORS.register_module() +class BboxOverlaps2D: + """2D Overlaps (e.g. IoUs, GIoUs) Calculator.""" + + def __init__(self, scale=1., dtype=None): + self.scale = scale + self.dtype = dtype + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor): bboxes have shape (m, 4) in + format, or shape (m, 5) in format. + bboxes2 (Tensor): bboxes have shape (m, 4) in + format, shape (m, 5) in format, or be + empty. If ``is_aligned `` is ``True``, then m and n must be + equal. + mode (str): "iou" (intersection over union), "iof" (intersection + over foreground), or "giou" (generalized intersection over + union). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. 
+ + Returns: + Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) + """ + assert bboxes1.size(-1) in [0, 4, 5] + assert bboxes2.size(-1) in [0, 4, 5] + if bboxes2.size(-1) == 5: + bboxes2 = bboxes2[..., :4] + if bboxes1.size(-1) == 5: + bboxes1 = bboxes1[..., :4] + + if self.dtype == 'fp16': + # change tensor type to save cpu and cuda memory and keep speed + bboxes1 = cast_tensor_type(bboxes1, self.scale, self.dtype) + bboxes2 = cast_tensor_type(bboxes2, self.scale, self.dtype) + overlaps = bbox_overlaps(bboxes1, bboxes2, mode, is_aligned) + if not overlaps.is_cuda and overlaps.dtype == torch.float16: + # resume cpu float32 + overlaps = overlaps.float() + return overlaps + + return bbox_overlaps(bboxes1, bboxes2, mode, is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + f'(' \ + f'scale={self.scale}, dtype={self.dtype})' + return repr_str + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): + """Calculate overlap between two set of bboxes. + + FP16 Contributed by https://github.com/open-mmlab/mmdetection/pull/4889 + Note: + Assume bboxes1 is M x 4, bboxes2 is N x 4, when mode is 'iou', + there are some new generated variable when calculating IOU + using bbox_overlaps function: + + 1) is_aligned is False + area1: M x 1 + area2: N x 1 + lt: M x N x 2 + rb: M x N x 2 + wh: M x N x 2 + overlap: M x N x 1 + union: M x N x 1 + ious: M x N x 1 + + Total memory: + S = (9 x N x M + N + M) * 4 Byte, + + When using FP16, we can reduce: + R = (9 x N x M + N + M) * 4 / 2 Byte + R large than (N + M) * 4 * 2 is always true when N and M >= 1. + Obviously, N + M <= N * M < 3 * N * M, when N >=2 and M >=2, + N + 1 < 3 * N, when N or M is 1. + + Given M = 40 (ground truth), N = 400000 (three anchor boxes + in per grid, FPN, R-CNNs), + R = 275 MB (one times) + + A special case (dense detection), M = 512 (ground truth), + R = 3516 MB = 3.43 GB + + When the batch size is B, reduce: + B x R + + Therefore, CUDA memory runs out frequently. + + Experiments on GeForce RTX 2080Ti (11019 MiB): + + | dtype | M | N | Use | Real | Ideal | + |:----:|:----:|:----:|:----:|:----:|:----:| + | FP32 | 512 | 400000 | 8020 MiB | -- | -- | + | FP16 | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB | + | FP32 | 40 | 400000 | 1540 MiB | -- | -- | + | FP16 | 40 | 400000 | 1264 MiB | 276MiB | 275 MiB | + + 2) is_aligned is True + area1: N x 1 + area2: N x 1 + lt: N x 2 + rb: N x 2 + wh: N x 2 + overlap: N x 1 + union: N x 1 + ious: N x 1 + + Total memory: + S = 11 x N * 4 Byte + + When using FP16, we can reduce: + R = 11 x N * 4 / 2 Byte + + So do the 'giou' (large than 'iou'). + + Time-wise, FP16 is generally faster than FP32. + + When gpu_assign_thr is not -1, it takes more time on cpu + but not reduce memory. + There, we can reduce half the memory and keep the speed. + + If ``is_aligned `` is ``False``, then calculate the overlaps between each + bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned + pair of bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (B, m, 4) in format or empty. + bboxes2 (Tensor): shape (B, n, 4) in format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned `` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union), "iof" (intersection over + foreground) or "giou" (generalized intersection over union). + Default "iou". + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. 
+ eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + + Example: + >>> empty = torch.empty(0, 4) + >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'iof', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( + bboxes1[..., 3] - bboxes1[..., 1]) + area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( + bboxes2[..., 3] - bboxes2[..., 1]) + + if is_aligned: + lt = torch.max(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] + rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] + + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2]) + enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:]) + else: + lt = torch.max(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) # [B, rows, cols, 2] + rb = torch.min(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] + + wh = fp16_clamp(rb - lt, min=0) + overlap = wh[..., 0] * wh[..., 1] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + else: + union = area1[..., None] + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :2], + bboxes2[..., None, :, :2]) + enclosed_rb = torch.max(bboxes1[..., :, None, 2:], + bboxes2[..., None, :, 2:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou', 'iof']: + return ious + # calculate gious + enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git a/mmcv/core/bbox/iou_calculators/iou3d_calculator.py b/mmcv/core/bbox/iou_calculators/iou3d_calculator.py new file mode 100644 index 0000000..5bc00b4 --- /dev/null +++ b/mmcv/core/bbox/iou_calculators/iou3d_calculator.py @@ -0,0 +1,321 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
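+"""IoU calculators for 3D boxes: nearest-BEV, volume-based 3D and
+axis-aligned variants, all registered in ``IOU_CALCULATORS``."""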
+import torch + +from .iou2d_calculator import bbox_overlaps +from .builder import IOU_CALCULATORS +from ..structures.utils import get_box_type + + +@IOU_CALCULATORS.register_module() +class BboxOverlapsNearest3D(object): + """Nearest 3D IoU Calculator. + + Note: + This IoU calculator first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + + Args: + coordinate (str): 'camera', 'lidar', or 'depth' coordinate system. + """ + + def __init__(self, coordinate='lidar'): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate nearest 3D IoU. + + Note: + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise it calculates the ious + between each aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+N) [x, y, z, h, w, l, ry, v]. + bboxes2 (torch.Tensor): shape (M, 7+N) [x, y, z, h, w, l, ry, v]. + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned. + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between \ + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is \ + ``False``, return shape is M. + """ + return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, + self.coordinate) + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +@IOU_CALCULATORS.register_module() +class BboxOverlaps3D(object): + """3D IoU Calculator. + + Args: + coordinate (str): The coordinate system, valid options are + 'camera', 'lidar', and 'depth'. + """ + + def __init__(self, coordinate): + assert coordinate in ['camera', 'lidar', 'depth'] + self.coordinate = coordinate + + def __call__(self, bboxes1, bboxes2, mode='iou'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculate the IoU of 3D boxes based on their volumes. + IoU calculator ``:class:BboxOverlaps3D`` uses this function to + calculate the actual 3D IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry]. + bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry]. + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \ + with shape (M, N) (aligned mode is not supported currently). + """ + return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) + + def __repr__(self): + """str: return a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(coordinate={self.coordinate}' + return repr_str + + +def bbox_overlaps_nearest_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + coordinate='lidar'): + """Calculate nearest 3D IoU. + + Note: + This function first finds the nearest 2D boxes in bird eye view + (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. + Ths IoU calculator :class:`BboxOverlapsNearest3D` uses this + function to calculate IoUs of boxes. + + If ``is_aligned`` is ``False``, then it calculates the ious between + each bbox of bboxes1 and bboxes2, otherwise the ious between each + aligned pair of bboxes1 and bboxes2. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v]. 
+ bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v]. + mode (str): "iou" (intersection over union) or iof + (intersection over foreground). + is_aligned (bool): Whether the calculation is aligned + + Return: + torch.Tensor: If ``is_aligned`` is ``True``, return ious between \ + bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is \ + ``False``, return shape is M. + """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + # Change the bboxes to bev + # box conversion and iou calculation in torch version on CUDA + # is 10x faster than that in numpy version + bboxes1_bev = bboxes1.nearest_bev + bboxes2_bev = bboxes2.nearest_bev + + ret = bbox_overlaps( + bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned) + return ret + + +def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): + """Calculate 3D IoU using cuda implementation. + + Note: + This function calculates the IoU of 3D boxes based on their volumes. + IoU calculator :class:`BboxOverlaps3D` uses this function to + calculate the actual IoUs of boxes. + + Args: + bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry]. + bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry]. + mode (str): "iou" (intersection over union) or + iof (intersection over foreground). + coordinate (str): 'camera' or 'lidar' coordinate system. + + Return: + torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \ + with shape (M, N) (aligned mode is not supported currently). + """ + assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 + + box_type, _ = get_box_type(coordinate) + + bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) + bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) + + return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) + + +@IOU_CALCULATORS.register_module() +class AxisAlignedBboxOverlaps3D(object): + """Axis-aligned 3D Overlaps (IoU) Calculator.""" + + def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): + """Calculate IoU between 2D bboxes. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + """ + assert bboxes1.size(-1) == bboxes2.size(-1) == 6 + return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, + is_aligned) + + def __repr__(self): + """str: a string describing the module""" + repr_str = self.__class__.__name__ + '()' + return repr_str + + +def axis_aligned_bbox_overlaps_3d(bboxes1, + bboxes2, + mode='iou', + is_aligned=False, + eps=1e-6): + """Calculate overlap between two set of axis aligned 3D bboxes. If + ``is_aligned`` is ``False``, then calculate the overlaps between each bbox + of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (B, m, 6) in + format or empty. + bboxes2 (Tensor): shape (B, n, 6) in + format or empty. + B indicates the batch dim, in shape (B1, B2, ..., Bn). + If ``is_aligned`` is ``True``, then m and n must be equal. 
+ mode (str): "iou" (intersection over union) or "giou" (generalized + intersection over union). + is_aligned (bool, optional): If True, then m and n must be equal. + Default False. + eps (float, optional): A value added to the denominator for numerical + stability. Default 1e-6. + + Returns: + Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 10, 10], + >>> [10, 10, 10, 20, 20, 20], + >>> [32, 32, 32, 38, 40, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 0, 10, 20, 20], + >>> [0, 10, 10, 10, 19, 20], + >>> [10, 10, 10, 20, 20, 20], + >>> ]) + >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2) + >>> assert overlaps.shape == (3, 3) + >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) + >>> assert overlaps.shape == (3, ) + Example: + >>> empty = torch.empty(0, 6) + >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' + # Either the boxes are empty or the length of boxes's last dimenstion is 6 + assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) + + # Batch dim must be the same + # Batch dim: (B1, B2, ... Bn) + assert bboxes1.shape[:-2] == bboxes2.shape[:-2] + batch_shape = bboxes1.shape[:-2] + + rows = bboxes1.size(-2) + cols = bboxes2.size(-2) + if is_aligned: + assert rows == cols + + if rows * cols == 0: + if is_aligned: + return bboxes1.new(batch_shape + (rows, )) + else: + return bboxes1.new(batch_shape + (rows, cols)) + + area1 = (bboxes1[..., 3] - + bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * ( + bboxes1[..., 5] - bboxes1[..., 2]) + area2 = (bboxes2[..., 3] - + bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * ( + bboxes2[..., 5] - bboxes2[..., 2]) + + if is_aligned: + lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3] + rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, 2] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1 + area2 - overlap + else: + union = area1 + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3]) + enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:]) + else: + lt = torch.max(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) # [B, rows, cols, 3] + rb = torch.min(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) # [B, rows, cols, 3] + + wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3] + overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] + + if mode in ['iou', 'giou']: + union = area1[..., None] + area2[..., None, :] - overlap + if mode == 'giou': + enclosed_lt = torch.min(bboxes1[..., :, None, :3], + bboxes2[..., None, :, :3]) + enclosed_rb = torch.max(bboxes1[..., :, None, 3:], + bboxes2[..., None, :, 3:]) + + eps = union.new_tensor([eps]) + union = torch.max(union, eps) + ious = overlap / union + if mode in ['iou']: + return ious + # calculate gious + enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) + enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2] + enclose_area = torch.max(enclose_area, eps) + gious = ious - (enclose_area - union) / enclose_area + return gious diff --git 
a/mmcv/core/bbox/match_costs/__init__.py b/mmcv/core/bbox/match_costs/__init__.py new file mode 100644 index 0000000..8fdb6d2 --- /dev/null +++ b/mmcv/core/bbox/match_costs/__init__.py @@ -0,0 +1,7 @@ +from .builder import build_match_cost +from .match_cost import BBoxL1Cost, ClassificationCost, FocalLossCost, IoUCost, BBox3DL1Cost, DiceCost + +__all__ = [ + 'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost', + 'FocalLossCost', 'BBox3DL1Cost', 'DiceCost' +] diff --git a/mmcv/core/bbox/match_costs/builder.py b/mmcv/core/bbox/match_costs/builder.py new file mode 100644 index 0000000..6894017 --- /dev/null +++ b/mmcv/core/bbox/match_costs/builder.py @@ -0,0 +1,8 @@ +from mmcv.utils import Registry, build_from_cfg + +MATCH_COST = Registry('Match Cost') + + +def build_match_cost(cfg, default_args=None): + """Builder of IoU calculator.""" + return build_from_cfg(cfg, MATCH_COST, default_args) diff --git a/mmcv/core/bbox/match_costs/match_cost.py b/mmcv/core/bbox/match_costs/match_cost.py new file mode 100644 index 0000000..b5a6a68 --- /dev/null +++ b/mmcv/core/bbox/match_costs/match_cost.py @@ -0,0 +1,324 @@ +import torch +import torch.nn.functional as F +from mmcv.core.bbox.iou_calculators import bbox_overlaps +from mmcv.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from .builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBoxL1Cost: + """BBoxL1Cost. + + Args: + weight (int | float, optional): loss_weight + box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN + + Examples: + >>> from mmcv.core.bbox.match_costs.match_cost import BBoxL1Cost + >>> import torch + >>> self = BBoxL1Cost() + >>> bbox_pred = torch.rand(1, 4) + >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(bbox_pred, gt_bboxes, factor) + tensor([[1.6172, 1.6422]]) + """ + + def __init__(self, weight=1., box_format='xyxy'): + self.weight = weight + assert box_format in ['xyxy', 'xywh'] + self.box_format = box_format + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + + Returns: + torch.Tensor: bbox_cost value with weight + """ + if self.box_format == 'xywh': + gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) + elif self.box_format == 'xyxy': + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + + +@MATCH_COST.register_module() +class FocalLossCost: + """FocalLossCost. 
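+
+    For every (query, gt) pair this cost takes the difference between the
+    positive and the negative focal-loss term on the sigmoid scores, i.e.
+    ``-alpha * (1 - p)^gamma * log(p) + (1 - alpha) * p^gamma * log(1 - p)``
+    evaluated at the ground-truth class column (see ``__call__`` below).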
+ + Args: + weight (int | float, optional): loss_weight + alpha (int | float, optional): focal_loss alpha + gamma (int | float, optional): focal_loss gamma + eps (float, optional): default 1e-12 + + Examples: + >>> from mmcv.core.bbox.match_costs.match_cost import FocalLossCost + >>> import torch + >>> self = FocalLossCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3236, -0.3364, -0.2699], + [-0.3439, -0.3209, -0.4807], + [-0.4099, -0.3795, -0.2929], + [-0.1950, -0.1207, -0.2626]]) + """ + + def __init__(self, weight=1., alpha=0.25, gamma=2, eps=1e-12): + self.weight = weight + self.alpha = alpha + self.gamma = gamma + self.eps = eps + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * ( + 1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( + 1 - cls_pred).pow(self.gamma) + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] + return cls_cost * self.weight + + +@MATCH_COST.register_module() +class ClassificationCost: + """ClsSoftmaxCost. + + Args: + weight (int | float, optional): loss_weight + + Examples: + >>> from mmcv.core.bbox.match_costs.match_cost import \ + ... ClassificationCost + >>> import torch + >>> self = ClassificationCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3430, -0.3525, -0.3045], + [-0.3077, -0.2931, -0.3992], + [-0.3664, -0.3455, -0.2881], + [-0.3343, -0.2701, -0.3956]]) + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + # Following the official DETR repo, contrary to the loss that + # NLL is used, we approximate it in 1 - cls_score[gt_label]. + # The 1 is a constant that doesn't change the matching, + # so it can be omitted. + cls_score = cls_pred.softmax(-1) + cls_cost = -cls_score[:, gt_labels] + return cls_cost * self.weight + + +@MATCH_COST.register_module() +class IoUCost: + """IoUCost. + + Args: + iou_mode (str, optional): iou mode such as 'iou' | 'giou' + weight (int | float, optional): loss weight + + Examples: + >>> from mmcv.core.bbox.match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, iou_mode='giou', weight=1.): + self.weight = weight + self.iou_mode = iou_mode + + def __call__(self, bboxes, gt_bboxes): + """ + Args: + bboxes (Tensor): Predicted boxes with unnormalized coordinates + (x1, y1, x2, y2). Shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
+ + Returns: + torch.Tensor: iou_cost value with weight + """ + # overlaps: [num_bboxes, num_gt] + overlaps = bbox_overlaps( + bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) + # The 1 is a constant that doesn't change the matching, so omitted. + iou_cost = -overlaps + return iou_cost * self.weight + + +@MATCH_COST.register_module() +class BBox3DL1Cost(object): + """BBox3DL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +#@weighted_loss +def smooth_l1_loss(pred, target, beta=1.0): + """Smooth L1 loss. + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + Returns: + torch.Tensor: Calculated loss + """ + assert beta > 0 + if target.numel() == 0: + return pred.sum() * 0 + + # assert pred.size() == target.size() + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + return loss.sum(-1) + + +@MATCH_COST.register_module() +class SmoothL1Cost(object): + """SmoothL1Cost. + Args: + weight (int | float, optional): loss weight + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, input, target): + """ + Args: + bboxes (Tensor): Predicted boxes with unnormalized coordinates + (x1, y1, x2, y2). Shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + + Returns: + torch.Tensor: iou_cost value with weight + """ + N1, C = input.shape + N2, C = target.shape + input = input.contiguous().view(N1, C)[:, None, :] + target = target.contiguous().view(N2, C)[None, :, :] + cost = smooth_l1_loss(input, target) + + return cost * self.weight + + +@MATCH_COST.register_module() +class DiceCost(object): + """IoUCost. + + Args: + iou_mode (str, optional): iou mode such as 'iou' | 'giou' + weight (int | float, optional): loss weight + + Examples: + >>> from mmcv.core.bbox.match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, weight=1.): + self.weight = weight + self.count = 0 + + def __call__(self, input, target): + """ + Args: + bboxes (Tensor): Predicted boxes with unnormalized coordinates + (x1, y1, x2, y2). Shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. 
+ + Returns: + torch.Tensor: iou_cost value with weight + """ + # overlaps: [num_bboxes, num_gt] + # print('INPUT', input.shape) + # print('target',target.shape) + + N1, H1, W1 = input.shape + N2, H2, W2 = target.shape + + if H1 != H2 or W1 != W2: + target = F.interpolate(target.unsqueeze(0), size=(H1, W1), mode='bilinear').squeeze(0) + + input = input.contiguous().view(N1, -1)[:, None, :] + target = target.contiguous().view(N2, -1)[None, :, :] + + a = torch.sum(input * target, -1) + b = torch.sum(input * input, -1) + 0.001 + c = torch.sum(target * target, -1) + 0.001 + d = (2 * a) / (b + c) + return (1 - d) * self.weight diff --git a/mmcv/core/bbox/samplers/__init__.py b/mmcv/core/bbox/samplers/__init__.py new file mode 100644 index 0000000..3a743dc --- /dev/null +++ b/mmcv/core/bbox/samplers/__init__.py @@ -0,0 +1,6 @@ +from .pseudo_sampler import PseudoSampler + + +__all__ = [ + 'PseudoSampler' +] diff --git a/mmcv/core/bbox/samplers/base_sampler.py b/mmcv/core/bbox/samplers/base_sampler.py new file mode 100644 index 0000000..1534082 --- /dev/null +++ b/mmcv/core/bbox/samplers/base_sampler.py @@ -0,0 +1,101 @@ +from abc import ABCMeta, abstractmethod + +import torch + +from .sampling_result import SamplingResult + + +class BaseSampler(metaclass=ABCMeta): + """Base class of samplers.""" + + def __init__(self, + num, + pos_fraction, + neg_pos_ub=-1, + add_gt_as_proposals=True, + **kwargs): + self.num = num + self.pos_fraction = pos_fraction + self.neg_pos_ub = neg_pos_ub + self.add_gt_as_proposals = add_gt_as_proposals + self.pos_sampler = self + self.neg_sampler = self + + @abstractmethod + def _sample_pos(self, assign_result, num_expected, **kwargs): + """Sample positive samples.""" + pass + + @abstractmethod + def _sample_neg(self, assign_result, num_expected, **kwargs): + """Sample negative samples.""" + pass + + def sample(self, + assign_result, + bboxes, + gt_bboxes, + gt_labels=None, + **kwargs): + """Sample positive and negative bboxes. + + This is a simple implementation of bbox sampling given candidates, + assigning results and ground truth bboxes. + + Args: + assign_result (:obj:`AssignResult`): Bbox assigning results. + bboxes (Tensor): Boxes to be sampled from. + gt_bboxes (Tensor): Ground truth bboxes. + gt_labels (Tensor, optional): Class labels of ground truth bboxes. + + Returns: + :obj:`SamplingResult`: Sampling result. 
+ + Example: + >>> from mmcv.core.bbox import RandomSampler + >>> from mmcv.core.bbox import AssignResult + >>> from mmcv.core.bbox.demodata import ensure_rng, random_boxes + >>> rng = ensure_rng(None) + >>> assign_result = AssignResult.random(rng=rng) + >>> bboxes = random_boxes(assign_result.num_preds, rng=rng) + >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng) + >>> gt_labels = None + >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1, + >>> add_gt_as_proposals=False) + >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels) + """ + if len(bboxes.shape) < 2: + bboxes = bboxes[None, :] + + bboxes = bboxes[:, :4] + + gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8) + if self.add_gt_as_proposals and len(gt_bboxes) > 0: + if gt_labels is None: + raise ValueError( + 'gt_labels must be given when add_gt_as_proposals is True') + bboxes = torch.cat([gt_bboxes, bboxes], dim=0) + assign_result.add_gt_(gt_labels) + gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8) + gt_flags = torch.cat([gt_ones, gt_flags]) + + num_expected_pos = int(self.num * self.pos_fraction) + pos_inds = self.pos_sampler._sample_pos( + assign_result, num_expected_pos, bboxes=bboxes, **kwargs) + # We found that sampled indices have duplicated items occasionally. + # (may be a bug of PyTorch) + pos_inds = pos_inds.unique() + num_sampled_pos = pos_inds.numel() + num_expected_neg = self.num - num_sampled_pos + if self.neg_pos_ub >= 0: + _pos = max(1, num_sampled_pos) + neg_upper_bound = int(self.neg_pos_ub * _pos) + if num_expected_neg > neg_upper_bound: + num_expected_neg = neg_upper_bound + neg_inds = self.neg_sampler._sample_neg( + assign_result, num_expected_neg, bboxes=bboxes, **kwargs) + neg_inds = neg_inds.unique() + + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result diff --git a/mmcv/core/bbox/samplers/pseudo_sampler.py b/mmcv/core/bbox/samplers/pseudo_sampler.py new file mode 100644 index 0000000..2bd81ab --- /dev/null +++ b/mmcv/core/bbox/samplers/pseudo_sampler.py @@ -0,0 +1,41 @@ +import torch + +from ..builder import BBOX_SAMPLERS +from .base_sampler import BaseSampler +from .sampling_result import SamplingResult + + +@BBOX_SAMPLERS.register_module() +class PseudoSampler(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result, bboxes, gt_bboxes, **kwargs): + """Directly returns the positive and negative indices of samples. 
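+
+        Proposals with a positive ``assign_result.gt_inds`` are returned as
+        positives and those assigned to background (``gt_inds == 0``) as
+        negatives; no subsampling is performed.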
+ + Args: + assign_result (:obj:`AssignResult`): Assigned results + bboxes (torch.Tensor): Bounding boxes + gt_bboxes (torch.Tensor): Ground truth boxes + + Returns: + :obj:`SamplingResult`: sampler results + """ + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, + assign_result, gt_flags) + return sampling_result diff --git a/mmcv/core/bbox/samplers/sampling_result.py b/mmcv/core/bbox/samplers/sampling_result.py new file mode 100644 index 0000000..06eff7e --- /dev/null +++ b/mmcv/core/bbox/samplers/sampling_result.py @@ -0,0 +1,152 @@ +import torch + +from mmcv.utils import util_mixins + + +class SamplingResult(util_mixins.NiceRepr): + """Bbox sampling result. + + Example: + >>> # xdoctest: +IGNORE_WANT + >>> from mmcv.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random(rng=10) + >>> print(f'self = {self}') + self = + """ + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = bboxes[pos_inds] + self.neg_bboxes = bboxes[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + if gt_bboxes.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + return torch.cat([self.pos_bboxes, self.neg_bboxes]) + + def to(self, device): + """Change the device of the data inplace. + + Example: + >>> self = SamplingResult.random() + >>> print(f'self = {self.to(None)}') + >>> # xdoctest: +REQUIRES(--gpu) + >>> print(f'self = {self.to(0)}') + """ + _dict = self.__dict__ + for key, value in _dict.items(): + if isinstance(value, torch.Tensor): + _dict[key] = value.to(device) + return self + + def __nice__(self): + data = self.info.copy() + data['pos_bboxes'] = data.pop('pos_bboxes').shape + data['neg_bboxes'] = data.pop('neg_bboxes').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_bboxes': self.pos_bboxes, + 'neg_bboxes': self.neg_bboxes, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } + + @classmethod + def random(cls, rng=None, **kwargs): + """ + Args: + rng (None | int | numpy.random.RandomState): seed or state. + kwargs (keyword arguments): + - num_preds: number of predicted boxes + - num_gts: number of true boxes + - p_ignore (float): probability of a predicted box assigned to \ + an ignored truth. + - p_assigned (float): probability of a predicted box not being \ + assigned. + - p_use_label (float | bool): with labels or not. 
+ + Returns: + :obj:`SamplingResult`: Randomly generated sampling result. + + Example: + >>> from mmcv.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random() + >>> print(self.__dict__) + """ + from mmcv.core.bbox.samplers.random_sampler import RandomSampler + from mmcv.core.bbox.assigners.assign_result import AssignResult + from mmcv.core.bbox import demodata + rng = demodata.ensure_rng(rng) + + # make probabalistic? + num = 32 + pos_fraction = 0.5 + neg_pos_ub = -1 + + assign_result = AssignResult.random(rng=rng, **kwargs) + + # Note we could just compute an assignment + bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng) + gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng) + + if rng.rand() > 0.2: + # sometimes algorithms squeeze their data, be robust to that + gt_bboxes = gt_bboxes.squeeze() + bboxes = bboxes.squeeze() + + if assign_result.labels is None: + gt_labels = None + else: + gt_labels = None # todo + + if gt_labels is None: + add_gt_as_proposals = False + else: + add_gt_as_proposals = True # make probabalistic? + + sampler = RandomSampler( + num, + pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals, + rng=rng) + self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) + return self diff --git a/mmcv/core/bbox/structures/__init__.py b/mmcv/core/bbox/structures/__init__.py new file mode 100644 index 0000000..7e55840 --- /dev/null +++ b/mmcv/core/bbox/structures/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period, + mono_cam_box2vis, points_cam2img, rotation_3d_in_axis, + xywhr2xyxyr) diff --git a/mmcv/core/bbox/structures/base_box3d.py b/mmcv/core/bbox/structures/base_box3d.py new file mode 100644 index 0000000..a09caf4 --- /dev/null +++ b/mmcv/core/bbox/structures/base_box3d.py @@ -0,0 +1,462 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from abc import abstractmethod + +# from mmcv.ops.iou3d import iou3d_cuda +from .utils import limit_period, xywhr2xyxyr +from mmcv.ops.iou3d_det import iou3d_cuda + + +class BaseInstance3DBoxes(object): + """Base class for 3D Boxes. + + Note: + The box is bottom centered, i.e. the relative position of origin in + the box is (0.5, 0.5, 0). + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. + box_dim (int): Number of the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw). + Default to 7. + with_yaw (bool): Whether the box is with yaw rotation. + If False, the value of yaw will be set to 0 as minmax boxes. + Default to True. + origin (tuple[float]): The relative position of origin in the box. + Default to (0.5, 0.5, 0). This will guide the box be converted to + (0.5, 0.5, 0) mode. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. 
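+
+    Example:
+        >>> # Minimal sketch added for illustration; the box values are
+        >>> # arbitrary (x, y, z, x_size, y_size, z_size, yaw).
+        >>> import torch
+        >>> boxes = BaseInstance3DBoxes(
+        ...     torch.tensor([[0., 0., 0., 4., 2., 1.5, 0.]]))
+        >>> assert len(boxes) == 1
+        >>> assert float(boxes.volume[0]) == 4 * 2 * 1.5
+        >>> assert boxes.bottom_center.shape == (1, 3)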
+ """ + + def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to( + dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 0.5, 0): + dst = self.tensor.new_tensor((0.5, 0.5, 0)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def volume(self): + """torch.Tensor: A vector with volume of each box.""" + return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] + + @property + def dims(self): + """torch.Tensor: Corners of each box with size (N, 8, 3).""" + return self.tensor[:, 3:6] + + @property + def yaw(self): + """torch.Tensor: A vector with yaw of each box.""" + return self.tensor[:, 6] + + @property + def height(self): + """torch.Tensor: A vector with height of each box.""" + return self.tensor[:, 5] + + @property + def top_height(self): + """torch.Tensor: A vector with the top height of each box.""" + return self.bottom_height + self.height + + @property + def bottom_height(self): + """torch.Tensor: A vector with bottom's height of each box.""" + return self.tensor[:, 2] + + @property + def center(self): + """Calculate the center of all the boxes. + + Note: + In the MMDetection3D's convention, the bottom center is + usually taken as the default center. + + The relative position of the centers in different kinds of + boxes are different, e.g., the relative center of a boxes is + (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. + It is recommended to use ``bottom_center`` or ``gravity_center`` + for more clear usage. + + Returns: + torch.Tensor: A tensor with center of each box. + """ + return self.bottom_center + + @property + def bottom_center(self): + """torch.Tensor: A tensor with center of each box.""" + return self.tensor[:, :3] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + pass + + @property + def corners(self): + """torch.Tensor: a tensor with 8 corners of each box.""" + pass + + @abstractmethod + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + """ + pass + + @abstractmethod + def flip(self, bev_direction='horizontal'): + """Flip the boxes in BEV along given BEV direction.""" + pass + + def translate(self, trans_vector): + """Translate boxes with the given translation vector. + + Args: + trans_vector (torch.Tensor): Translation vector of size 1x3. 
+ """ + if not isinstance(trans_vector, torch.Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + self.tensor[:, :3] += trans_vector + + def in_range_3d(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): The range of box + (x_min, y_min, z_min, x_max, y_max, z_max) + + Note: + In the original implementation of SECOND, checking whether + a box in the range checks whether the points are in a convex + polygon, we try to reduce the burden for simpler cases. + + Returns: + torch.Tensor: A binary vector indicating whether each box is \ + inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 2] > box_range[2]) + & (self.tensor[:, 0] < box_range[3]) + & (self.tensor[:, 1] < box_range[4]) + & (self.tensor[:, 2] < box_range[5])) + return in_range_flags + + @abstractmethod + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): The range of box + in order of (x_min, y_min, x_max, y_max). + + Returns: + torch.Tensor: Indicating whether each box is inside \ + the reference range. + """ + pass + + @abstractmethod + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: The converted box of the same type \ + in the `dst` mode. + """ + pass + + def scale(self, scale_factor): + """Scale the box with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the boxes. + """ + self.tensor[:, :6] *= scale_factor + self.tensor[:, 7:] *= scale_factor + + def limit_yaw(self, offset=0.5, period=np.pi): + """Limit the yaw to a given period and offset. + + Args: + offset (float): The offset of the yaw. + period (float): The expected period. + """ + self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) + + def nonempty(self, threshold: float = 0.0): + """Find boxes that are non-empty. + + A box is considered empty, + if either of its side is no larger than threshold. + + Args: + threshold (float): The threshold of minimal sizes. + + Returns: + torch.Tensor: A binary vector which represents whether each \ + box is empty (False) or non-empty (True). + """ + box = self.tensor + size_x = box[..., 3] + size_y = box[..., 4] + size_z = box[..., 5] + keep = ((size_x > threshold) + & (size_y > threshold) & (size_z > threshold)) + return keep + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_boxes = boxes[3]`: + return a `Boxes` that contains only one box. + 2. `new_boxes = boxes[2:10]`: + return a slice of boxes. + 3. `new_boxes = boxes[vector]`: + where vector is a torch.BoolTensor with `length = len(boxes)`. + Nonzero elements in the vector will be selected. + Note that the returned Boxes might share storage with this Boxes, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BaseInstance3DBoxes`: A new object of \ + :class:`BaseInstances3DBoxes` after indexing. 
+ """ + original_type = type(self) + if isinstance(item, int): + return original_type( + self.tensor[item].view(1, -1), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + b = self.tensor[item] + assert b.dim() == 2, \ + f'Indexing on Boxes with {item} failed to return a matrix!' + return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def __len__(self): + """int: Number of boxes in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, boxes_list): + """Concatenate a list of Boxes into a single Boxes. + + Args: + boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: The concatenated Boxes. + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, cls) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) + # so the returned boxes never share storage with input + cat_boxes = cls( + torch.cat([b.tensor for b in boxes_list], dim=0), + box_dim=boxes_list[0].tensor.shape[1], + with_yaw=boxes_list[0].with_yaw) + return cat_boxes + + def to(self, device): + """Convert current boxes to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the \ + specific device. + """ + original_type = type(self) + return original_type( + self.tensor.to(device), + box_dim=self.box_dim, + with_yaw=self.with_yaw) + + def clone(self): + """Clone the Boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties \ + as self. + """ + original_type = type(self) + return original_type( + self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw) + + @property + def device(self): + """str: The device of the boxes are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a box as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A box of shape (4,). + """ + yield from self.tensor + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between boxes1 and + boxes2, boxes1 and boxes2 should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ + f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + heighest_of_bottom = torch.max(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) + return overlaps_h + + @classmethod + def overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate 3D overlaps of two boxes. 
+ + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ + f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' + + assert mode in ['iou', 'iof'] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + # height overlap + overlaps_h = cls.height_overlaps(boxes1, boxes2) + + # obtain BEV boxes in XYXYR format + boxes1_bev = xywhr2xyxyr(boxes1.bev) + boxes2_bev = xywhr2xyxyr(boxes2.bev) + + # bev overlap + overlaps_bev = boxes1_bev.new_zeros( + (boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda() # (N, M) + iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(), + boxes2_bev.contiguous().cuda(), + overlaps_bev) + + # 3d overlaps + overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h + + volume1 = boxes1.volume.view(-1, 1) + volume2 = boxes2.volume.view(1, -1) + + if mode == 'iou': + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp( + volume1 + volume2 - overlaps_3d, min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) + + return iou3d + + def new_box(self, data): + """Create a new box object with data. + + The new box and its tensor has the similar properties \ + as self and self.tensor, respectively. + + Args: + data (torch.Tensor | numpy.array | list): Data to be copied. + + Returns: + :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, \ + the object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, torch.Tensor) else data.to(self.device) + original_type = type(self) + return original_type( + new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw) diff --git a/mmcv/core/bbox/structures/box_3d_mode.py b/mmcv/core/bbox/structures/box_3d_mode.py new file mode 100644 index 0000000..6e2db4f --- /dev/null +++ b/mmcv/core/bbox/structures/box_3d_mode.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from enum import IntEnum, unique + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes + + +@unique +class Box3DMode(IntEnum): + r"""Enum of different ways to represent a box. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. 
code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(box, src, dst, rt_mat=None): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | BaseInstance3DBoxes): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode`): The src Box mode. + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + 'Box3DMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 7') + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + else: + raise NotImplementedError( + f'Conversion from Box3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[..., 6:] + arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Box3DMode.CAM: + 
target_type = CameraInstance3DBoxes + elif dst == Box3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Box3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type}' + ' is not supported yet') + return target_type( + arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + else: + return arr diff --git a/mmcv/core/bbox/structures/cam_box3d.py b/mmcv/core/bbox/structures/cam_box3d.py new file mode 100644 index 0000000..2f0a74b --- /dev/null +++ b/mmcv/core/bbox/structures/cam_box3d.py @@ -0,0 +1,324 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmcv.core.points import BasePoints +from .base_box3d import BaseInstance3DBoxes +from .utils import limit_period, rotation_3d_in_axis + + +class CameraInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in CAM coordinates. + + Coordinates in camera: + + .. code-block:: none + + z front (yaw=-0.5*pi) + / + / + 0 ------> x right (yaw=0) + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of z. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, + tensor, + box_dim=7, + with_yaw=True, + origin=(0.5, 1.0, 0.5)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to( + dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. 
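+            # e.g. an (N, 6) input of (x, y, z, x_size, y_size, z_size) is
+            # padded to (N, 7) with yaw fixed at 0.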
+ assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 1.0, 0.5): + dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def height(self): + """torch.Tensor: A vector with height of each box.""" + return self.tensor[:, 4] + + @property + def top_height(self): + """torch.Tensor: A vector with the top height of each box.""" + # the positive direction is down rather than up + return self.bottom_height - self.height + + @property + def bottom_height(self): + """torch.Tensor: A vector with bottom's height of each box.""" + return self.tensor[:, 1] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] + gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes in + shape (N, 8, 3). + + Convert the boxes to in clockwise order, in the form of + (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0) + + .. code-block:: none + + front z + / + / + (x0, y0, z1) + ----------- + (x1, y0, z1) + /| / | + / | / | + (x0, y0, z0) + ----------- + + (x1, y1, z1) + | / . | / + | / origin | / + (x0, y1, z0) + ----------- + -------> x right + | (x1, y1, z0) + | + v + down y + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 1, 0.5] + corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around y axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: A n x 5 tensor of 2D BEV box of each box + with rotation in XYWHR format.""" + return self.tensor[:, [0, 2, 3, 5, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XZWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. 
+ points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin], + [0, 1, 0], + [rot_sin, 0, rot_cos]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[2, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 2::7] = -self.tensor[:, 2::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 0] = -points[:, 0] + elif bev_direction == 'vertical': + points[:, 2] = -points[:, 2] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): The range of box + (x_min, z_min, x_max, z_max). + + Note: + The original implementation of SECOND checks whether boxes in + a range by checking whether the points are in a convex + polygon, we reduce the burden for simpler cases. + + Returns: + torch.Tensor: Indicating whether each box is inside \ + the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 2] > box_range[1]) + & (self.tensor[:, 0] < box_range[2]) + & (self.tensor[:, 2] < box_range[3])) + return in_range_flags + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode='iou'): + """Calculate height overlaps of two boxes. + + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. 
+ + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, CameraInstance3DBoxes) + assert isinstance(boxes2, CameraInstance3DBoxes) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + # In camera coordinate system + # from up to down is the positive direction + heighest_of_bottom = torch.min(boxes1_bottom_height, + boxes2_bottom_height) + lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) + return overlaps_h + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: \ + The converted box of the same type in the ``dst`` mode. + """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat) diff --git a/mmcv/core/bbox/structures/coord_3d_mode.py b/mmcv/core/bbox/structures/coord_3d_mode.py new file mode 100644 index 0000000..2d0de8d --- /dev/null +++ b/mmcv/core/bbox/structures/coord_3d_mode.py @@ -0,0 +1,281 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from enum import IntEnum, unique + +from mmcv.core.points import (BasePoints, CameraPoints, DepthPoints, + LiDARPoints) +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes + + +@unique +class Coord3DMode(IntEnum): + r"""Enum of different ways to represent a box + and point cloud. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(input, src, dst, rt_mat=None): + """Convert boxes or points from `src` mode to `dst` mode.""" + if isinstance(input, BaseInstance3DBoxes): + return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, BasePoints): + return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + else: + raise NotImplementedError + + @staticmethod + def convert_box(box, src, dst, rt_mat=None): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | BaseInstance3DBoxes): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. 
+ src (:obj:`CoordMode`): The src Box mode. + dst (:obj:`CoordMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + 'CoordMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 7') + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + else: + raise NotImplementedError( + f'Conversion from Coord3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[..., 6:] + arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Coord3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Coord3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Coord3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type}' + ' is not supported yet') + return target_type( + arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + else: + return arr + + @staticmethod + def convert_point(point, src, dst, rt_mat=None): + """Convert points from `src` mode 
to `dst` mode. + + Args: + point (tuple | list | np.ndarray | + torch.Tensor | BasePoints): + Can be a k-tuple, k-list or an Nxk array/tensor. + src (:obj:`CoordMode`): The src Point mode. + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BasePoints): \ + The converted point of the same type. + """ + if src == dst: + return point + + is_numpy = isinstance(point, np.ndarray) + is_InstancePoints = isinstance(point, BasePoints) + single_point = isinstance(point, (list, tuple)) + if single_point: + assert len(point) >= 3, ( + 'CoordMode.convert takes either a k-tuple/list or ' + 'an Nxk array/tensor, where k >= 3') + arr = torch.tensor(point)[None, :] + else: + # avoid modifying the input point + if is_numpy: + arr = torch.from_numpy(np.asarray(point)).clone() + elif is_InstancePoints: + arr = point.tensor.clone() + else: + arr = point.clone() + + # convert point from `src` mode to `dst` mode. + # TODO: LIDAR + # only implemented provided Rt matrix in cam-depth conversion + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + else: + raise NotImplementedError( + f'Conversion from Coord3DMode {src} to {dst} ' + 'is not supported yet') + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat( + [arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[:, 3:] + arr = torch.cat([xyz[:, :3], remains], dim=-1) + + # convert arr to the original type + original_type = type(point) + if single_point: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_InstancePoints: + if dst == Coord3DMode.CAM: + target_type = CameraPoints + elif dst == Coord3DMode.LIDAR: + target_type = LiDARPoints + elif dst == Coord3DMode.DEPTH: + target_type = DepthPoints + else: + raise NotImplementedError( + f'Conversion to {dst} through {original_type}' + ' is not supported yet') + return target_type( + arr, + points_dim=arr.size(-1), + attribute_dims=point.attribute_dims) + else: + return arr diff --git a/mmcv/core/bbox/structures/depth_box3d.py b/mmcv/core/bbox/structures/depth_box3d.py new file mode 100644 index 0000000..058e975 --- /dev/null +++ b/mmcv/core/bbox/structures/depth_box3d.py @@ -0,0 +1,343 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
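+# 3D box structure in Depth coordinates (z up, x right, y front); see
+# ``DepthInstance3DBoxes`` below for the exact conventions.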
+import numpy as np +import torch + +from mmcv.core.points import BasePoints +from mmcv.ops.roiaware_pool3d import points_in_boxes_batch +from .base_box3d import BaseInstance3DBoxes +from .utils import limit_period, rotation_3d_in_axis + + +class DepthInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in Depth coordinates. + + Coordinates in Depth: + + .. code-block:: none + + up z y front (yaw=-0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (yaw=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of y. + Also note that rotation of DepthInstance3DBoxes is counterclockwise, + which is reverse to the definition of the yaw angle (clockwise). + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. 
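+        # The block below enumerates the 8 corners of a unit cube via
+        # np.unravel_index, reorders them into the documented clockwise order,
+        # shifts them so the origin sits at the bottom center (0.5, 0.5, 0),
+        # scales by the box dimensions, rotates around the z axis by the yaw
+        # angle, and finally translates to each box center.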
+ assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: A n x 5 tensor of 2D BEV box of each box + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], + [rot_sin, rot_cos, 0], + [0, 0, 1]]).T + else: + rot_mat_T = angle.T + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + if self.with_yaw: + self.tensor[:, 6] -= angle + else: + corners_rot = self.corners @ rot_mat_T + new_x_size = corners_rot[..., 0].max( + dim=1, keepdim=True)[0] - corners_rot[..., 0].min( + dim=1, keepdim=True)[0] + new_y_size = corners_rot[..., 1].max( + dim=1, keepdim=True)[0] - corners_rot[..., 1].min( + dim=1, keepdim=True)[0] + self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # anti-clockwise + points.rotate(angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). 
+            points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
+                Points to flip. Defaults to None.
+
+        Returns:
+            torch.Tensor, numpy.ndarray or None: Flipped points.
+        """
+        assert bev_direction in ('horizontal', 'vertical')
+        if bev_direction == 'horizontal':
+            self.tensor[:, 0::7] = -self.tensor[:, 0::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
+        elif bev_direction == 'vertical':
+            self.tensor[:, 1::7] = -self.tensor[:, 1::7]
+            if self.with_yaw:
+                self.tensor[:, 6] = -self.tensor[:, 6]
+
+        if points is not None:
+            assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
+            if isinstance(points, (torch.Tensor, np.ndarray)):
+                if bev_direction == 'horizontal':
+                    points[:, 0] = -points[:, 0]
+                elif bev_direction == 'vertical':
+                    points[:, 1] = -points[:, 1]
+            elif isinstance(points, BasePoints):
+                points.flip(bev_direction)
+        return points
+
+    def in_range_bev(self, box_range):
+        """Check whether the boxes are in the given range.
+
+        Args:
+            box_range (list | torch.Tensor): The range of box
+                (x_min, y_min, x_max, y_max).
+
+        Note:
+            The original implementation of SECOND checks whether a box is in
+            the given range by checking whether its points lie in a convex
+            polygon; here we reduce the burden for simpler cases.
+
+        Returns:
+            torch.Tensor: Whether each box is inside the reference range.
+        """
+        in_range_flags = ((self.tensor[:, 0] > box_range[0])
+                          & (self.tensor[:, 1] > box_range[1])
+                          & (self.tensor[:, 0] < box_range[2])
+                          & (self.tensor[:, 1] < box_range[3]))
+        return in_range_flags
+
+    def convert_to(self, dst, rt_mat=None):
+        """Convert self to ``dst`` mode.
+
+        Args:
+            dst (:obj:`Box3DMode`): The target Box mode.
+            rt_mat (np.ndarray | torch.Tensor): The rotation and translation
+                matrix between different coordinates. Defaults to None.
+                The conversion from ``src`` coordinates to ``dst`` coordinates
+                usually comes along the change of sensors, e.g., from camera
+                to LiDAR. This requires a transformation matrix.
+
+        Returns:
+            :obj:`DepthInstance3DBoxes`: \
+                The converted box of the same type in the ``dst`` mode.
+        """
+        from .box_3d_mode import Box3DMode
+        return Box3DMode.convert(
+            box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
+
+    def points_in_boxes(self, points):
+        """Find points that are in boxes (CUDA).
+
+        Args:
+            points (torch.Tensor): Points in shape [1, M, 3] or [M, 3], \
+                3 dimensions are [x, y, z] in Depth coordinate.
+
+        Returns:
+            torch.Tensor: The index of boxes each point lies in with shape \
+                of (B, M, T).
+        """
+        from .box_3d_mode import Box3DMode
+
+        # convert the points from Depth to LiDAR coordinates
+        points_lidar = points.clone()
+        points_lidar = points_lidar[..., [1, 0, 2]]
+        points_lidar[..., 1] *= -1
+        if points.dim() == 2:
+            points_lidar = points_lidar.unsqueeze(0)
+        else:
+            assert points.dim() == 3 and points_lidar.shape[0] == 1
+
+        boxes_lidar = self.convert_to(Box3DMode.LIDAR).tensor
+        boxes_lidar = boxes_lidar.to(points.device).unsqueeze(0)
+        box_idxs_of_pts = points_in_boxes_batch(points_lidar, boxes_lidar)
+
+        return box_idxs_of_pts.squeeze(0)
+
+    def enlarged_box(self, extra_width):
+        """Enlarge the length, width and height of the boxes.
+
+        Args:
+            extra_width (float | torch.Tensor): Extra width to enlarge the box.
+
+        Returns:
+            :obj:`DepthInstance3DBoxes`: Enlarged boxes.
+ """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def get_surface_line_center(self): + """Compute surface and line center of bounding boxes. + + Returns: + torch.Tensor: Surface and line center of bounding boxes. + """ + obj_size = self.dims + center = self.gravity_center.view(-1, 1, 3) + batch_size = center.shape[0] + + rot_sin = torch.sin(-self.yaw) + rot_cos = torch.cos(-self.yaw) + rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) + rot_mat_T[..., 0, 0] = rot_cos + rot_mat_T[..., 0, 1] = -rot_sin + rot_mat_T[..., 1, 0] = rot_sin + rot_mat_T[..., 1, 1] = rot_cos + rot_mat_T[..., 2, 2] = 1 + + # Get the object surface center + offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], + [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) + offset = offset.view(1, 6, 3) / 2 + surface_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( + -1, 3) + + # Get the object line center + offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], + [0, -1, 1], [1, 0, -1], [-1, 0, -1], + [0, 1, -1], [0, -1, -1], [1, 1, 0], + [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) + offset = offset.view(1, 12, 3) / 2 + + line_3d = (offset * + obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( + -1, 3) + + surface_rot = rot_mat_T.repeat(6, 1, 1) + surface_3d = torch.matmul( + surface_3d.unsqueeze(-2), surface_rot.transpose(2, 1)).squeeze(-2) + surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d + + line_rot = rot_mat_T.repeat(12, 1, 1) + line_3d = torch.matmul( + line_3d.unsqueeze(-2), line_rot.transpose(2, 1)).squeeze(-2) + line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d + + return surface_center, line_center diff --git a/mmcv/core/bbox/structures/lidar_box3d.py b/mmcv/core/bbox/structures/lidar_box3d.py new file mode 100644 index 0000000..f7e7ec2 --- /dev/null +++ b/mmcv/core/bbox/structures/lidar_box3d.py @@ -0,0 +1,270 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmcv.core.points import BasePoints +from mmcv.ops.roiaware_pool3d import points_in_boxes_gpu +from .base_box3d import BaseInstance3DBoxes +from .utils import limit_period, rotation_3d_in_axis + + +class LiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=-0.5*pi) + ^ ^ + | / + | / + (yaw=-pi) left y <------ 0 -------- (yaw=0) + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the negative direction of y axis, and decreases from + the negative direction of y to the positive direction of x. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. 
+ """ + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y<-------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy( + np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 0.5, 0] + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + @property + def nearest_bev(self): + """torch.Tensor: A tensor of 2D BEV box of each box + without rotation.""" + # Obtain BEV boxes with rotation in XYWHR format + bev_rotated_boxes = self.bev + # convert the rotation to a valid range + rotations = bev_rotated_boxes[:, -1] + normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) + + # find the center of boxes + conditions = (normed_rotations > np.pi / 4)[..., None] + bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, + [0, 1, 3, 2]], + bev_rotated_boxes[:, :4]) + + centers = bboxes_xywh[:, :2] + dims = bboxes_xywh[:, 2:] + bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) + return bev_boxes + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angles (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ + f'invalid rotation angle shape {angle.shape}' + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], + [rot_sin, rot_cos, 0], + [0, 0, 1]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[1, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction='horizontal', points=None): + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ('horizontal', 'vertical') + if bev_direction == 'horizontal': + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == 'vertical': + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == 'horizontal': + points[:, 1] = -points[:, 1] + elif bev_direction == 'vertical': + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + def in_range_bev(self, box_range): + """Check whether the boxes are in the given range. + + Args: + box_range (list | torch.Tensor): the range of box + (x_min, y_min, x_max, y_max) + + Note: + The original implementation of SECOND checks whether boxes in + a range by checking whether the points are in a convex + polygon, we reduce the burden for simpler cases. + + Returns: + torch.Tensor: Whether each box is inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > box_range[0]) + & (self.tensor[:, 1] > box_range[1]) + & (self.tensor[:, 0] < box_range[2]) + & (self.tensor[:, 1] < box_range[3])) + return in_range_flags + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`Box3DMode`): the target Box mode + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from ``src`` coordinates to ``dst`` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BaseInstance3DBoxes`: \ + The converted box of the same type in the ``dst`` mode. 
+ """ + from .box_3d_mode import Box3DMode + return Box3DMode.convert( + box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat) + + def enlarged_box(self, extra_width): + """Enlarge the length, width and height boxes. + + Args: + extra_width (float | torch.Tensor): Extra width to enlarge the box. + + Returns: + :obj:`LiDARInstance3DBoxes`: Enlarged boxes. + """ + enlarged_boxes = self.tensor.clone() + enlarged_boxes[:, 3:6] += extra_width * 2 + # bottom center z minus extra_width + enlarged_boxes[:, 2] -= extra_width + return self.new_box(enlarged_boxes) + + def points_in_boxes(self, points): + """Find the box which the points are in. + + Args: + points (torch.Tensor): Points in shape (N, 3). + + Returns: + torch.Tensor: The index of box where each point are in. + """ + box_idx = points_in_boxes_gpu( + points.unsqueeze(0), + self.tensor.unsqueeze(0).to(points.device)).squeeze(0) + return box_idx diff --git a/mmcv/core/bbox/structures/nuscenes_box.py b/mmcv/core/bbox/structures/nuscenes_box.py new file mode 100644 index 0000000..05200a0 --- /dev/null +++ b/mmcv/core/bbox/structures/nuscenes_box.py @@ -0,0 +1,458 @@ +# nuScenes dev-kit. +# Code written by Oscar Beijbom, 2018. + +import copy +from typing import Tuple, List + +import cv2 +import numpy as np +import matplotlib.pyplot as plt +from matplotlib.axes import Axes +from matplotlib.collections import LineCollection +from pyquaternion import Quaternion +from nuscenes.utils.geometry_utils import view_points +from nuscenes.eval.common.data_classes import EvalBox +from nuscenes.eval.detection.constants import DETECTION_NAMES, ATTRIBUTE_NAMES + + +def color_map(data, cmap): + """数值映射为颜色""" + + dmin, dmax = np.nanmin(data), np.nanmax(data) + cmo = plt.cm.get_cmap(cmap) + cs, k = list(), 256/cmo.N + + for i in range(cmo.N): + c = cmo(i) + for j in range(int(i*k), int((i+1)*k)): + cs.append(c) + cs = np.array(cs) + data = np.uint8(255*(data-dmin)/(dmax-dmin)) + + return cs[data] + +class CustomNuscenesBox: + """ Simple data class representing a 3d box including, label, score and velocity. """ + + def __init__(self, + center: List[float], + size: List[float], + orientation: Quaternion, + fut_trajs: List[float], + label: int = np.nan, + score: float = np.nan, + velocity: Tuple = (np.nan, np.nan, np.nan), + name: str = None, + token: str = None): + """ + :param center: Center of box given as x, y, z. + :param size: Size of box in width, length, height. + :param orientation: Box orientation. + :param label: Integer label, optional. + :param score: Classification score, optional. + :param velocity: Box velocity in x, y, z direction. + :param name: Box name, optional. Can be used e.g. for denote category name. + :param token: Unique string identifier from DB. 
+ """ + assert not np.any(np.isnan(center)) + assert not np.any(np.isnan(size)) + assert len(center) == 3 + assert len(size) == 3 + assert type(orientation) == Quaternion + + self.center = np.array(center) + self.wlh = np.array(size) + self.orientation = orientation + self.label = int(label) if not np.isnan(label) else label + self.score = float(score) if not np.isnan(score) else score + self.velocity = np.array(velocity) + self.name = name + self.token = token + self.fut_trajs = np.array(fut_trajs) + + def __eq__(self, other): + center = np.allclose(self.center, other.center) + wlh = np.allclose(self.wlh, other.wlh) + orientation = np.allclose(self.orientation.elements, other.orientation.elements) + label = (self.label == other.label) or (np.isnan(self.label) and np.isnan(other.label)) + score = (self.score == other.score) or (np.isnan(self.score) and np.isnan(other.score)) + vel = (np.allclose(self.velocity, other.velocity) or + (np.all(np.isnan(self.velocity)) and np.all(np.isnan(other.velocity)))) + + return center and wlh and orientation and label and score and vel + + def __repr__(self): + repr_str = 'label: {}, score: {:.2f}, xyz: [{:.2f}, {:.2f}, {:.2f}], wlh: [{:.2f}, {:.2f}, {:.2f}], ' \ + 'rot axis: [{:.2f}, {:.2f}, {:.2f}], ang(degrees): {:.2f}, ang(rad): {:.2f}, ' \ + 'vel: {:.2f}, {:.2f}, {:.2f}, name: {}, token: {}' + + return repr_str.format(self.label, self.score, self.center[0], self.center[1], self.center[2], self.wlh[0], + self.wlh[1], self.wlh[2], self.orientation.axis[0], self.orientation.axis[1], + self.orientation.axis[2], self.orientation.degrees, self.orientation.radians, + self.velocity[0], self.velocity[1], self.velocity[2], self.name, self.token) + + @property + def rotation_matrix(self) -> np.ndarray: + """ + Return a rotation matrix. + :return: . The box's rotation matrix. + """ + return self.orientation.rotation_matrix + + def translate(self, x: np.ndarray) -> None: + """ + Applies a translation. + :param x: . Translation in x, y, z direction. + """ + self.center += x + + def rotate(self, quaternion: Quaternion) -> None: + """ + Rotates box. + :param quaternion: Rotation to apply. + """ + self.center = np.dot(quaternion.rotation_matrix, self.center) + self.orientation = quaternion * self.orientation + self.velocity = np.dot(quaternion.rotation_matrix, self.velocity) + + def corners(self, wlh_factor: float = 1.0) -> np.ndarray: + """ + Returns the bounding box corners. + :param wlh_factor: Multiply w, l, h by a factor to scale the box. + :return: . First four corners are the ones facing forward. + The last four are the ones facing backwards. + """ + w, l, h = self.wlh * wlh_factor + + # 3D bounding box corners. (Convention: x points forward, y to the left, z up.) + x_corners = l / 2 * np.array([1, 1, 1, 1, -1, -1, -1, -1]) + y_corners = w / 2 * np.array([1, -1, -1, 1, 1, -1, -1, 1]) + z_corners = h / 2 * np.array([1, 1, -1, -1, 1, 1, -1, -1]) + corners = np.vstack((x_corners, y_corners, z_corners)) + + # Rotate + corners = np.dot(self.orientation.rotation_matrix, corners) + + # Translate + x, y, z = self.center + corners[0, :] = corners[0, :] + x + corners[1, :] = corners[1, :] + y + corners[2, :] = corners[2, :] + z + + return corners + + def bottom_corners(self) -> np.ndarray: + """ + Returns the four bottom corners. + :return: . Bottom corners. First two face forward, last two face backwards. 
+ """ + return self.corners()[:, [2, 3, 7, 6]] + + def render(self, + axis: Axes, + view: np.ndarray = np.eye(3), + normalize: bool = False, + colors: Tuple = ('b', 'r', 'k'), + linewidth: float = 2, + box_idx=None, + alpha=0.5) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. + """ + corners = view_points(self.corners(), view, normalize=normalize)[:2, :] + + def draw_rect(selected_corners, color, alpha): + prev = selected_corners[-1] + for corner in selected_corners: + axis.plot([prev[0], corner[0]], [prev[1], corner[1]], color=color, linewidth=linewidth, alpha=alpha) + prev = corner + + # Draw the sides + for i in range(4): + axis.plot([corners.T[i][0], corners.T[i + 4][0]], + [corners.T[i][1], corners.T[i + 4][1]], + color=colors[2], linewidth=linewidth, alpha=alpha) + + # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) + draw_rect(corners.T[:4], colors[0], alpha) + draw_rect(corners.T[4:], colors[1], alpha) + + # Draw line indicating the front + center_bottom_forward = np.mean(corners.T[2:4], axis=0) + center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) + axis.plot([center_bottom[0], center_bottom_forward[0]], + [center_bottom[1], center_bottom_forward[1]], + color=colors[0], linewidth=linewidth, alpha=alpha) + if box_idx is not None and center_bottom[0] > -35 and center_bottom[1] > -35 \ + and center_bottom[0] < 35 and center_bottom[1] < 35: + text = f'{box_idx}' + axis.text(center_bottom[0], center_bottom[1], text, ha='left', fontsize=5) + + def render_fut_trajs(self, + axis: Axes, + color: str = 'b', + linewidth: float = 1, + fut_ts: int = 6, + mode_idx=None) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. 
+ """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + if mode_idx is not None: + fut_coords = fut_coords[[mode_idx]] + alpha = 0.8 + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord.cumsum(axis=-2) + fut_coord = fut_coord + self.center[:2] + if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10: + if color == 'g': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha) + elif color == 'b': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha) + if mode_idx is None and fut_coord[-1, 0] > -35 and fut_coord[-1, 1] > -35 \ + and fut_coord[-1, 0] < 35 and fut_coord[-1, 1] < 35: + text = f'{i}' + axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5) + axis.plot( + [self.center[0], fut_coord[0, 0]], + [self.center[1], fut_coord[0, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + for i in range(fut_coord.shape[0]-1): + axis.plot( + [fut_coord[i, 0], fut_coord[i+1, 0]], + [fut_coord[i, 1], fut_coord[i+1, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + + def render_fut_trajs_grad_color(self, + axis: Axes, + linewidth: float = 1, + linestyles='solid', + cmap='viridis', + fut_ts: int = 6, + alpha: int = 0.8, + mode_idx=None) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. + :param linewidth: Width in pixel of the box sides. + """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + if mode_idx is not None: + fut_coords = fut_coords[[mode_idx]] + + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord.cumsum(axis=-2) + fut_coord = fut_coord + self.center[:2] + fut_coord = np.concatenate((self.center[np.newaxis, :2], fut_coord), axis=0) + fut_coord_segments = np.stack((fut_coord[:-1], fut_coord[1:]), axis=1) + + fut_vecs = None + for j in range(fut_coord_segments.shape[0]): + fut_vec_j = fut_coord_segments[j] + x_linspace = np.linspace(fut_vec_j[0, 0], fut_vec_j[1, 0], 51) + y_linspace = np.linspace(fut_vec_j[0, 1], fut_vec_j[1, 1], 51) + xy = np.stack((x_linspace, y_linspace), axis=1) + xy = np.stack((xy[:-1], xy[1:]), axis=1) + if fut_vecs is None: + fut_vecs = xy + else: + fut_vecs = np.concatenate((fut_vecs, xy), axis=0) + + y = np.sin(np.linspace(3/2*np.pi, 5/2*np.pi, 301)) + colors = color_map(y[:-1], cmap) + line_segments = LineCollection(fut_vecs, colors=colors, linewidths=linewidth, linestyles=linestyles, cmap=cmap) + + # if mode_idx is None and abs(fut_coord[-1, 0]) < 35 and abs(fut_coord[-1, 1]) < 35: + # text = f'{i}' + # axis.text(fut_coord[-1, 0], fut_coord[-1, 1], text, ha='left', fontsize=5) + + axis.add_collection(line_segments) + + def render_fut_trajs_coords(self, + axis: Axes, + color: str = 'b', + linewidth: float = 1, + fut_ts: int = 12) -> None: + """ + Renders the box in the provided Matplotlib axis. + :param axis: Axis onto which the box should be drawn. + :param view: . Define a projection in needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, + back and sides. 
+ :param linewidth: Width in pixel of the box sides. + """ + + fut_coords = self.fut_trajs.reshape((-1, fut_ts, 2)) + alpha = 0.2 if color == 'b' else 1 + for i in range(fut_coords.shape[0]): + fut_coord = fut_coords[i] + fut_coord = fut_coord + self.center[:2] + if np.abs(fut_coord[-1] - self.center[:2]).max() >= 10: + if color == 'g': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='*', s=70, alpha=alpha) + elif color == 'b': + axis.scatter(fut_coord[-1, 0], fut_coord[-1, 1], c=color, marker='o', s=20, alpha=alpha) + axis.plot( + [self.center[0], fut_coord[0, 0]], + [self.center[1], fut_coord[0, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + for i in range(fut_coord.shape[0]-1): + axis.plot( + [fut_coord[i, 0], fut_coord[i+1, 0]], + [fut_coord[i, 1], fut_coord[i+1, 1]], + color=color, linewidth=linewidth, alpha=alpha + ) + + def render_cv2(self, + im: np.ndarray, + view: np.ndarray = np.eye(3), + normalize: bool = False, + colors: Tuple = ((0, 0, 255), (255, 0, 0), (155, 155, 155)), + linewidth: int = 2) -> None: + """ + Renders box using OpenCV2. + :param im: . Image array. Channels are in BGR order. + :param view: . Define a projection if needed (e.g. for drawing projection in an image). + :param normalize: Whether to normalize the remaining coordinate. + :param colors: ((R, G, B), (R, G, B), (R, G, B)). Colors for front, side & rear. + :param linewidth: Linewidth for plot. + """ + corners = view_points(self.corners(), view, normalize=normalize)[:2, :] + + def draw_rect(selected_corners, color): + prev = selected_corners[-1] + for corner in selected_corners: + cv2.line(im, + (int(prev[0]), int(prev[1])), + (int(corner[0]), int(corner[1])), + color, linewidth) + prev = corner + + # Draw the sides + for i in range(4): + cv2.line(im, + (int(corners.T[i][0]), int(corners.T[i][1])), + (int(corners.T[i + 4][0]), int(corners.T[i + 4][1])), + colors[2][::-1], linewidth) + + # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) + draw_rect(corners.T[:4], colors[0][::-1]) + draw_rect(corners.T[4:], colors[1][::-1]) + + # Draw line indicating the front + center_bottom_forward = np.mean(corners.T[2:4], axis=0) + center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) + cv2.line(im, + (int(center_bottom[0]), int(center_bottom[1])), + (int(center_bottom_forward[0]), int(center_bottom_forward[1])), + colors[0][::-1], linewidth) + + def copy(self) -> 'CustomNuscenesBox': + """ + Create a copy of self. + :return: A copy. + """ + return copy.deepcopy(self) + + +class CustomDetectionBox(EvalBox): + """ Data class used during detection evaluation. Can be a prediction or ground truth.""" + + def __init__(self, + sample_token: str = "", + translation: Tuple[float, float, float] = (0, 0, 0), + size: Tuple[float, float, float] = (0, 0, 0), + rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), + velocity: Tuple[float, float] = (0, 0), + ego_translation: Tuple[float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. + num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. + detection_name: str = 'car', # The class name used in the detection challenge. + detection_score: float = -1.0, # GT samples do not have a score. + attribute_name: str = '', # Box attribute. Each box can have at most 1 attribute. + fut_trajs=None): # future trajectories of a pred box, shape=[fut_ts*2]. 
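+        # Initialise the shared EvalBox fields first; the detection-specific
+        # fields (class name, score, attribute, future trajectories) are
+        # validated against the nuScenes constants and stored below.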
+ + super().__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts) + + assert detection_name is not None, 'Error: detection_name cannot be empty!' + assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name + + assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ + 'Error: Unknown attribute_name %s' % attribute_name + + assert type(detection_score) == float, 'Error: detection_score must be a float!' + assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' + + # Assign. + self.detection_name = detection_name + self.detection_score = detection_score + self.attribute_name = attribute_name + self.fut_trajs = fut_trajs + + def __eq__(self, other): + return (self.sample_token == other.sample_token and + self.translation == other.translation and + self.size == other.size and + self.rotation == other.rotation and + self.velocity == other.velocity and + self.ego_translation == other.ego_translation and + self.num_pts == other.num_pts and + self.detection_name == other.detection_name and + self.detection_score == other.detection_score and + self.attribute_name == other.attribute_name and + self.fut_trajs == other.fut_trajs) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'fut_trajs': self.fut_trajs + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + fut_trajs=tuple(content['fut_trajs']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name']) diff --git a/mmcv/core/bbox/structures/utils.py b/mmcv/core/bbox/structures/utils.py new file mode 100644 index 0000000..842131f --- /dev/null +++ b/mmcv/core/bbox/structures/utils.py @@ -0,0 +1,214 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +from logging import warning + + +def limit_period(val, offset=0.5, period=np.pi): + """Limit the value into a period for periodic function. + + Args: + val (torch.Tensor): The value to be converted. + offset (float, optional): Offset to set the value range. \ + Defaults to 0.5. + period ([type], optional): Period of the value. Defaults to np.pi. + + Returns: + torch.Tensor: Value in the range of \ + [-offset * period, (1-offset) * period] + """ + return val - torch.floor(val / period + offset) * period + + +def rotation_3d_in_axis(points, angles, axis=0): + """Rotate points by angles according to axis. + + Args: + points (torch.Tensor): Points of shape (N, M, 3). + angles (torch.Tensor): Vector of angles in shape (N,) + axis (int, optional): The axis to be rotated. Defaults to 0. 
+ + Raises: + ValueError: when the axis is not in range [0, 1, 2], it will \ + raise value error. + + Returns: + torch.Tensor: Rotated points in shape (N, M, 3) + """ + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]) + ]) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack([ + torch.stack([rot_cos, -rot_sin, zeros]), + torch.stack([rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]) + ]) + elif axis == 0: + rot_mat_T = torch.stack([ + torch.stack([zeros, rot_cos, -rot_sin]), + torch.stack([zeros, rot_sin, rot_cos]), + torch.stack([ones, zeros, zeros]) + ]) + else: + raise ValueError(f'axis should in range [0, 1, 2], got {axis}') + + return torch.einsum('aij,jka->aik', (points, rot_mat_T)) + + +def xywhr2xyxyr(boxes_xywhr): + """Convert a rotated boxes in XYWHR format to XYXYR format. + + Args: + boxes_xywhr (torch.Tensor): Rotated boxes in XYWHR format. + + Returns: + torch.Tensor: Converted boxes in XYXYR format. + """ + boxes = torch.zeros_like(boxes_xywhr) + half_w = boxes_xywhr[:, 2] / 2 + half_h = boxes_xywhr[:, 3] / 2 + + boxes[:, 0] = boxes_xywhr[:, 0] - half_w + boxes[:, 1] = boxes_xywhr[:, 1] - half_h + boxes[:, 2] = boxes_xywhr[:, 0] + half_w + boxes[:, 3] = boxes_xywhr[:, 1] + half_h + boxes[:, 4] = boxes_xywhr[:, 4] + return boxes + + +def get_box_type(box_type): + """Get the type and mode of box structure. + + Args: + box_type (str): The type of box structure. + The valid value are "LiDAR", "Camera", or "Depth". + + Returns: + tuple: Box type and box mode. + """ + from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes, + DepthInstance3DBoxes, LiDARInstance3DBoxes) + box_type_lower = box_type.lower() + if box_type_lower == 'lidar': + box_type_3d = LiDARInstance3DBoxes + box_mode_3d = Box3DMode.LIDAR + elif box_type_lower == 'camera': + box_type_3d = CameraInstance3DBoxes + box_mode_3d = Box3DMode.CAM + elif box_type_lower == 'depth': + box_type_3d = DepthInstance3DBoxes + box_mode_3d = Box3DMode.DEPTH + else: + raise ValueError('Only "box_type" of "camera", "lidar", "depth"' + f' are supported, got {box_type}') + + return box_type_3d, box_mode_3d + + +def points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points from camera coordicates to image coordinates. + + Args: + points_3d (torch.Tensor): Points in shape (N, 3). + proj_mat (torch.Tensor): Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + torch.Tensor: Points in image coordinates with shape [N, 2]. + """ + points_num = list(points_3d.shape)[:-1] + + points_shape = np.concatenate([points_num, [1]], axis=0).tolist() + assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ + f' matrix should be 2 instead of {len(proj_mat.shape)}.' + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( + d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ + f' ({d1}*{d2}) is not supported.' 
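+    # Pad a 3x3 or 3x4 projection matrix to a homogeneous 4x4 matrix so the
+    # points, extended with a trailing one, can be transformed with a single
+    # matrix multiplication before the perspective division below.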
+    if d1 == 3:
+        proj_mat_expanded = torch.eye(
+            4, device=proj_mat.device, dtype=proj_mat.dtype)
+        proj_mat_expanded[:d1, :d2] = proj_mat
+        proj_mat = proj_mat_expanded
+
+    # the previous implementation used new_zeros; new_ones yields better results
+    points_4 = torch.cat(
+        [points_3d, points_3d.new_ones(*points_shape)], dim=-1)
+    point_2d = torch.matmul(points_4, proj_mat.t())
+    point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
+
+    if with_depth:
+        return torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1)
+    return point_2d_res
+
+
+def mono_cam_box2vis(cam_box):
+    """This is a post-processing function on the bboxes from the Mono-3D task.
+    If we want to perform projection visualization, we need to:
+
+    1. rotate the box around the x-axis by np.pi / 2 (roll)
+    2. change orientation from local yaw to global yaw
+    3. convert yaw by (-yaw - np.pi / 2)
+
+    After applying this function, we can project and draw it on 2D images.
+
+    Args:
+        cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate \
+            system before conversion. Could be gt bbox loaded from dataset or \
+            network prediction output.
+
+    Returns:
+        :obj:`CameraInstance3DBoxes`: Box after conversion.
+    """
+    # `warning` is logging.warning (imported at the top of this module)
+    warning('DeprecationWarning: The hack of yaw and dimension in the '
+            'monocular 3D detection on nuScenes has been removed. The '
+            'function mono_cam_box2vis will be deprecated.')
+    from . import CameraInstance3DBoxes
+    assert isinstance(cam_box, CameraInstance3DBoxes), \
+        'input bbox should be CameraInstance3DBoxes!'
+
+    loc = cam_box.gravity_center
+    dim = cam_box.dims
+    yaw = cam_box.yaw
+    feats = cam_box.tensor[:, 7:]
+    # rotate around the x-axis by np.pi / 2
+    # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557  # noqa
+    dim[:, [1, 2]] = dim[:, [2, 1]]
+    # change local yaw to global yaw for visualization
+    # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166  # noqa
+    yaw += torch.atan2(loc[:, 0], loc[:, 2])
+    # convert yaw by (-yaw - np.pi / 2)
+    # this is because mono 3D box classes such as `NuScenesBox` have a different
+    # definition of rotation from our `CameraInstance3DBoxes`
+    yaw = -yaw - np.pi / 2
+    cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1)
+    cam_box = CameraInstance3DBoxes(
+        cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5))
+
+    return cam_box
+
+
+def get_proj_mat_by_coord_type(img_meta, coord_type):
+    """Obtain the projection matrix according to the coordinate type.
+
+    Args:
+        img_meta (dict): Meta info.
+        coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'.
+            Can be case-insensitive.
+
+    Returns:
+        torch.Tensor: Transformation matrix.
+    """
+    coord_type = coord_type.upper()
+    mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'}
+    assert coord_type in mapping.keys()
+    return img_meta[mapping[coord_type]]
diff --git a/mmcv/core/bbox/transforms.py b/mmcv/core/bbox/transforms.py
new file mode 100644
index 0000000..2dcd769
--- /dev/null
+++ b/mmcv/core/bbox/transforms.py
@@ -0,0 +1,320 @@
+import numpy as np
+import torch
+
+
+def bbox_flip(bboxes, img_shape, direction='horizontal'):
+    """Flip bboxes horizontally or vertically.
+
+    Args:
+        bboxes (Tensor): Shape (..., 4*k)
+        img_shape (tuple): Image shape.
+        direction (str): Flip direction, options are "horizontal", "vertical",
+            "diagonal". Default: "horizontal"
+
+    Returns:
+        Tensor: Flipped bboxes.
+ """ + assert bboxes.shape[-1] % 4 == 0 + assert direction in ['horizontal', 'vertical', 'diagonal'] + flipped = bboxes.clone() + if direction == 'horizontal': + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + elif direction == 'vertical': + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + else: + flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4] + flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4] + flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4] + flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4] + return flipped + + +def bbox_mapping(bboxes, + img_shape, + scale_factor, + flip, + flip_direction='horizontal'): + """Map bboxes from the original image scale to testing scale.""" + new_bboxes = bboxes * bboxes.new_tensor(scale_factor) + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction) + return new_bboxes + + +def bbox_mapping_back(bboxes, + img_shape, + scale_factor, + flip, + flip_direction='horizontal'): + """Map bboxes from testing scale to original image scale.""" + new_bboxes = bbox_flip(bboxes, img_shape, + flip_direction) if flip else bboxes + new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor) + return new_bboxes.view(bboxes.shape) + + +def bbox2roi(bbox_list): + """Convert a list of bboxes to roi format. + + Args: + bbox_list (list[Tensor]): a list of bboxes corresponding to a batch + of images. + + Returns: + Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2] + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1) + else: + rois = bboxes.new_zeros((0, 5)) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois): + """Convert rois to bounding box format. + + Args: + rois (torch.Tensor): RoIs with the shape (n, 5) where the first + column indicates batch id of each RoI. + + Returns: + list[torch.Tensor]: Converted boxes of corresponding rois. + """ + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list + + +def bbox2result(bboxes, labels, num_classes): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor | np.ndarray): shape (n, 5) + labels (torch.Tensor | np.ndarray): shape (n, ) + num_classes (int): class number, including background class + + Returns: + list(ndarray): bbox results of each class + """ + if bboxes.shape[0] == 0: + return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)] + else: + if isinstance(bboxes, torch.Tensor): + bboxes = bboxes.detach().cpu().numpy() + labels = labels.detach().cpu().numpy() + return [bboxes[labels == i, :] for i in range(num_classes)] + + +def distance2bbox(points, distance, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + distance (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). 
If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]] + and the length of max_shape should also be B. + + Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + x1 = points[..., 0] - distance[..., 0] + y1 = points[..., 1] - distance[..., 1] + x2 = points[..., 0] + distance[..., 2] + y2 = points[..., 1] + distance[..., 3] + + bboxes = torch.stack([x1, y1, x2, y2], -1) + + if max_shape is not None: + # clip bboxes with dynamic `min` and `max` for onnx + if torch.onnx.is_in_onnx_export(): + from mmcv.core.export import dynamic_clip_for_onnx + x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return bboxes + if not isinstance(max_shape, torch.Tensor): + max_shape = x1.new_tensor(max_shape) + max_shape = max_shape[..., :2].type_as(x1) + if max_shape.ndim == 2: + assert bboxes.ndim == 3 + assert max_shape.size(0) == bboxes.size(0) + + min_xy = x1.new_tensor(0) + max_xy = torch.cat([max_shape, max_shape], + dim=-1).flip(-1).unsqueeze(-2) + bboxes = torch.where(bboxes < min_xy, min_xy, bboxes) + bboxes = torch.where(bboxes > max_xy, max_xy, bboxes) + + return bboxes + + +def bbox2distance(points, bbox, max_dis=None, eps=0.1): + """Decode bounding box based on distances. + + Args: + points (Tensor): Shape (n, 2), [x, y]. + bbox (Tensor): Shape (n, 4), "xyxy" format + max_dis (float): Upper bound of the distance. + eps (float): a small value to ensure target < max_dis, instead <= + + Returns: + Tensor: Decoded distances. + """ + left = points[:, 0] - bbox[:, 0] + top = points[:, 1] - bbox[:, 1] + right = bbox[:, 2] - points[:, 0] + bottom = bbox[:, 3] - points[:, 1] + if max_dis is not None: + left = left.clamp(min=0, max=max_dis - eps) + top = top.clamp(min=0, max=max_dis - eps) + right = right.clamp(min=0, max=max_dis - eps) + bottom = bottom.clamp(min=0, max=max_dis - eps) + return torch.stack([left, top, right, bottom], -1) + + +def bbox_rescale(bboxes, scale_factor=1.0): + """Rescale bounding box w.r.t. scale_factor. + + Args: + bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois + scale_factor (float): rescale factor + + Returns: + Tensor: Rescaled bboxes. + """ + if bboxes.size(1) == 5: + bboxes_ = bboxes[:, 1:] + inds_ = bboxes[:, 0] + else: + bboxes_ = bboxes + cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5 + cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5 + w = bboxes_[:, 2] - bboxes_[:, 0] + h = bboxes_[:, 3] - bboxes_[:, 1] + w = w * scale_factor + h = h * scale_factor + x1 = cx - 0.5 * w + x2 = cx + 0.5 * w + y1 = cy - 0.5 * h + y2 = cy + 0.5 * h + if bboxes.size(1) == 5: + rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1) + else: + rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1) + return rescaled_bboxes + + +def bbox_cxcywh_to_xyxy(bbox): + """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. + """ + cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)] + return torch.cat(bbox_new, dim=-1) + + +def bbox_xyxy_to_cxcywh(bbox): + """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h). + + Args: + bbox (Tensor): Shape (n, 4) for bboxes. + + Returns: + Tensor: Converted bboxes. 
+ """ + x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1) + bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)] + return torch.cat(bbox_new, dim=-1) + +def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical): + """Map bboxes from testing scale to original image scale. + + Args: + bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. + scale_factor (float): Scale factor. + flip_horizontal (bool): Whether to flip horizontally. + flip_vertical (bool): Whether to flip vertically. + + Returns: + :obj:`BaseInstance3DBoxes`: Boxes mapped back. + """ + new_bboxes = bboxes.clone() + if flip_horizontal: + new_bboxes.flip('horizontal') + if flip_vertical: + new_bboxes.flip('vertical') + new_bboxes.scale(1 / scale_factor) + + return new_bboxes + + +def bbox3d2roi(bbox_list): + """Convert a list of bounding boxes to roi format. + + Args: + bbox_list (list[torch.Tensor]): A list of bounding boxes + corresponding to a batch of images. + + Returns: + torch.Tensor: Region of interests in shape (n, c), where \ + the channels are in order of [batch_ind, x, y ...]. + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes], dim=-1) + else: + rois = torch.zeros_like(bboxes) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def bbox3d2result(bboxes, scores, labels, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). + labels (torch.Tensor): Labels with shape of (n, ). + scores (torch.Tensor): Scores with shape of (n, ). + attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. 
+ """ + result_dict = dict( + boxes_3d=bboxes.to('cpu'), + scores_3d=scores.cpu(), + labels_3d=labels.cpu()) + + if attrs is not None: + result_dict['attrs_3d'] = attrs.cpu() + + return result_dict + diff --git a/mmcv/core/bbox/util.py b/mmcv/core/bbox/util.py new file mode 100755 index 0000000..c54bd75 --- /dev/null +++ b/mmcv/core/bbox/util.py @@ -0,0 +1,53 @@ +import torch + + +def normalize_bbox(bboxes, pc_range): + + cx = bboxes[..., 0:1] + cy = bboxes[..., 1:2] + cz = bboxes[..., 2:3] + w = bboxes[..., 3:4].log() + l = bboxes[..., 4:5].log() + h = bboxes[..., 5:6].log() + + rot = bboxes[..., 6:7] + if bboxes.size(-1) > 7: + vx = bboxes[..., 7:8] + vy = bboxes[..., 8:9] + normalized_bboxes = torch.cat( + (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 + ) + else: + normalized_bboxes = torch.cat( + (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 + ) + return normalized_bboxes + +def denormalize_bbox(normalized_bboxes, pc_range): + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes \ No newline at end of file diff --git a/mmcv/core/evaluation/__init__.py b/mmcv/core/evaluation/__init__.py new file mode 100644 index 0000000..b93b087 --- /dev/null +++ b/mmcv/core/evaluation/__init__.py @@ -0,0 +1,13 @@ +from .indoor_eval import indoor_eval +from .kitti_utils import kitti_eval, kitti_eval_coco_style +from .lyft_eval import lyft_eval +from .seg_eval import seg_eval +from .class_names import (cityscapes_classes, coco_classes, dataset_aliases, + get_classes, get_palette, imagenet_det_classes, + imagenet_vid_classes, voc_classes) +from .eval_hooks import DistEvalHook, EvalHook, CustomDistEvalHook +from .mean_ap import average_precision, eval_map, print_map_summary +from .recall import (eval_recalls, plot_iou_recall, plot_num_recall, + print_recall_summary) +from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou +from .metric_motion import get_ade,get_best_preds,get_fde \ No newline at end of file diff --git a/mmcv/core/evaluation/bbox_overlaps.py b/mmcv/core/evaluation/bbox_overlaps.py new file mode 100644 index 0000000..93559ea --- /dev/null +++ b/mmcv/core/evaluation/bbox_overlaps.py @@ -0,0 +1,48 @@ +import numpy as np + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', eps=1e-6): + """Calculate the ious between each bbox of bboxes1 and bboxes2. 
+ + Args: + bboxes1(ndarray): shape (n, 4) + bboxes2(ndarray): shape (k, 4) + mode(str): iou (intersection over union) or iof (intersection + over foreground) + + Returns: + ious(ndarray): shape (n, k) + """ + + assert mode in ['iou', 'iof'] + + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1]) + area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1]) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start, 0) * np.maximum( + y_end - y_start, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + union = np.maximum(union, eps) + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/mmcv/core/evaluation/class_names.py b/mmcv/core/evaluation/class_names.py new file mode 100644 index 0000000..0e0e4f2 --- /dev/null +++ b/mmcv/core/evaluation/class_names.py @@ -0,0 +1,219 @@ +from mmcv.utils import is_str + +def ade_classes(): + """ADE20K class names for external use.""" + return [ + 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', + 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', + 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', + 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', + 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', + 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', + 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', + 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', + 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', + 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', + 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', + 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', + 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', + 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', + 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', + 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', + 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', + 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', + 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag' + ] + +def ade_palette(): + """ADE20K palette for external use.""" + return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], 
[8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + +def wider_face_classes(): + return ['face'] + + +def voc_classes(): + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + +def voc_palette(): + """Pascal VOC palette for external use.""" + return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], + [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], + [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], + [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], + [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] + + +def imagenet_det_classes(): + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 
'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 'punching_bag', 'purse', + 'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator', + 'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler', + 'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver', + 'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile', + 'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula', + 'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer', + 'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine', + 'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie', + 'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet', + 'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin', + 'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft', + 'whale', 'wine_bottle', 'zebra' + ] + + +def imagenet_vid_classes(): + return [ + 'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car', + 'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda', + 'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit', + 'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle', + 'watercraft', 'whale', 'zebra' + ] + + +def coco_classes(): + return [ + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic_light', 'fire_hydrant', 'stop_sign', + 'parking_meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports_ball', 'kite', 'baseball_bat', 'baseball_glove', 'skateboard', + 'surfboard', 'tennis_racket', 'bottle', 'wine_glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot_dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted_plant', 'bed', 'dining_table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell_phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy_bear', 'hair_drier', 'toothbrush' + ] + + +def cityscapes_classes(): + return [ + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + +def cityscapes_palette(): + """Cityscapes palette for external use.""" + return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], + [0, 0, 230], [119, 11, 32]] + + +dataset_aliases = { + 'voc': ['voc', 'pascal_voc', 
'voc07', 'voc12'], + 'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'], + 'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'], + 'coco': ['coco', 'mscoco', 'ms_coco'], + 'wider_face': ['WIDERFaceDataset', 'wider_face', 'WIDERFace'], + 'cityscapes': ['cityscapes'], + 'ade': ['ade', 'ade20k'] +} + + +def get_classes(dataset): + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels + + +def get_palette(dataset): + """Get class palette (RGB) of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_palette()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels diff --git a/mmcv/core/evaluation/eval_hooks.py b/mmcv/core/evaluation/eval_hooks.py new file mode 100644 index 0000000..dcaf73d --- /dev/null +++ b/mmcv/core/evaluation/eval_hooks.py @@ -0,0 +1,133 @@ +import bisect +import os.path as osp + +import torch.distributed as dist +from mmcv.runner import DistEvalHook as BaseDistEvalHook +from mmcv.runner import EvalHook as BaseEvalHook +from mmcv.utils import is_list_of +from torch.nn.modules.batchnorm import _BatchNorm + + +class EvalHook(BaseEvalHook): + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + if not self._should_evaluate(runner): + return + + results = self.test_fn(runner.model, self.dataloader, show=False) + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + if self.save_best: + self._save_ckpt(runner, key_score) + + +class DistEvalHook(BaseDistEvalHook): + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. 
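# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# Usage of get_classes()/get_palette() from class_names.py above; both are
# re-exported by mmcv/core/evaluation/__init__.py, and the counts follow the
# lists defined above.
from mmcv.core.evaluation import get_classes, get_palette

names = get_classes('coco')            # 80 COCO class names ('mscoco' is an accepted alias)
colors = get_palette('cityscapes')     # 19 RGB triplets, one per Cityscapes class
assert len(names) == 80 and len(colors) == 19
# --- end aside ---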
+ if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + if not self._should_evaluate(runner): + return + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + results = self.test_fn( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + + if self.save_best: + self._save_ckpt(runner, key_score) + +def _calc_dynamic_intervals(start_interval, dynamic_interval_list): + assert is_list_of(dynamic_interval_list, tuple) + + dynamic_milestones = [0] + dynamic_milestones.extend( + [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) + dynamic_intervals = [start_interval] + dynamic_intervals.extend( + [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) + return dynamic_milestones, dynamic_intervals + + +class CustomDistEvalHook(BaseDistEvalHook): + + def __init__(self, *args, dynamic_intervals=None, **kwargs): + super(CustomDistEvalHook, self).__init__(*args, **kwargs) + self.use_dynamic_intervals = dynamic_intervals is not None + if self.use_dynamic_intervals: + self.dynamic_milestones, self.dynamic_intervals = \ + _calc_dynamic_intervals(self.interval, dynamic_intervals) + + def _decide_interval(self, runner): + if self.use_dynamic_intervals: + progress = runner.epoch if self.by_epoch else runner.iter + step = bisect.bisect(self.dynamic_milestones, (progress + 1)) + # Dynamically modify the evaluation interval + self.interval = self.dynamic_intervals[step - 1] + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + self._decide_interval(runner) + super().before_train_epoch(runner) + + def before_train_iter(self, runner): + self._decide_interval(runner) + super().before_train_iter(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. + if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + if not self._should_evaluate(runner): + return + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + results = self.test_fn( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + + key_score = self.evaluate(runner, results) + + if self.save_best: + self._save_ckpt(runner, key_score) diff --git a/mmcv/core/evaluation/indoor_eval.py b/mmcv/core/evaluation/indoor_eval.py new file mode 100644 index 0000000..ff0dac1 --- /dev/null +++ b/mmcv/core/evaluation/indoor_eval.py @@ -0,0 +1,310 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
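# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# How CustomDistEvalHook from eval_hooks.py above interprets dynamic_intervals;
# the milestone/interval numbers are invented for illustration.
from mmcv.core.evaluation.eval_hooks import _calc_dynamic_intervals

milestones, intervals = _calc_dynamic_intervals(4, [(20, 2), (36, 1)])
# milestones == [0, 20, 36], intervals == [4, 2, 1]: evaluate every 4 epochs
# before epoch 20, every 2 epochs until epoch 36, then every epoch;
# _decide_interval() picks the slot by bisecting the current epoch/iter.
# --- end aside ---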
+import numpy as np +import torch +from mmcv.utils import print_log +from terminaltables import AsciiTable + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) \ + or (num_dets, ). + precisions (np.ndarray): Precisions with shape of \ + (num_scales, num_dets) or (num_dets, ). + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or np.ndarray: Calculated average precision. + """ + if recalls.ndim == 1: + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + + assert recalls.shape == precisions.shape + assert recalls.ndim == 2 + + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + return ap + + +def eval_det_cls(pred, gt, iou_thr=None): + """Generic functions to compute precision/recall for object detection for a + single class. + + Args: + pred (dict): Predictions mapping from image id to bounding boxes \ + and scores. + gt (dict): Ground truths mapping from image id to bounding boxes. + iou_thr (list[float]): A list of iou thresholds. + + Return: + tuple (np.ndarray, np.ndarray, float): Recalls, precisions and \ + average precision. 
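# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# Hand-worked example for the average_precision() helper defined above in
# indoor_eval.py ('area' mode); the recall/precision values are made up.
import numpy as np

recalls = np.array([[0.25, 0.5, 0.75, 1.0]])
precisions = np.array([[1.0, 0.8, 0.6, 0.5]])
ap = average_precision(recalls, precisions, mode='area')
# area under the interpolated PR curve:
# 0.25*1.0 + 0.25*0.8 + 0.25*0.6 + 0.25*0.5 == 0.725
# --- end aside ---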
+ """ + + # {img_id: {'bbox': box structure, 'det': matched list}} + class_recs = {} + npos = 0 + for img_id in gt.keys(): + cur_gt_num = len(gt[img_id]) + if cur_gt_num != 0: + gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32) + for i in range(cur_gt_num): + gt_cur[i] = gt[img_id][i].tensor + bbox = gt[img_id][0].new_box(gt_cur) + else: + bbox = gt[img_id] + det = [[False] * len(bbox) for i in iou_thr] + npos += len(bbox) + class_recs[img_id] = {'bbox': bbox, 'det': det} + + # construct dets + image_ids = [] + confidence = [] + ious = [] + for img_id in pred.keys(): + cur_num = len(pred[img_id]) + if cur_num == 0: + continue + pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32) + box_idx = 0 + for box, score in pred[img_id]: + image_ids.append(img_id) + confidence.append(score) + pred_cur[box_idx] = box.tensor + box_idx += 1 + pred_cur = box.new_box(pred_cur) + gt_cur = class_recs[img_id]['bbox'] + if len(gt_cur) > 0: + # calculate iou in each image + iou_cur = pred_cur.overlaps(pred_cur, gt_cur) + for i in range(cur_num): + ious.append(iou_cur[i]) + else: + for i in range(cur_num): + ious.append(np.zeros(1)) + + confidence = np.array(confidence) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + image_ids = [image_ids[x] for x in sorted_ind] + ious = [ious[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp_thr = [np.zeros(nd) for i in iou_thr] + fp_thr = [np.zeros(nd) for i in iou_thr] + for d in range(nd): + R = class_recs[image_ids[d]] + iou_max = -np.inf + BBGT = R['bbox'] + cur_iou = ious[d] + + if len(BBGT) > 0: + # compute overlaps + for j in range(len(BBGT)): + # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) + iou = cur_iou[j] + if iou > iou_max: + iou_max = iou + jmax = j + + for iou_idx, thresh in enumerate(iou_thr): + if iou_max > thresh: + if not R['det'][iou_idx][jmax]: + tp_thr[iou_idx][d] = 1. + R['det'][iou_idx][jmax] = 1 + else: + fp_thr[iou_idx][d] = 1. + else: + fp_thr[iou_idx][d] = 1. + + ret = [] + for iou_idx, thresh in enumerate(iou_thr): + # compute precision recall + fp = np.cumsum(fp_thr[iou_idx]) + tp = np.cumsum(tp_thr[iou_idx]) + recall = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = average_precision(recall, precision) + ret.append((recall, precision, ap)) + + return ret + + +def eval_map_recall(pred, gt, ovthresh=None): + """Evaluate mAP and recall. + + Generic functions to compute precision/recall for object detection + for multiple classes. + + Args: + pred (dict): Information of detection results, + which maps class_id and predictions. + gt (dict): Information of ground truths, which maps class_id and \ + ground truths. + ovthresh (list[float]): iou threshold. + Default: None. + + Return: + tuple[dict]: dict results of recall, AP, and precision for all classes. 
+ """ + + ret_values = {} + for classname in gt.keys(): + if classname in pred: + ret_values[classname] = eval_det_cls(pred[classname], + gt[classname], ovthresh) + recall = [{} for i in ovthresh] + precision = [{} for i in ovthresh] + ap = [{} for i in ovthresh] + + for label in gt.keys(): + for iou_idx, thresh in enumerate(ovthresh): + if label in pred: + recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][ + label] = ret_values[label][iou_idx] + else: + recall[iou_idx][label] = np.zeros(1) + precision[iou_idx][label] = np.zeros(1) + ap[iou_idx][label] = np.zeros(1) + + return recall, precision, ap + + +def indoor_eval(gt_annos, + dt_annos, + metric, + label2cat, + logger=None, + box_type_3d=None, + box_mode_3d=None): + """Indoor Evaluation. + + Evaluate the result of the detection. + + Args: + gt_annos (list[dict]): Ground truth annotations. + dt_annos (list[dict]): Detection annotations. the dict + includes the following keys + + - labels_3d (torch.Tensor): Labels of boxes. + - boxes_3d (:obj:`BaseInstance3DBoxes`): \ + 3D bounding boxes in Depth coordinate. + - scores_3d (torch.Tensor): Scores of boxes. + metric (list[float]): IoU thresholds for computing average precisions. + label2cat (dict): Map from label to category. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. + + Return: + dict[str, float]: Dict of results. + """ + assert len(dt_annos) == len(gt_annos) + pred = {} # map {class_id: pred} + gt = {} # map {class_id: gt} + for img_id in range(len(dt_annos)): + # parse detected annotations + det_anno = dt_annos[img_id] + for i in range(len(det_anno['labels_3d'])): + label = det_anno['labels_3d'].numpy()[i] + bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i] + score = det_anno['scores_3d'].numpy()[i] + if label not in pred: + pred[int(label)] = {} + if img_id not in pred[label]: + pred[int(label)][img_id] = [] + if label not in gt: + gt[int(label)] = {} + if img_id not in gt[label]: + gt[int(label)][img_id] = [] + pred[int(label)][img_id].append((bbox, score)) + + # parse gt annotations + gt_anno = gt_annos[img_id] + if gt_anno['gt_num'] != 0: + gt_boxes = box_type_3d( + gt_anno['gt_boxes_upright_depth'], + box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d) + labels_3d = gt_anno['class'] + else: + gt_boxes = box_type_3d(np.array([], dtype=np.float32)) + labels_3d = np.array([], dtype=np.int64) + + for i in range(len(labels_3d)): + label = labels_3d[i] + bbox = gt_boxes[i] + if label not in gt: + gt[label] = {} + if img_id not in gt[label]: + gt[label][img_id] = [] + gt[label][img_id].append(bbox) + + rec, prec, ap = eval_map_recall(pred, gt, metric) + ret_dict = dict() + header = ['classes'] + table_columns = [[label2cat[label] + for label in ap[0].keys()] + ['Overall']] + + for i, iou_thresh in enumerate(metric): + header.append(f'AP_{iou_thresh:.2f}') + header.append(f'AR_{iou_thresh:.2f}') + rec_list = [] + for label in ap[i].keys(): + ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( + ap[i][label][0]) + ret_dict[f'mAP_{iou_thresh:.2f}'] = float( + np.mean(list(ap[i].values()))) + + table_columns.append(list(map(float, list(ap[i].values())))) + table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + for label in rec[i].keys(): + ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float( + rec[i][label][-1]) + rec_list.append(rec[i][label][-1]) + 
ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list)) + + table_columns.append(list(map(float, rec_list))) + table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']] + table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + return ret_dict diff --git a/mmcv/core/evaluation/kitti_utils/__init__.py b/mmcv/core/evaluation/kitti_utils/__init__.py new file mode 100644 index 0000000..23c1cdf --- /dev/null +++ b/mmcv/core/evaluation/kitti_utils/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .eval import kitti_eval, kitti_eval_coco_style + +__all__ = ['kitti_eval', 'kitti_eval_coco_style'] diff --git a/mmcv/core/evaluation/kitti_utils/eval.py b/mmcv/core/evaluation/kitti_utils/eval.py new file mode 100644 index 0000000..93492c4 --- /dev/null +++ b/mmcv/core/evaluation/kitti_utils/eval.py @@ -0,0 +1,847 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import gc +import io as sysio +import numba +import numpy as np + + +@numba.jit +def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): + scores.sort() + scores = scores[::-1] + current_recall = 0 + thresholds = [] + for i, score in enumerate(scores): + l_recall = (i + 1) / num_gt + if i < (len(scores) - 1): + r_recall = (i + 2) / num_gt + else: + r_recall = l_recall + if (((r_recall - current_recall) < (current_recall - l_recall)) + and (i < (len(scores) - 1))): + continue + # recall = l_recall + thresholds.append(score) + current_recall += 1 / (num_sample_pts - 1.0) + return thresholds + + +def clean_data(gt_anno, dt_anno, current_class, difficulty): + CLASS_NAMES = ['car', 'pedestrian', 'cyclist'] + MIN_HEIGHT = [40, 25, 25] + MAX_OCCLUSION = [0, 1, 2] + MAX_TRUNCATION = [0.15, 0.3, 0.5] + dc_bboxes, ignored_gt, ignored_dt = [], [], [] + current_cls_name = CLASS_NAMES[current_class].lower() + num_gt = len(gt_anno['name']) + num_dt = len(dt_anno['name']) + num_valid_gt = 0 + for i in range(num_gt): + bbox = gt_anno['bbox'][i] + gt_name = gt_anno['name'][i].lower() + height = bbox[3] - bbox[1] + valid_class = -1 + if (gt_name == current_cls_name): + valid_class = 1 + elif (current_cls_name == 'Pedestrian'.lower() + and 'Person_sitting'.lower() == gt_name): + valid_class = 0 + elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name): + valid_class = 0 + else: + valid_class = -1 + ignore = False + if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty]) + or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty]) + or (height <= MIN_HEIGHT[difficulty])): + ignore = True + if valid_class == 1 and not ignore: + ignored_gt.append(0) + num_valid_gt += 1 + elif (valid_class == 0 or (ignore and (valid_class == 1))): + ignored_gt.append(1) + else: + ignored_gt.append(-1) + # for i in range(num_gt): + if gt_anno['name'][i] == 'DontCare': + dc_bboxes.append(gt_anno['bbox'][i]) + for i in range(num_dt): + if (dt_anno['name'][i].lower() == current_cls_name): + valid_class = 1 + else: + valid_class = -1 + height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1]) + if height < MIN_HEIGHT[difficulty]: + ignored_dt.append(1) + elif valid_class == 1: + ignored_dt.append(0) + else: + ignored_dt.append(-1) + + return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes + + +@numba.jit(nopython=True) +def image_box_overlap(boxes, query_boxes, criterion=-1): + N = boxes.shape[0] + K = 
query_boxes.shape[0] + overlaps = np.zeros((N, K), dtype=boxes.dtype) + for k in range(K): + qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) * + (query_boxes[k, 3] - query_boxes[k, 1])) + for n in range(N): + iw = ( + min(boxes[n, 2], query_boxes[k, 2]) - + max(boxes[n, 0], query_boxes[k, 0])) + if iw > 0: + ih = ( + min(boxes[n, 3], query_boxes[k, 3]) - + max(boxes[n, 1], query_boxes[k, 1])) + if ih > 0: + if criterion == -1: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1]) + qbox_area - + iw * ih) + elif criterion == 0: + ua = ((boxes[n, 2] - boxes[n, 0]) * + (boxes[n, 3] - boxes[n, 1])) + elif criterion == 1: + ua = qbox_area + else: + ua = 1.0 + overlaps[n, k] = iw * ih / ua + return overlaps + + +def bev_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) + return riou + + +@numba.jit(nopython=True, parallel=True) +def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1): + # ONLY support overlap in CAMERA, not lidar. + # TODO: change to use prange for parallel mode, should check the difference + N, K = boxes.shape[0], qboxes.shape[0] + for i in numba.prange(N): + for j in numba.prange(K): + if rinc[i, j] > 0: + # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) + iw = ( + min(boxes[i, 1], qboxes[j, 1]) - + max(boxes[i, 1] - boxes[i, 4], + qboxes[j, 1] - qboxes[j, 4])) + + if iw > 0: + area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] + area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] + inc = iw * rinc[i, j] + if criterion == -1: + ua = (area1 + area2 - inc) + elif criterion == 0: + ua = area1 + elif criterion == 1: + ua = area2 + else: + ua = inc + rinc[i, j] = inc / ua + else: + rinc[i, j] = 0.0 + + +def d3_box_overlap(boxes, qboxes, criterion=-1): + from .rotate_iou import rotate_iou_gpu_eval + rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]], + qboxes[:, [0, 2, 3, 5, 6]], 2) + d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) + return rinc + + +@numba.jit(nopython=True) +def compute_statistics_jit(overlaps, + gt_datas, + dt_datas, + ignored_gt, + ignored_det, + dc_bboxes, + metric, + min_overlap, + thresh=0, + compute_fp=False, + compute_aos=False): + + det_size = dt_datas.shape[0] + gt_size = gt_datas.shape[0] + dt_scores = dt_datas[:, -1] + dt_alphas = dt_datas[:, 4] + gt_alphas = gt_datas[:, 4] + dt_bboxes = dt_datas[:, :4] + # gt_bboxes = gt_datas[:, :4] + + assigned_detection = [False] * det_size + ignored_threshold = [False] * det_size + if compute_fp: + for i in range(det_size): + if (dt_scores[i] < thresh): + ignored_threshold[i] = True + NO_DETECTION = -10000000 + tp, fp, fn, similarity = 0, 0, 0, 0 + # thresholds = [0.0] + # delta = [0.0] + thresholds = np.zeros((gt_size, )) + thresh_idx = 0 + delta = np.zeros((gt_size, )) + delta_idx = 0 + for i in range(gt_size): + if ignored_gt[i] == -1: + continue + det_idx = -1 + valid_detection = NO_DETECTION + max_overlap = 0 + assigned_ignored_det = False + + for j in range(det_size): + if (ignored_det[j] == -1): + continue + if (assigned_detection[j]): + continue + if (ignored_threshold[j]): + continue + overlap = overlaps[j, i] + dt_score = dt_scores[j] + if (not compute_fp and (overlap > min_overlap) + and dt_score > valid_detection): + det_idx = j + valid_detection = dt_score + elif (compute_fp and (overlap > min_overlap) + and (overlap > max_overlap or assigned_ignored_det) + and ignored_det[j] == 0): + max_overlap = overlap + det_idx = j + 
valid_detection = 1 + assigned_ignored_det = False + elif (compute_fp and (overlap > min_overlap) + and (valid_detection == NO_DETECTION) + and ignored_det[j] == 1): + det_idx = j + valid_detection = 1 + assigned_ignored_det = True + + if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: + fn += 1 + elif ((valid_detection != NO_DETECTION) + and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): + assigned_detection[det_idx] = True + elif valid_detection != NO_DETECTION: + tp += 1 + # thresholds.append(dt_scores[det_idx]) + thresholds[thresh_idx] = dt_scores[det_idx] + thresh_idx += 1 + if compute_aos: + # delta.append(gt_alphas[i] - dt_alphas[det_idx]) + delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] + delta_idx += 1 + + assigned_detection[det_idx] = True + if compute_fp: + for i in range(det_size): + if (not (assigned_detection[i] or ignored_det[i] == -1 + or ignored_det[i] == 1 or ignored_threshold[i])): + fp += 1 + nstuff = 0 + if metric == 0: + overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) + for i in range(dc_bboxes.shape[0]): + for j in range(det_size): + if (assigned_detection[j]): + continue + if (ignored_det[j] == -1 or ignored_det[j] == 1): + continue + if (ignored_threshold[j]): + continue + if overlaps_dt_dc[j, i] > min_overlap: + assigned_detection[j] = True + nstuff += 1 + fp -= nstuff + if compute_aos: + tmp = np.zeros((fp + delta_idx, )) + # tmp = [0] * fp + for i in range(delta_idx): + tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 + # tmp.append((1.0 + np.cos(delta[i])) / 2.0) + # assert len(tmp) == fp + tp + # assert len(delta) == tp + if tp > 0 or fp > 0: + similarity = np.sum(tmp) + else: + similarity = -1 + return tp, fp, fn, similarity, thresholds[:thresh_idx] + + +def get_split_parts(num, num_part): + same_part = num // num_part + remain_num = num % num_part + if remain_num == 0: + return [same_part] * num_part + else: + return [same_part] * num_part + [remain_num] + + +@numba.jit(nopython=True) +def fused_compute_statistics(overlaps, + pr, + gt_nums, + dt_nums, + dc_nums, + gt_datas, + dt_datas, + dontcares, + ignored_gts, + ignored_dets, + metric, + min_overlap, + thresholds, + compute_aos=False): + gt_num = 0 + dt_num = 0 + dc_num = 0 + for i in range(gt_nums.shape[0]): + for t, thresh in enumerate(thresholds): + overlap = overlaps[dt_num:dt_num + dt_nums[i], + gt_num:gt_num + gt_nums[i]] + + gt_data = gt_datas[gt_num:gt_num + gt_nums[i]] + dt_data = dt_datas[dt_num:dt_num + dt_nums[i]] + ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]] + ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]] + dontcare = dontcares[dc_num:dc_num + dc_nums[i]] + tp, fp, fn, similarity, _ = compute_statistics_jit( + overlap, + gt_data, + dt_data, + ignored_gt, + ignored_det, + dontcare, + metric, + min_overlap=min_overlap, + thresh=thresh, + compute_fp=True, + compute_aos=compute_aos) + pr[t, 0] += tp + pr[t, 1] += fp + pr[t, 2] += fn + if similarity != -1: + pr[t, 3] += similarity + gt_num += gt_nums[i] + dt_num += dt_nums[i] + dc_num += dc_nums[i] + + +def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50): + """Fast iou algorithm. this function can be used independently to do result + analysis. Must be used in CAMERA coordinate system. + + Args: + gt_annos (dict): Must from get_label_annos() in kitti_common.py. + dt_annos (dict): Must from get_label_annos() in kitti_common.py. + metric (int): Eval type. 0: bbox, 1: bev, 2: 3d. + num_parts (int): A parameter for fast calculate algorithm. 
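# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# Quick sanity check for image_box_overlap() above (the metric 0 path); box
# coordinates are made up, and the default criterion=-1 returns plain IoU.
import numpy as np

boxes = np.array([[0., 0., 10., 10.]], dtype=np.float32)
query = np.array([[5., 5., 15., 15.]], dtype=np.float32)
iou = image_box_overlap(boxes, query)
# intersection 5*5 = 25, union 100 + 100 - 25 = 175, so iou[0, 0] ≈ 0.143
# --- end aside ---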
+ """ + assert len(gt_annos) == len(dt_annos) + total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) + total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) + num_examples = len(gt_annos) + split_parts = get_split_parts(num_examples, num_parts) + parted_overlaps = [] + example_idx = 0 + + for num_part in split_parts: + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + if metric == 0: + gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) + dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) + overlap_part = image_box_overlap(gt_boxes, dt_boxes) + elif metric == 1: + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in gt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate( + [a['location'][:, [0, 2]] for a in dt_annos_part], 0) + dims = np.concatenate( + [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = bev_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + elif metric == 2: + loc = np.concatenate([a['location'] for a in gt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) + gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + loc = np.concatenate([a['location'] for a in dt_annos_part], 0) + dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) + rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) + dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], + axis=1) + overlap_part = d3_box_overlap(gt_boxes, + dt_boxes).astype(np.float64) + else: + raise ValueError('unknown metric') + parted_overlaps.append(overlap_part) + example_idx += num_part + overlaps = [] + example_idx = 0 + for j, num_part in enumerate(split_parts): + gt_annos_part = gt_annos[example_idx:example_idx + num_part] + dt_annos_part = dt_annos[example_idx:example_idx + num_part] + gt_num_idx, dt_num_idx = 0, 0 + for i in range(num_part): + gt_box_num = total_gt_num[example_idx + i] + dt_box_num = total_dt_num[example_idx + i] + overlaps.append( + parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num, + dt_num_idx:dt_num_idx + dt_box_num]) + gt_num_idx += gt_box_num + dt_num_idx += dt_box_num + example_idx += num_part + + return overlaps, parted_overlaps, total_gt_num, total_dt_num + + +def _prepare_data(gt_annos, dt_annos, current_class, difficulty): + gt_datas_list = [] + dt_datas_list = [] + total_dc_num = [] + ignored_gts, ignored_dets, dontcares = [], [], [] + total_num_valid_gt = 0 + for i in range(len(gt_annos)): + rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) + num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets + ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) + ignored_dets.append(np.array(ignored_det, dtype=np.int64)) + if len(dc_bboxes) == 0: + dc_bboxes = np.zeros((0, 4)).astype(np.float64) + else: + dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) + total_dc_num.append(dc_bboxes.shape[0]) + dontcares.append(dc_bboxes) + total_num_valid_gt += num_valid_gt + gt_datas = np.concatenate( + [gt_annos[i]['bbox'], 
gt_annos[i]['alpha'][..., np.newaxis]], 1) + dt_datas = np.concatenate([ + dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis], + dt_annos[i]['score'][..., np.newaxis] + ], 1) + gt_datas_list.append(gt_datas) + dt_datas_list.append(dt_datas) + total_dc_num = np.stack(total_dc_num, axis=0) + return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, + total_dc_num, total_num_valid_gt) + + +def eval_class(gt_annos, + dt_annos, + current_classes, + difficultys, + metric, + min_overlaps, + compute_aos=False, + num_parts=200): + """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. + + Args: + gt_annos (dict): Must from get_label_annos() in kitti_common.py. + dt_annos (dict): Must from get_label_annos() in kitti_common.py. + current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist. + difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard + metric (int): Eval type. 0: bbox, 1: bev, 2: 3d + min_overlaps (float): Min overlap. format: + [num_overlap, metric, class]. + num_parts (int): A parameter for fast calculate algorithm + + Returns: + dict[str, np.ndarray]: recall, precision and aos + """ + assert len(gt_annos) == len(dt_annos) + num_examples = len(gt_annos) + if num_examples < num_parts: + num_parts = num_examples + split_parts = get_split_parts(num_examples, num_parts) + + rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) + overlaps, parted_overlaps, total_dt_num, total_gt_num = rets + N_SAMPLE_PTS = 41 + num_minoverlap = len(min_overlaps) + num_class = len(current_classes) + num_difficulty = len(difficultys) + precision = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + recall = np.zeros( + [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) + for m, current_class in enumerate(current_classes): + for idx_l, difficulty in enumerate(difficultys): + rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) + (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, + dontcares, total_dc_num, total_num_valid_gt) = rets + for k, min_overlap in enumerate(min_overlaps[:, metric, m]): + thresholdss = [] + for i in range(len(gt_annos)): + rets = compute_statistics_jit( + overlaps[i], + gt_datas_list[i], + dt_datas_list[i], + ignored_gts[i], + ignored_dets[i], + dontcares[i], + metric, + min_overlap=min_overlap, + thresh=0.0, + compute_fp=False) + tp, fp, fn, similarity, thresholds = rets + thresholdss += thresholds.tolist() + thresholdss = np.array(thresholdss) + thresholds = get_thresholds(thresholdss, total_num_valid_gt) + thresholds = np.array(thresholds) + pr = np.zeros([len(thresholds), 4]) + idx = 0 + for j, num_part in enumerate(split_parts): + gt_datas_part = np.concatenate( + gt_datas_list[idx:idx + num_part], 0) + dt_datas_part = np.concatenate( + dt_datas_list[idx:idx + num_part], 0) + dc_datas_part = np.concatenate( + dontcares[idx:idx + num_part], 0) + ignored_dets_part = np.concatenate( + ignored_dets[idx:idx + num_part], 0) + ignored_gts_part = np.concatenate( + ignored_gts[idx:idx + num_part], 0) + fused_compute_statistics( + parted_overlaps[j], + pr, + total_gt_num[idx:idx + num_part], + total_dt_num[idx:idx + num_part], + total_dc_num[idx:idx + num_part], + gt_datas_part, + dt_datas_part, + dc_datas_part, + ignored_gts_part, + ignored_dets_part, + metric, + min_overlap=min_overlap, + thresholds=thresholds, + compute_aos=compute_aos) + idx += num_part + for i in range(len(thresholds)): + 
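# --- Editor's aside: descriptive note, not part of the original patch ---
# pr accumulates [tp, fp, fn, aos_similarity] per score threshold via
# fused_compute_statistics(); the loop here converts those counts into
# recall/precision (and AOS) values, and the second loop right after applies
# a suffix max so the curves become monotone, the usual AP interpolation.
# --- end aside ---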
recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) + precision[m, idx_l, k, i] = pr[i, 0] / ( + pr[i, 0] + pr[i, 1]) + if compute_aos: + aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) + for i in range(len(thresholds)): + precision[m, idx_l, k, i] = np.max( + precision[m, idx_l, k, i:], axis=-1) + recall[m, idx_l, k, i] = np.max( + recall[m, idx_l, k, i:], axis=-1) + if compute_aos: + aos[m, idx_l, k, i] = np.max( + aos[m, idx_l, k, i:], axis=-1) + ret_dict = { + 'recall': recall, + 'precision': precision, + 'orientation': aos, + } + + # clean temp variables + del overlaps + del parted_overlaps + + gc.collect() + return ret_dict + + +def get_mAP(prec): + sums = 0 + for i in range(0, prec.shape[-1], 4): + sums = sums + prec[..., i] + return sums / 11 * 100 + + +def print_str(value, *arg, sstream=None): + if sstream is None: + sstream = sysio.StringIO() + sstream.truncate(0) + sstream.seek(0) + print(value, *arg, file=sstream) + return sstream.getvalue() + + +def do_eval(gt_annos, + dt_annos, + current_classes, + min_overlaps, + eval_types=['bbox', 'bev', '3d']): + # min_overlaps: [num_minoverlap, metric, num_class] + difficultys = [0, 1, 2] + mAP_bbox = None + mAP_aos = None + if 'bbox' in eval_types: + ret = eval_class( + gt_annos, + dt_annos, + current_classes, + difficultys, + 0, + min_overlaps, + compute_aos=('aos' in eval_types)) + # ret: [num_class, num_diff, num_minoverlap, num_sample_points] + mAP_bbox = get_mAP(ret['precision']) + if 'aos' in eval_types: + mAP_aos = get_mAP(ret['orientation']) + + mAP_bev = None + if 'bev' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, + min_overlaps) + mAP_bev = get_mAP(ret['precision']) + + mAP_3d = None + if '3d' in eval_types: + ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, + min_overlaps) + mAP_3d = get_mAP(ret['precision']) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, + compute_aos): + # overlap_ranges: [range, metric, num_class] + min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) + for i in range(overlap_ranges.shape[1]): + for j in range(overlap_ranges.shape[2]): + min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) + mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + compute_aos) + # ret: [num_class, num_diff, num_minoverlap] + mAP_bbox = mAP_bbox.mean(-1) + mAP_bev = mAP_bev.mean(-1) + mAP_3d = mAP_3d.mean(-1) + if mAP_aos is not None: + mAP_aos = mAP_aos.mean(-1) + return mAP_bbox, mAP_bev, mAP_3d, mAP_aos + + +def kitti_eval(gt_annos, + dt_annos, + current_classes, + eval_types=['bbox', 'bev', '3d']): + """KITTI evaluation. + + Args: + gt_annos (list[dict]): Contain gt information of each sample. + dt_annos (list[dict]): Contain detected information of each sample. + current_classes (list[str]): Classes to evaluation. + eval_types (list[str], optional): Types to eval. + Defaults to ['bbox', 'bev', '3d']. + + Returns: + tuple: String and dict of evaluation results. 
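# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# Typical call, assuming gt_annos/dt_annos follow the kitti_common.py
# annotation format referenced in the docstrings above.
result_str, result_dict = kitti_eval(
    gt_annos, dt_annos,
    current_classes=['Car', 'Pedestrian', 'Cyclist'],
    eval_types=['bbox', 'bev', '3d'])
print(result_str)                                  # human-readable AP table
result_dict['KITTI/Car_3D_moderate_strict']        # scalar suitable for logger hooks
# --- end aside ---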
+ """ + assert len(eval_types) > 0, 'must contain at least one evaluation type' + if 'aos' in eval_types: + assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos' + overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, + 0.5], [0.7, 0.5, 0.5, 0.7, 0.5], + [0.7, 0.5, 0.5, 0.7, 0.5]]) + overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], + [0.5, 0.25, 0.25, 0.5, 0.25], + [0.5, 0.25, 0.25, 0.5, 0.25]]) + min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5] + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + for curcls in current_classes: + if isinstance(curcls, str): + current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + min_overlaps = min_overlaps[:, :, current_classes] + result = '' + # check whether alpha is valid + compute_aos = False + pred_alpha = False + valid_alpha_gt = False + for anno in dt_annos: + mask = (anno['alpha'] != -10) + if anno['alpha'][mask].shape[0] != 0: + pred_alpha = True + break + for anno in gt_annos: + if anno['alpha'][0] != -10: + valid_alpha_gt = True + break + compute_aos = (pred_alpha and valid_alpha_gt) + if compute_aos: + eval_types.append('aos') + + mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos, + current_classes, min_overlaps, + eval_types) + + ret_dict = {} + difficulty = ['easy', 'moderate', 'hard'] + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + curcls_name = class_to_name[curcls] + for i in range(min_overlaps.shape[0]): + # prepare results for print + result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format( + curcls_name, *min_overlaps[i, :, j])) + if mAPbbox is not None: + result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAPbbox[j, :, i]) + if mAPbev is not None: + result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAPbev[j, :, i]) + if mAP3d is not None: + result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format( + *mAP3d[j, :, i]) + + if compute_aos: + result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format( + *mAPaos[j, :, i]) + + # prepare results for logger + for idx in range(3): + if i == 0: + postfix = f'{difficulty[idx]}_strict' + else: + postfix = f'{difficulty[idx]}_loose' + prefix = f'KITTI/{curcls_name}' + if mAP3d is not None: + ret_dict[f'{prefix}_3D_{postfix}'] = mAP3d[j, idx, i] + if mAPbev is not None: + ret_dict[f'{prefix}_BEV_{postfix}'] = mAPbev[j, idx, i] + if mAPbbox is not None: + ret_dict[f'{prefix}_2D_{postfix}'] = mAPbbox[j, idx, i] + + # calculate mAP over all classes if there are multiple classes + if len(current_classes) > 1: + # prepare results for print + result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty)) + if mAPbbox is not None: + mAPbbox = mAPbbox.mean(axis=0) + result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, 0]) + if mAPbev is not None: + mAPbev = mAPbev.mean(axis=0) + result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, 0]) + if mAP3d is not None: + mAP3d = mAP3d.mean(axis=0) + result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0]) + if compute_aos: + mAPaos = mAPaos.mean(axis=0) + result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, 0]) + + # prepare results for logger + for idx in range(3): + postfix = 
f'{difficulty[idx]}' + if mAP3d is not None: + ret_dict[f'KITTI/Overall_3D_{postfix}'] = mAP3d[idx, 0] + if mAPbev is not None: + ret_dict[f'KITTI/Overall_BEV_{postfix}'] = mAPbev[idx, 0] + if mAPbbox is not None: + ret_dict[f'KITTI/Overall_2D_{postfix}'] = mAPbbox[idx, 0] + + return result, ret_dict + + +def kitti_eval_coco_style(gt_annos, dt_annos, current_classes): + """coco style evaluation of kitti. + + Args: + gt_annos (list[dict]): Contain gt information of each sample. + dt_annos (list[dict]): Contain detected information of each sample. + current_classes (list[str]): Classes to evaluation. + + Returns: + string: Evaluation results. + """ + class_to_name = { + 0: 'Car', + 1: 'Pedestrian', + 2: 'Cyclist', + 3: 'Van', + 4: 'Person_sitting', + } + class_to_range = { + 0: [0.5, 0.95, 10], + 1: [0.25, 0.7, 10], + 2: [0.25, 0.7, 10], + 3: [0.5, 0.95, 10], + 4: [0.25, 0.7, 10], + } + name_to_class = {v: n for n, v in class_to_name.items()} + if not isinstance(current_classes, (list, tuple)): + current_classes = [current_classes] + current_classes_int = [] + for curcls in current_classes: + if isinstance(curcls, str): + current_classes_int.append(name_to_class[curcls]) + else: + current_classes_int.append(curcls) + current_classes = current_classes_int + overlap_ranges = np.zeros([3, 3, len(current_classes)]) + for i, curcls in enumerate(current_classes): + overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, + np.newaxis] + result = '' + # check whether alpha is valid + compute_aos = False + for anno in dt_annos: + if anno['alpha'].shape[0] != 0: + if anno['alpha'][0] != -10: + compute_aos = True + break + mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( + gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) + for j, curcls in enumerate(current_classes): + # mAP threshold array: [num_minoverlap, metric, class] + # mAP result: [num_class, num_diff, num_minoverlap] + o_range = np.array(class_to_range[curcls])[[0, 2, 1]] + o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) + result += print_str((f'{class_to_name[curcls]} ' + 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) + result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' + f'{mAPbbox[j, 1]:.2f}, ' + f'{mAPbbox[j, 2]:.2f}')) + result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' + f'{mAPbev[j, 1]:.2f}, ' + f'{mAPbev[j, 2]:.2f}')) + result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' + f'{mAP3d[j, 1]:.2f}, ' + f'{mAP3d[j, 2]:.2f}')) + if compute_aos: + result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' + f'{mAPaos[j, 1]:.2f}, ' + f'{mAPaos[j, 2]:.2f}')) + return result diff --git a/mmcv/core/evaluation/kitti_utils/rotate_iou.py b/mmcv/core/evaluation/kitti_utils/rotate_iou.py new file mode 100644 index 0000000..2f0c9c8 --- /dev/null +++ b/mmcv/core/evaluation/kitti_utils/rotate_iou.py @@ -0,0 +1,379 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
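# --- Editor's aside (illustrative sketch, not part of the original patch) ---
# The COCO-style variant above averages AP over a per-class IoU sweep
# (0.50:0.05:0.95 for Car/Van, 0.25:0.05:0.70 for Pedestrian/Cyclist);
# gt_annos/dt_annos are assumed to be in the same format as for kitti_eval().
result_str = kitti_eval_coco_style(gt_annos, dt_annos, current_classes=['Car'])
print(result_str)   # e.g. 'Car coco AP@0.50:0.05:0.95: ...'
# --- end aside ---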
+##################### +# Based on https://github.com/hongzhenwang/RRPN-revise +# Licensed under The MIT License +# Author: yanyan, scrin@foxmail.com +##################### +import math +import numba +import numpy as np +from numba import cuda + + +@numba.jit(nopython=True) +def div_up(m, n): + return m // n + (m % n > 0) + + +@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +def trangle_area(a, b, c): + return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * + (b[0] - c[0])) / 2.0 + + +@cuda.jit('(float32[:], int32)', device=True, inline=True) +def area(int_pts, num_of_inter): + area_val = 0.0 + for i in range(num_of_inter - 2): + area_val += abs( + trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], + int_pts[2 * i + 4:2 * i + 6])) + return area_val + + +@cuda.jit('(float32[:], int32)', device=True, inline=True) +def sort_vertex_in_convex_polygon(int_pts, num_of_inter): + if num_of_inter > 0: + center = cuda.local.array((2, ), dtype=numba.float32) + center[:] = 0.0 + for i in range(num_of_inter): + center[0] += int_pts[2 * i] + center[1] += int_pts[2 * i + 1] + center[0] /= num_of_inter + center[1] /= num_of_inter + v = cuda.local.array((2, ), dtype=numba.float32) + vs = cuda.local.array((16, ), dtype=numba.float32) + for i in range(num_of_inter): + v[0] = int_pts[2 * i] - center[0] + v[1] = int_pts[2 * i + 1] - center[1] + d = math.sqrt(v[0] * v[0] + v[1] * v[1]) + v[0] = v[0] / d + v[1] = v[1] / d + if v[1] < 0: + v[0] = -2 - v[0] + vs[i] = v[0] + j = 0 + temp = 0 + for i in range(1, num_of_inter): + if vs[i - 1] > vs[i]: + temp = vs[i] + tx = int_pts[2 * i] + ty = int_pts[2 * i + 1] + j = i + while j > 0 and vs[j - 1] > temp: + vs[j] = vs[j - 1] + int_pts[j * 2] = int_pts[j * 2 - 2] + int_pts[j * 2 + 1] = int_pts[j * 2 - 1] + j -= 1 + + vs[j] = temp + int_pts[j * 2] = tx + int_pts[j * 2 + 1] = ty + + +@cuda.jit( + '(float32[:], float32[:], int32, int32, float32[:])', + device=True, + inline=True) +def line_segment_intersection(pts1, pts2, i, j, temp_pts): + A = cuda.local.array((2, ), dtype=numba.float32) + B = cuda.local.array((2, ), dtype=numba.float32) + C = cuda.local.array((2, ), dtype=numba.float32) + D = cuda.local.array((2, ), dtype=numba.float32) + + A[0] = pts1[2 * i] + A[1] = pts1[2 * i + 1] + + B[0] = pts1[2 * ((i + 1) % 4)] + B[1] = pts1[2 * ((i + 1) % 4) + 1] + + C[0] = pts2[2 * j] + C[1] = pts2[2 * j + 1] + + D[0] = pts2[2 * ((j + 1) % 4)] + D[1] = pts2[2 * ((j + 1) % 4) + 1] + BA0 = B[0] - A[0] + BA1 = B[1] - A[1] + DA0 = D[0] - A[0] + CA0 = C[0] - A[0] + DA1 = D[1] - A[1] + CA1 = C[1] - A[1] + acd = DA1 * CA0 > CA1 * DA0 + bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) + if acd != bcd: + abc = CA1 * BA0 > BA1 * CA0 + abd = DA1 * BA0 > BA1 * DA0 + if abc != abd: + DC0 = D[0] - C[0] + DC1 = D[1] - C[1] + ABBA = A[0] * B[1] - B[0] * A[1] + CDDC = C[0] * D[1] - D[0] * C[1] + DH = BA1 * DC0 - BA0 * DC1 + Dx = ABBA * DC0 - BA0 * CDDC + Dy = ABBA * DC1 - BA1 * CDDC + temp_pts[0] = Dx / DH + temp_pts[1] = Dy / DH + return True + return False + + +@cuda.jit( + '(float32[:], float32[:], int32, int32, float32[:])', + device=True, + inline=True) +def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): + a = cuda.local.array((2, ), dtype=numba.float32) + b = cuda.local.array((2, ), dtype=numba.float32) + c = cuda.local.array((2, ), dtype=numba.float32) + d = cuda.local.array((2, ), dtype=numba.float32) + + a[0] = pts1[2 * i] + a[1] = pts1[2 * i + 1] + + b[0] = pts1[2 * ((i + 1) % 4)] + b[1] = pts1[2 * ((i + 1) % 4) + 1] + + 
c[0] = pts2[2 * j] + c[1] = pts2[2 * j + 1] + + d[0] = pts2[2 * ((j + 1) % 4)] + d[1] = pts2[2 * ((j + 1) % 4) + 1] + + area_abc = trangle_area(a, b, c) + area_abd = trangle_area(a, b, d) + + if area_abc * area_abd >= 0: + return False + + area_cda = trangle_area(c, d, a) + area_cdb = area_cda + area_abc - area_abd + + if area_cda * area_cdb >= 0: + return False + t = area_cda / (area_abd - area_abc) + + dx = t * (b[0] - a[0]) + dy = t * (b[1] - a[1]) + temp_pts[0] = a[0] + dx + temp_pts[1] = a[1] + dy + return True + + +@cuda.jit('(float32, float32, float32[:])', device=True, inline=True) +def point_in_quadrilateral(pt_x, pt_y, corners): + ab0 = corners[2] - corners[0] + ab1 = corners[3] - corners[1] + + ad0 = corners[6] - corners[0] + ad1 = corners[7] - corners[1] + + ap0 = pt_x - corners[0] + ap1 = pt_y - corners[1] + + abab = ab0 * ab0 + ab1 * ab1 + abap = ab0 * ap0 + ab1 * ap1 + adad = ad0 * ad0 + ad1 * ad1 + adap = ad0 * ap0 + ad1 * ap1 + + return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 + + +@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True) +def quadrilateral_intersection(pts1, pts2, int_pts): + num_of_inter = 0 + for i in range(4): + if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): + int_pts[num_of_inter * 2] = pts1[2 * i] + int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] + num_of_inter += 1 + if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): + int_pts[num_of_inter * 2] = pts2[2 * i] + int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] + num_of_inter += 1 + temp_pts = cuda.local.array((2, ), dtype=numba.float32) + for i in range(4): + for j in range(4): + has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) + if has_pts: + int_pts[num_of_inter * 2] = temp_pts[0] + int_pts[num_of_inter * 2 + 1] = temp_pts[1] + num_of_inter += 1 + + return num_of_inter + + +@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +def rbbox_to_corners(corners, rbbox): + # generate clockwise corners and rotate it clockwise + angle = rbbox[4] + a_cos = math.cos(angle) + a_sin = math.sin(angle) + center_x = rbbox[0] + center_y = rbbox[1] + x_d = rbbox[2] + y_d = rbbox[3] + corners_x = cuda.local.array((4, ), dtype=numba.float32) + corners_y = cuda.local.array((4, ), dtype=numba.float32) + corners_x[0] = -x_d / 2 + corners_x[1] = -x_d / 2 + corners_x[2] = x_d / 2 + corners_x[3] = x_d / 2 + corners_y[0] = -y_d / 2 + corners_y[1] = y_d / 2 + corners_y[2] = y_d / 2 + corners_y[3] = -y_d / 2 + for i in range(4): + corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x + corners[2 * i + + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y + + +@cuda.jit('(float32[:], float32[:])', device=True, inline=True) +def inter(rbbox1, rbbox2): + """Compute intersection of two rotated boxes. + + Args: + rbox1 (np.ndarray, shape=[5]): Rotated 2d box. + rbox2 (np.ndarray, shape=[5]): Rotated 2d box. + + Returns: + float: Intersection of two rotated boxes. 
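# --- Editor's aside: descriptive note, not part of the original patch ---
# inter() and the helpers above are numba.cuda device functions, so they can
# only be called from inside a kernel such as rotate_iou_kernel_eval() below,
# not from host Python; the host entry point is rotate_iou_gpu_eval().
# --- end aside ---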
+ """ + corners1 = cuda.local.array((8, ), dtype=numba.float32) + corners2 = cuda.local.array((8, ), dtype=numba.float32) + intersection_corners = cuda.local.array((16, ), dtype=numba.float32) + + rbbox_to_corners(corners1, rbbox1) + rbbox_to_corners(corners2, rbbox2) + + num_intersection = quadrilateral_intersection(corners1, corners2, + intersection_corners) + sort_vertex_in_convex_polygon(intersection_corners, num_intersection) + # print(intersection_corners.reshape([-1, 2])[:num_intersection]) + + return area(intersection_corners, num_intersection) + + +@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True) +def devRotateIoUEval(rbox1, rbox2, criterion=-1): + """Compute rotated iou on device. + + Args: + rbox1 (np.ndarray, shape=[5]): Rotated 2d box. + rbox2 (np.ndarray, shape=[5]): Rotated 2d box. + criterion (int, optional): Indicate different type of iou. + -1 indicate `area_inter / (area1 + area2 - area_inter)`, + 0 indicate `area_inter / area1`, + 1 indicate `area_inter / area2`. + + Returns: + float: iou between two input boxes. + """ + area1 = rbox1[2] * rbox1[3] + area2 = rbox2[2] * rbox2[3] + area_inter = inter(rbox1, rbox2) + if criterion == -1: + return area_inter / (area1 + area2 - area_inter) + elif criterion == 0: + return area_inter / area1 + elif criterion == 1: + return area_inter / area2 + else: + return area_inter + + +@cuda.jit( + '(int64, int64, float32[:], float32[:], float32[:], int32)', + fastmath=False) +def rotate_iou_kernel_eval(N, + K, + dev_boxes, + dev_query_boxes, + dev_iou, + criterion=-1): + """Kernel of computing rotated iou. + + Args: + N (int): The number of boxes. + K (int): The number of query boxes. + dev_boxes (np.ndarray): Boxes on device. + dev_query_boxes (np.ndarray): Query boxes on device. + dev_iou (np.ndarray): Computed iou to return. + criterion (int, optional): Indicate different type of iou. + -1 indicate `area_inter / (area1 + area2 - area_inter)`, + 0 indicate `area_inter / area1`, + 1 indicate `area_inter / area2`. 
+ """ + threadsPerBlock = 8 * 8 + row_start = cuda.blockIdx.x + col_start = cuda.blockIdx.y + tx = cuda.threadIdx.x + row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) + col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) + block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) + + dev_query_box_idx = threadsPerBlock * col_start + tx + dev_box_idx = threadsPerBlock * row_start + tx + if (tx < col_size): + block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] + block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] + block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] + block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] + block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] + if (tx < row_size): + block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] + block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] + block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] + block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] + block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] + cuda.syncthreads() + if tx < row_size: + for i in range(col_size): + offset = ( + row_start * threadsPerBlock * K + col_start * threadsPerBlock + + tx * K + i) + dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5], + block_boxes[tx * 5:tx * 5 + 5], + criterion) + + +def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): + """Rotated box iou running in gpu. 500x faster than cpu version (take 5ms + in one example with numba.cuda code). convert from [this project]( + https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). + + Args: + boxes (torch.Tensor): rbboxes. format: centers, dims, + angles(clockwise when positive) with the shape of [N, 5]. + query_boxes (float tensor: [K, 5]): rbboxes to compute iou with boxes. + device_id (int, optional): Defaults to 0. Device to use. + criterion (int, optional): Indicate different type of iou. + -1 indicate `area_inter / (area1 + area2 - area_inter)`, + 0 indicate `area_inter / area1`, + 1 indicate `area_inter / area2`. + + Returns: + np.ndarray: IoU results. + """ + boxes = boxes.astype(np.float32) + query_boxes = query_boxes.astype(np.float32) + N = boxes.shape[0] + K = query_boxes.shape[0] + iou = np.zeros((N, K), dtype=np.float32) + if N == 0 or K == 0: + return iou + threadsPerBlock = 8 * 8 + cuda.select_device(device_id) + blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock)) + + stream = cuda.stream() + with stream.auto_synchronize(): + boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) + query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) + iou_dev = cuda.to_device(iou.reshape([-1]), stream) + rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, + stream](N, K, boxes_dev, query_boxes_dev, + iou_dev, criterion) + iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) + return iou.astype(boxes.dtype) diff --git a/mmcv/core/evaluation/lyft_eval.py b/mmcv/core/evaluation/lyft_eval.py new file mode 100644 index 0000000..bfb95a1 --- /dev/null +++ b/mmcv/core/evaluation/lyft_eval.py @@ -0,0 +1,284 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap, + get_class_names, + get_ious, + group_by_key, + wrap_in_box) +from mmcv.utils import print_log, track_iter_progress +from mmcv.fileio.io import dump, load +from os import path as osp +from terminaltables import AsciiTable + + +# def load_lyft_gts(lyft, data_root, eval_split, logger=None): +# """Loads ground truth boxes from database. + +# Args: +# lyft (:obj:`LyftDataset`): Lyft class in the sdk. +# data_root (str): Root of data for reading splits. +# eval_split (str): Name of the split for evaluation. +# logger (logging.Logger | str | None): Logger used for printing +# related information during evaluation. Default: None. + +# Returns: +# list[dict]: List of annotation dictionaries. +# """ +# split_scenes = mmcv.list_from_file( +# osp.join(data_root, f'{eval_split}.txt')) + +# # Read out all sample_tokens in DB. +# sample_tokens_all = [s['token'] for s in lyft.sample] +# assert len(sample_tokens_all) > 0, 'Error: Database has no samples!' + +# if eval_split == 'test': +# # Check that you aren't trying to cheat :) +# assert len(lyft.sample_annotation) > 0, \ +# 'Error: You are trying to evaluate on the test set \ +# but you do not have the annotations!' + +# sample_tokens = [] +# for sample_token in sample_tokens_all: +# scene_token = lyft.get('sample', sample_token)['scene_token'] +# scene_record = lyft.get('scene', scene_token) +# if scene_record['name'] in split_scenes: +# sample_tokens.append(sample_token) + +# all_annotations = [] + +# print_log('Loading ground truth annotations...', logger=logger) +# # Load annotations and filter predictions and annotations. +# for sample_token in track_iter_progress(sample_tokens): +# sample = lyft.get('sample', sample_token) +# sample_annotation_tokens = sample['anns'] +# for sample_annotation_token in sample_annotation_tokens: +# # Get label name in detection task and filter unused labels. +# sample_annotation = \ +# lyft.get('sample_annotation', sample_annotation_token) +# detection_name = sample_annotation['category_name'] +# if detection_name is None: +# continue +# annotation = { +# 'sample_token': sample_token, +# 'translation': sample_annotation['translation'], +# 'size': sample_annotation['size'], +# 'rotation': sample_annotation['rotation'], +# 'name': detection_name, +# } +# all_annotations.append(annotation) + +# return all_annotations + + +def load_lyft_predictions(res_path): + """Load Lyft predictions from json file. + + Args: + res_path (str): Path of result json file recording detections. + + Returns: + list[dict]: List of prediction dictionaries. + """ + predictions = load(res_path) + predictions = predictions['results'] + all_preds = [] + for sample_token in predictions.keys(): + all_preds.extend(predictions[sample_token]) + return all_preds + + +def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None): + """Evaluation API for Lyft dataset. + + Args: + lyft (:obj:`LyftDataset`): Lyft class in the sdk. + data_root (str): Root of data for reading splits. + res_path (str): Path of result json file recording detections. + eval_set (str): Name of the split for evaluation. + output_dir (str): Output directory for output json files. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str, float]: The evaluation results. 
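For context, a hedged sketch of how `lyft_eval` might be driven end to end; every path below is hypothetical. Note that the function body calls `load_lyft_gts`, which is commented out near the top of this file, so that helper has to be restored (or re-implemented) before this call can succeed.

from lyft_dataset_sdk.lyftdataset import LyftDataset

lyft = LyftDataset(data_path='data/lyft',             # hypothetical dataset root
                   json_path='data/lyft/train_data',  # hypothetical annotation dir
                   verbose=True)
metrics = lyft_eval(lyft,
                    data_root='data/lyft',
                    res_path='work_dirs/results_lyft.json',
                    eval_set='val',
                    output_dir='work_dirs',
                    logger=None)
print(metrics['Final mAP'])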
+ """ + # evaluate by lyft metrics + gts = load_lyft_gts(lyft, data_root, eval_set, logger) + predictions = load_lyft_predictions(res_path) + + class_names = get_class_names(gts) + print('Calculating mAP@0.5:0.95...') + + iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] + metrics = {} + average_precisions = \ + get_classwise_aps(gts, predictions, class_names, iou_thresholds) + APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]] + + mAPs = np.mean(average_precisions, axis=0) + mAPs_cate = np.mean(average_precisions, axis=1) + final_mAP = np.mean(mAPs) + + metrics['average_precisions'] = average_precisions.tolist() + metrics['mAPs'] = mAPs.tolist() + metrics['Final mAP'] = float(final_mAP) + metrics['class_names'] = class_names + metrics['mAPs_cate'] = mAPs_cate.tolist() + + APs_data = [['class', 'mAP@0.5:0.95']] + for i in range(len(class_names)): + row = [class_names[i], round(mAPs_cate[i], 3)] + APs_data.append(row) + APs_data.append(['Overall', round(final_mAP, 3)]) + APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95') + APs_table.inner_footing_row_border = True + print_log(APs_table.table, logger=logger) + + res_path = osp.join(output_dir, 'lyft_metrics.json') + dump(metrics, res_path) + return metrics + + +def get_classwise_aps(gt, predictions, class_names, iou_thresholds): + """Returns an array with an average precision per class. + + Note: Ground truth and predictions should have the following format. + + .. code-block:: + + gt = [{ + 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 + fbb039a550991a5149214f98cec136ac', + 'translation': [974.2811881299899, 1714.6815014457964, + -23.689857123368846], + 'size': [1.796, 4.488, 1.664], + 'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121], + 'name': 'car' + }] + + predictions = [{ + 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 + fbb039a550991a5149214f98cec136ac', + 'translation': [971.8343488872263, 1713.6816097857359, + -25.82534357061308], + 'size': [2.519726579986132, 7.810161372666739, 3.483438286096803], + 'rotation': [0.10913582721095375, 0.04099572636992043, + 0.01927712319721745, 1.029328402625659], + 'name': 'car', + 'score': 0.3077029437237213 + }] + + Args: + gt (list[dict]): list of dictionaries in the format described below. + predictions (list[dict]): list of dictionaries in the format + described below. + class_names (list[str]): list of the class names. + iou_thresholds (list[float]): IOU thresholds used to calculate + TP / FN + + Returns: + np.ndarray: an array with an average precision per class. + """ + assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds]) + + gt_by_class_name = group_by_key(gt, 'name') + pred_by_class_name = group_by_key(predictions, 'name') + + average_precisions = np.zeros((len(class_names), len(iou_thresholds))) + + for class_id, class_name in enumerate(class_names): + if class_name in pred_by_class_name: + recalls, precisions, average_precision = get_single_class_aps( + gt_by_class_name[class_name], pred_by_class_name[class_name], + iou_thresholds) + average_precisions[class_id, :] = average_precision + + return average_precisions + + +def get_single_class_aps(gt, predictions, iou_thresholds): + """Compute recall and precision for all iou thresholds. Adapted from + LyftDatasetDevkit. + + Args: + gt (list[dict]): list of dictionaries in the format described above. + predictions (list[dict]): list of dictionaries in the format \ + described below. 
+ iou_thresholds (list[float]): IOU thresholds used to calculate \ + TP / FN + + Returns: + tuple[np.ndarray]: Returns (recalls, precisions, average precisions) + for each class. + """ + num_gts = len(gt) + image_gts = group_by_key(gt, 'sample_token') + image_gts = wrap_in_box(image_gts) + + sample_gt_checked = { + sample_token: np.zeros((len(boxes), len(iou_thresholds))) + for sample_token, boxes in image_gts.items() + } + + predictions = sorted(predictions, key=lambda x: x['score'], reverse=True) + + # go down dets and mark TPs and FPs + num_predictions = len(predictions) + tps = np.zeros((num_predictions, len(iou_thresholds))) + fps = np.zeros((num_predictions, len(iou_thresholds))) + + for prediction_index, prediction in enumerate(predictions): + predicted_box = Box3D(**prediction) + + sample_token = prediction['sample_token'] + + max_overlap = -np.inf + jmax = -1 + + if sample_token in image_gts: + gt_boxes = image_gts[sample_token] + # gt_boxes per sample + gt_checked = sample_gt_checked[sample_token] + # gt flags per sample + else: + gt_boxes = [] + gt_checked = None + + if len(gt_boxes) > 0: + overlaps = get_ious(gt_boxes, predicted_box) + + max_overlap = np.max(overlaps) + + jmax = np.argmax(overlaps) + + for i, iou_threshold in enumerate(iou_thresholds): + if max_overlap > iou_threshold: + if gt_checked[jmax, i] == 0: + tps[prediction_index, i] = 1.0 + gt_checked[jmax, i] = 1 + else: + fps[prediction_index, i] = 1.0 + else: + fps[prediction_index, i] = 1.0 + + # compute precision recall + fps = np.cumsum(fps, axis=0) + tps = np.cumsum(tps, axis=0) + + recalls = tps / float(num_gts) + # avoid divide by zero in case the first detection + # matches a difficult ground truth + precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps) + + aps = [] + for i in range(len(iou_thresholds)): + recall = recalls[:, i] + precision = precisions[:, i] + assert np.all(0 <= recall) & np.all(recall <= 1) + assert np.all(0 <= precision) & np.all(precision <= 1) + ap = get_ap(recall, precision) + aps.append(ap) + + aps = np.array(aps) + + return recalls, precisions, aps diff --git a/mmcv/core/evaluation/mean_ap.py b/mmcv/core/evaluation/mean_ap.py new file mode 100644 index 0000000..dca238b --- /dev/null +++ b/mmcv/core/evaluation/mean_ap.py @@ -0,0 +1,467 @@ +from multiprocessing import Pool +import numpy as np +from mmcv.utils import print_log, is_str +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps +from .class_names import get_classes + + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). 
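Before the implementation details, a small hedged worked example of the shared precision/recall-to-AP step used throughout this file: the per-detection hit flags below are made up, cumulative sums turn them into a precision-recall curve, and `average_precision` integrates under it in either of its two modes.

import numpy as np

tp = np.array([1, 1, 0, 1, 0], dtype=np.float32)  # hits for detections sorted by score
fp = 1.0 - tp
num_gts = 4
recalls = np.cumsum(tp) / num_gts
precisions = np.cumsum(tp) / (np.cumsum(tp) + np.cumsum(fp))
print(average_precision(recalls, precisions, mode='area'))      # area under the PR curve
print(average_precision(recalls, precisions, mode='11points'))  # 11-point interpolation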
+ + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + default_iou_thr=0.5, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + default_iou_thr (float): IoU threshold to be considered as matched for + medium and large bboxes (small ones have special rules). + Default: 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. Default: None. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], dtype=np.bool), + np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes - 1) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlapped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. it matches a gt, tp = 1, fp = 0 + # 2. it matches an ignored gt, tp = 0, fp = 0 + # 3. it matches no gt and within area range, tp = 0, fp = 1 + # 4. it matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, + gt_bboxes, + gt_bboxes_ignore=None, + iou_thr=0.5, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + area_ranges (list[tuple] | None): Range of bbox areas to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. Default: None. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + # an indicator of ignored gts + gt_ignore_inds = np.concatenate( + (np.zeros(gt_bboxes.shape[0], dtype=np.bool), + np.ones(gt_bboxes_ignore.shape[0], dtype=np.bool))) + # stack gt_bboxes and gt_bboxes_ignore for convenience + gt_bboxes = np.vstack((gt_bboxes, gt_bboxes_ignore)) + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * ( + det_bboxes[:, 3] - det_bboxes[:, 1]) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + + ious = bbox_overlaps(det_bboxes, gt_bboxes) + # for each det, the max iou with all gts + ious_max = ious.max(axis=1) + # for each det, which gt overlaps most with it + ious_argmax = ious.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1]) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore_inds[matched_gt] + or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def get_cls_results(det_results, annotations, class_id): + """Get det results and gt information of a certain class. + + Args: + det_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. + + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes, ignored gt bboxes + """ + cls_dets = [img_res[class_id] for img_res in det_results] + cls_gts = [] + cls_gts_ignore = [] + for ann in annotations: + gt_inds = ann['labels'] == class_id + cls_gts.append(ann['bboxes'][gt_inds, :]) + + if ann.get('labels_ignore', None) is not None: + ignore_inds = ann['labels_ignore'] == class_id + cls_gts_ignore.append(ann['bboxes_ignore'][ignore_inds, :]) + else: + cls_gts_ignore.append(np.empty((0, 4), dtype=np.float32)) + + return cls_dets, cls_gts, cls_gts_ignore + + +def eval_map(det_results, + annotations, + scale_ranges=None, + iou_thr=0.5, + dataset=None, + logger=None, + tpfp_fn=None, + nproc=4): + """Evaluate mAP of a dataset. + + Args: + det_results (list[list]): [[cls1_det, cls2_det, ...], ...]. + The outer list indicates images, and the inner list indicates + per-class detected bboxes. + annotations (list[dict]): Ground truth annotations where each item of + the list indicates an image. Keys of annotations are: + + - `bboxes`: numpy array of shape (n, 4) + - `labels`: numpy array of shape (n, ) + - `bboxes_ignore` (optional): numpy array of shape (k, 4) + - `labels_ignore` (optional): numpy array of shape (k, ) + scale_ranges (list[tuple] | None): Range of scales to be evaluated, + in the format [(min1, max1), (min2, max2), ...]. A range of + (32, 64) means the area range between (32**2, 64**2). + Default: None. + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + dataset (list[str] | str | None): Dataset name or dataset classes, + there are minor differences in metrics for different datsets, e.g. + "voc07", "imagenet_det", etc. Default: None. + logger (logging.Logger | str | None): The way to print the mAP + summary. 
See `mmcv.utils.print_log()` for details. Default: None. + tpfp_fn (callable | None): The function used to determine true/ + false positives. If None, :func:`tpfp_default` is used as default + unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this + case). If it is given as a function, then this function is used + to evaluate tp & fp. Default None. + nproc (int): Processes used for computing TP and FP. + Default: 4. + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(annotations) + + num_imgs = len(det_results) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + num_classes = len(det_results[0]) # positive class num + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + + pool = Pool(nproc) + eval_results = [] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gts_ignore = get_cls_results( + det_results, annotations, i) + # choose proper function according to datasets to compute tp and fp + if tpfp_fn is None: + if dataset in ['det', 'vid']: + tpfp_fn = tpfp_imagenet + else: + tpfp_fn = tpfp_default + if not callable(tpfp_fn): + raise ValueError( + f'tpfp_fn has to be a function or None, but got {tpfp_fn}') + + # compute tp and fp for each image with multiple processes + tpfp = pool.starmap( + tpfp_fn, + zip(cls_dets, cls_gts, cls_gts_ignore, + [iou_thr for _ in range(num_imgs)], + [area_ranges for _ in range(num_imgs)])) + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale + # ignored gts or gts beyond the specific scale are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += bbox.shape[0] + else: + gt_areas = (bbox[:, 2] - bbox[:, 0]) * ( + bbox[:, 3] - bbox[:, 1]) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum((gt_areas >= min_area) + & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + mode = 'area' if dataset != 'voc07' else '11points' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + pool.close() + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [] + for i in range(num_scales): + if np.any(all_num_gts[:, i] > 0): + mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean()) + else: + mean_ap.append(0.0) + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + + print_map_summary( + mean_ap, eval_results, dataset, area_ranges, logger=logger) + + return mean_ap, eval_results + + +def 
print_map_summary(mean_ap, + results, + dataset=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. + """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(num_classes)] + elif is_str(dataset): + label_names = get_classes(dataset) + else: + label_names = dataset + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/mmcv/core/evaluation/metric_motion.py b/mmcv/core/evaluation/metric_motion.py new file mode 100644 index 0000000..8219438 --- /dev/null +++ b/mmcv/core/evaluation/metric_motion.py @@ -0,0 +1,70 @@ +# + +"""This module evaluates the forecasted trajectories against the ground truth.""" + +import math +from typing import Dict, List, Optional + +import numpy as np +import torch + +LOW_PROB_THRESHOLD_FOR_METRICS = 0.05 + + +def get_ade(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float: + """Compute Average Displacement Error. + Args: + forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + Returns: + ade: Average Displacement Error + """ + pred_len = forecasted_trajectory.shape[0] + ade = float( + sum( + torch.sqrt( + (forecasted_trajectory[i, 0] - gt_trajectory[i, 0]) ** 2 + + (forecasted_trajectory[i, 1] - gt_trajectory[i, 1]) ** 2 + ) + for i in range(pred_len) + ) + / pred_len + ) + return ade + +def get_best_preds( + forecasted_trajectory: torch.Tensor, + gt_trajectory: torch.Tensor +) -> float: + """Compute min Average Displacement Error. 
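A brief hedged check of the displacement metrics in this module (`get_ade` above, `get_fde` defined below); the two trajectories are made up and chosen so the expected values are easy to verify by hand.

import torch

gt = torch.tensor([[0.0, 0.0], [1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])    # [fut_ts, 2]
pred = torch.tensor([[0.0, 0.5], [1.0, 0.5], [2.0, 0.5], [3.0, 1.5]])  # [fut_ts, 2]

print(get_ade(pred, gt))  # (0.5 + 0.5 + 0.5 + 1.5) / 4 = 0.75, mean per-step L2 error
print(get_fde(pred, gt))  # 1.5, L2 error at the final timestep only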
+ Args: + forecasted_trajectory: Predicted trajectory with shape [k, fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + gt_fut_masks: Ground truth traj mask with shape (fut_ts) + Returns: + best_forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + """ + + # [k, fut_ts] + dist = torch.linalg.norm(gt_trajectory[None] - forecasted_trajectory, dim=-1) + dist = dist[..., -1] + dist[torch.isnan(dist)] = 0 + min_mode_idx = torch.argmin(dist, dim=-1) + + return forecasted_trajectory[min_mode_idx] + +def get_fde(forecasted_trajectory: torch.Tensor, gt_trajectory: torch.Tensor) -> float: + """Compute Final Displacement Error. + Args: + forecasted_trajectory: Predicted trajectory with shape [fut_ts, 2] + gt_trajectory: Ground truth trajectory with shape [fut_ts, 2] + Returns: + fde: Final Displacement Error + """ + fde = float( + torch.sqrt( + (forecasted_trajectory[-1, 0] - gt_trajectory[-1, 0]) ** 2 + + (forecasted_trajectory[-1, 1] - gt_trajectory[-1, 1]) ** 2 + ) + ) + return fde diff --git a/mmcv/core/evaluation/metrics.py b/mmcv/core/evaluation/metrics.py new file mode 100644 index 0000000..551203a --- /dev/null +++ b/mmcv/core/evaluation/metrics.py @@ -0,0 +1,325 @@ +from collections import OrderedDict +from mmcv.image import imread +import numpy as np +import torch + + +def f_score(precision, recall, beta=1): + """calcuate the f-score value. + + Args: + precision (float | torch.Tensor): The precision value. + recall (float | torch.Tensor): The recall value. + beta (int): Determines the weight of recall in the combined score. + Default: False. + + Returns: + [torch.tensor]: The f-score value. + """ + score = (1 + beta**2) * (precision * recall) / ( + (beta**2 * precision) + recall) + return score + + +def intersect_and_union(pred_label, + label, + num_classes, + ignore_index, + label_map=dict(), + reduce_zero_label=False): + """Calculate intersection and Union. + + Args: + pred_label (ndarray | str): Prediction segmentation map + or predict result filename. + label (ndarray | str): Ground truth segmentation map + or label filename. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + label_map (dict): Mapping old labels to new labels. The parameter will + work only when label is str. Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. The parameter will + work only when label is str. Default: False. + + Returns: + torch.Tensor: The intersection of prediction and ground truth + histogram on all classes. + torch.Tensor: The union of prediction and ground truth histogram on + all classes. + torch.Tensor: The prediction histogram on all classes. + torch.Tensor: The ground truth histogram on all classes. 
+ """ + + if isinstance(pred_label, str): + pred_label = torch.from_numpy(np.load(pred_label)) + else: + pred_label = torch.from_numpy((pred_label)) + + if isinstance(label, str): + label = torch.from_numpy( + imread(label, flag='unchanged', backend='pillow')) + else: + label = torch.from_numpy(label) + + if label_map is not None: + for old_id, new_id in label_map.items(): + label[label == old_id] = new_id + if reduce_zero_label: + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + + mask = (label != ignore_index) + pred_label = pred_label[mask] + label = label[mask] + + intersect = pred_label[pred_label == label] + area_intersect = torch.histc( + intersect.float(), bins=(num_classes), min=0, max=num_classes - 1) + area_pred_label = torch.histc( + pred_label.float(), bins=(num_classes), min=0, max=num_classes - 1) + area_label = torch.histc( + label.float(), bins=(num_classes), min=0, max=num_classes - 1) + area_union = area_pred_label + area_label - area_intersect + return area_intersect, area_union, area_pred_label, area_label + + +def total_intersect_and_union(results, + gt_seg_maps, + num_classes, + ignore_index, + label_map=dict(), + reduce_zero_label=False): + """Calculate Total Intersection and Union. + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. Default: False. + + Returns: + ndarray: The intersection of prediction and ground truth histogram + on all classes. + ndarray: The union of prediction and ground truth histogram on all + classes. + ndarray: The prediction histogram on all classes. + ndarray: The ground truth histogram on all classes. + """ + num_imgs = len(results) + assert len(gt_seg_maps) == num_imgs + total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) + for i in range(num_imgs): + area_intersect, area_union, area_pred_label, area_label = \ + intersect_and_union( + results[i], gt_seg_maps[i], num_classes, ignore_index, + label_map, reduce_zero_label) + total_area_intersect += area_intersect + total_area_union += area_union + total_area_pred_label += area_pred_label + total_area_label += area_label + return total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label + + +def mean_iou(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False): + """Calculate Mean Intersection and Union (mIoU) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. 
Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. Default: False. + + Returns: + dict[str, float | ndarray]: + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category IoU, shape (num_classes, ). + """ + iou_result = eval_metrics( + results=results, + gt_seg_maps=gt_seg_maps, + num_classes=num_classes, + ignore_index=ignore_index, + metrics=['mIoU'], + nan_to_num=nan_to_num, + label_map=label_map, + reduce_zero_label=reduce_zero_label) + return iou_result + + +def mean_dice(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False): + """Calculate Mean Dice (mDice) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. Default: False. + + Returns: + dict[str, float | ndarray]: Default metrics. + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category dice, shape (num_classes, ). + """ + + dice_result = eval_metrics( + results=results, + gt_seg_maps=gt_seg_maps, + num_classes=num_classes, + ignore_index=ignore_index, + metrics=['mDice'], + nan_to_num=nan_to_num, + label_map=label_map, + reduce_zero_label=reduce_zero_label) + return dice_result + + +def mean_fscore(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False, + beta=1): + """Calculate Mean Intersection and Union (mIoU) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. Default: False. + beta (int): Determines the weight of recall in the combined score. + Default: False. + + + Returns: + dict[str, float | ndarray]: Default metrics. + float: Overall accuracy on all images. + ndarray: Per category recall, shape (num_classes, ). + ndarray: Per category precision, shape (num_classes, ). + ndarray: Per category f-score, shape (num_classes, ). 
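A hedged sketch of the whole-map path through these segmentation metrics; the two 4x4 label maps are made up, and `ignore_index=255` marks no pixel in this toy case.

import numpy as np

pred = np.array([[0, 0, 1, 1],
                 [0, 0, 1, 1],
                 [2, 2, 1, 1],
                 [2, 2, 1, 0]])
gt = np.array([[0, 0, 1, 1],
               [0, 0, 1, 1],
               [2, 2, 2, 2],
               [2, 2, 2, 2]])

res = mean_iou(results=[pred], gt_seg_maps=[gt], num_classes=3, ignore_index=255)
print(res['aAcc'])  # overall pixel accuracy
print(res['IoU'])   # per-class IoU, shape (3,)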
+ """ + fscore_result = eval_metrics( + results=results, + gt_seg_maps=gt_seg_maps, + num_classes=num_classes, + ignore_index=ignore_index, + metrics=['mFscore'], + nan_to_num=nan_to_num, + label_map=label_map, + reduce_zero_label=reduce_zero_label, + beta=beta) + return fscore_result + + +def eval_metrics(results, + gt_seg_maps, + num_classes, + ignore_index, + metrics=['mIoU'], + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False, + beta=1): + """Calculate evaluation metrics + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Wether ignore zero label. Default: False. + Returns: + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category evaluation metrics, shape (num_classes, ). + """ + if isinstance(metrics, str): + metrics = [metrics] + allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + if not set(metrics).issubset(set(allowed_metrics)): + raise KeyError('metrics {} is not supported'.format(metrics)) + + total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label = total_intersect_and_union( + results, gt_seg_maps, num_classes, ignore_index, label_map, + reduce_zero_label) + all_acc = total_area_intersect.sum() / total_area_label.sum() + ret_metrics = OrderedDict({'aAcc': all_acc}) + for metric in metrics: + if metric == 'mIoU': + iou = total_area_intersect / total_area_union + acc = total_area_intersect / total_area_label + ret_metrics['IoU'] = iou + ret_metrics['Acc'] = acc + elif metric == 'mDice': + dice = 2 * total_area_intersect / ( + total_area_pred_label + total_area_label) + acc = total_area_intersect / total_area_label + ret_metrics['Dice'] = dice + ret_metrics['Acc'] = acc + elif metric == 'mFscore': + precision = total_area_intersect / total_area_pred_label + recall = total_area_intersect / total_area_label + f_value = torch.tensor( + [f_score(x[0], x[1], beta) for x in zip(precision, recall)]) + ret_metrics['Fscore'] = f_value + ret_metrics['Precision'] = precision + ret_metrics['Recall'] = recall + + ret_metrics = { + metric: value.numpy() + for metric, value in ret_metrics.items() + } + if nan_to_num is not None: + ret_metrics = OrderedDict({ + metric: np.nan_to_num(metric_value, nan=nan_to_num) + for metric, metric_value in ret_metrics.items() + }) + return ret_metrics diff --git a/mmcv/core/evaluation/recall.py b/mmcv/core/evaluation/recall.py new file mode 100644 index 0000000..23ec744 --- /dev/null +++ b/mmcv/core/evaluation/recall.py @@ -0,0 +1,189 @@ +from collections.abc import Sequence + +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps + + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + 
tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros((ious.shape[0])) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + _ious[k, :] = tmp_ious + + _ious = np.fliplr(np.sort(_ious, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + """Check proposal_nums and iou_thrs and set correct format.""" + if isinstance(proposal_nums, Sequence): + _proposal_nums = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + _proposal_nums = np.array([proposal_nums]) + else: + _proposal_nums = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, Sequence): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return _proposal_nums, _iou_thrs + + +def eval_recalls(gts, + proposals, + proposal_nums=None, + iou_thrs=0.5, + logger=None): + """Calculate recalls. + + Args: + gts (list[ndarray]): a list of arrays of shape (n, 4) + proposals (list[ndarray]): a list of arrays of shape (k, 4) or (k, 5) + proposal_nums (int | Sequence[int]): Top N proposals to be evaluated. + iou_thrs (float | Sequence[float]): IoU thresholds. Default: 0.5. + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmcv.utils.print_log()` for details. Default: None. + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4]) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + + print_recall_summary(recalls, proposal_nums, iou_thrs, logger=logger) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None, + logger=None): + """Print recalls in a table. + + Args: + recalls (ndarray): calculated from `bbox_recalls` + proposal_nums (ndarray or list): top N proposals + iou_thrs (ndarray or list): iou thresholds + row_idxs (ndarray): which rows(proposal nums) to print + col_idxs (ndarray): which cols(iou thresholds) to print + logger (logging.Logger | str | None): The way to print the recall + summary. See `mmcv.utils.print_log()` for details. Default: None. 
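A hedged usage sketch for `eval_recalls` above; the ground-truth boxes and scored proposals are made up, in (x1, y1, x2, y2) and (x1, y1, x2, y2, score) format respectively.

import numpy as np

gts = [np.array([[10, 10, 50, 50],
                 [60, 60, 100, 100]], dtype=np.float32)]
proposals = [np.array([[12, 12, 48, 48, 0.9],
                       [55, 58, 95, 105, 0.8],
                       [0, 0, 20, 20, 0.1]], dtype=np.float32)]

recalls = eval_recalls(gts, proposals, proposal_nums=[1, 3], iou_thrs=[0.5, 0.75])
print(recalls.shape)  # (2, 2): rows follow proposal_nums, columns follow iou_thrs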
+ """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [f'{val:.3f}' for val in recalls[row_idxs[i], col_idxs].tolist()] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/mmcv/core/evaluation/seg_eval.py b/mmcv/core/evaluation/seg_eval.py new file mode 100644 index 0000000..542fedc --- /dev/null +++ b/mmcv/core/evaluation/seg_eval.py @@ -0,0 +1,131 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + + +def fast_hist(preds, labels, num_classes): + """Compute the confusion matrix for every batch. + + Args: + preds (np.ndarray): Prediction labels of points with shape of + (num_points, ). + labels (np.ndarray): Ground truth labels of points with shape of + (num_points, ). + num_classes (int): number of classes + + Returns: + np.ndarray: Calculated confusion matrix. + """ + + k = (labels >= 0) & (labels < num_classes) + bin_count = np.bincount( + num_classes * labels[k].astype(int) + preds[k], + minlength=num_classes**2) + return bin_count[:num_classes**2].reshape(num_classes, num_classes) + + +def per_class_iou(hist): + """Compute the per class iou. + + Args: + hist(np.ndarray): Overall confusion martix + (num_classes, num_classes ). + + Returns: + np.ndarray: Calculated per class iou + """ + + return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) + + +def get_acc(hist): + """Compute the overall accuracy. + + Args: + hist(np.ndarray): Overall confusion martix + (num_classes, num_classes ). + + Returns: + float: Calculated overall acc + """ + + return np.diag(hist).sum() / hist.sum() + + +def get_acc_cls(hist): + """Compute the class average accuracy. + + Args: + hist(np.ndarray): Overall confusion martix + (num_classes, num_classes ). 
+ + Returns: + float: Calculated class average acc + """ + + return np.nanmean(np.diag(hist) / hist.sum(axis=1)) + + +def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None): + """Semantic Segmentation Evaluation. + + Evaluate the result of the Semantic Segmentation. + + Args: + gt_labels (list[torch.Tensor]): Ground truth labels. + seg_preds (list[torch.Tensor]): Predictions. + label2cat (dict): Map from label to category name. + ignore_index (int): Index that will be ignored in evaluation. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. + + Returns: + dict[str, float]: Dict of results. + """ + assert len(seg_preds) == len(gt_labels) + num_classes = len(label2cat) + + hist_list = [] + for i in range(len(gt_labels)): + gt_seg = gt_labels[i].clone().numpy().astype(np.int) + pred_seg = seg_preds[i].clone().numpy().astype(np.int) + + # filter out ignored points + pred_seg[gt_seg == ignore_index] = -1 + gt_seg[gt_seg == ignore_index] = -1 + + # calculate one instance result + hist_list.append(fast_hist(pred_seg, gt_seg, num_classes)) + + iou = per_class_iou(sum(hist_list)) + miou = np.nanmean(iou) + acc = get_acc(sum(hist_list)) + acc_cls = get_acc_cls(sum(hist_list)) + + header = ['classes'] + for i in range(len(label2cat)): + header.append(label2cat[i]) + header.extend(['miou', 'acc', 'acc_cls']) + + ret_dict = dict() + table_columns = [['results']] + for i in range(len(label2cat)): + ret_dict[label2cat[i]] = float(iou[i]) + table_columns.append([f'{iou[i]:.4f}']) + ret_dict['miou'] = float(miou) + ret_dict['acc'] = float(acc) + ret_dict['acc_cls'] = float(acc_cls) + + table_columns.append([f'{miou:.4f}']) + table_columns.append([f'{acc:.4f}']) + table_columns.append([f'{acc_cls:.4f}']) + + table_data = [header] + table_rows = list(zip(*table_columns)) + table_data += table_rows + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) + + return ret_dict diff --git a/mmcv/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py b/mmcv/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py new file mode 100644 index 0000000..014b480 --- /dev/null +++ b/mmcv/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py @@ -0,0 +1,262 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Adapted from `Waymo to KITTI converter + `_. +""" + +try: + from waymo_open_dataset import dataset_pb2 as open_dataset +except ImportError: + raise ImportError( + 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' + 'to install the official devkit first.') + +from mmcv.utils import mkdir_or_exist, track_parallel_progress +import numpy as np +import tensorflow as tf +from glob import glob +from os.path import join +from waymo_open_dataset import label_pb2 +from waymo_open_dataset.protos import metrics_pb2 + + +class KITTI2Waymo(object): + """KITTI predictions to Waymo converter. + + This class serves as the converter to change predictions from KITTI to + Waymo format. + + Args: + kitti_result_files (list[dict]): Predictions in KITTI format. + waymo_tfrecords_dir (str): Directory to load waymo raw data. + waymo_results_save_dir (str): Directory to save converted predictions + in waymo format (.bin files). + waymo_results_final_path (str): Path to save combined + predictions in waymo format (.bin file), like 'a/b/c.bin'. + prefix (str): Prefix of filename. In general, 0 for training, 1 for + validation and 2 for testing. 
+ workers (str): Number of parallel processes. + """ + + def __init__(self, + kitti_result_files, + waymo_tfrecords_dir, + waymo_results_save_dir, + waymo_results_final_path, + prefix, + workers=64): + + self.kitti_result_files = kitti_result_files + self.waymo_tfrecords_dir = waymo_tfrecords_dir + self.waymo_results_save_dir = waymo_results_save_dir + self.waymo_results_final_path = waymo_results_final_path + self.prefix = prefix + self.workers = int(workers) + self.name2idx = {} + for idx, result in enumerate(kitti_result_files): + if len(result['sample_idx']) > 0: + self.name2idx[str(result['sample_idx'][0])] = idx + + # turn on eager execution for older tensorflow versions + if int(tf.__version__.split('.')[0]) < 2: + tf.enable_eager_execution() + + self.k2w_cls_map = { + 'Car': label_pb2.Label.TYPE_VEHICLE, + 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, + 'Sign': label_pb2.Label.TYPE_SIGN, + 'Cyclist': label_pb2.Label.TYPE_CYCLIST, + } + + self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], + [-1.0, 0.0, 0.0, 0.0], + [0.0, -1.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 1.0]]) + + self.get_file_names() + self.create_folder() + + def get_file_names(self): + """Get file names of waymo raw data.""" + self.waymo_tfrecord_pathnames = sorted( + glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) + print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') + + def create_folder(self): + """Create folder for data conversion.""" + mkdir_or_exist(self.waymo_results_save_dir) + + def parse_objects(self, kitti_result, T_k2w, context_name, + frame_timestamp_micros): + """Parse one prediction with several instances in kitti format and + convert them to `Object` proto. + + Args: + kitti_result (dict): Predictions in kitti format. + + - name (np.ndarray): Class labels of predictions. + - dimensions (np.ndarray): Height, width, length of boxes. + - location (np.ndarray): Bottom center of boxes (x, y, z). + - rotation_y (np.ndarray): Orientation of boxes. + - score (np.ndarray): Scores of predictions. + T_k2w (np.ndarray): Transformation matrix from kitti to waymo. + context_name (str): Context name of the frame. + frame_timestamp_micros (int): Frame timestamp. + + Returns: + :obj:`Object`: Predictions in waymo dataset Object proto. + """ + + def parse_one_object(instance_idx): + """Parse one instance in kitti format and convert them to `Object` + proto. + + Args: + instance_idx (int): Index of the instance to be converted. + + Returns: + :obj:`Object`: Predicted instance in waymo dataset \ + Object proto. 
+ """ + cls = kitti_result['name'][instance_idx] + length = round(kitti_result['dimensions'][instance_idx, 0], 4) + height = round(kitti_result['dimensions'][instance_idx, 1], 4) + width = round(kitti_result['dimensions'][instance_idx, 2], 4) + x = round(kitti_result['location'][instance_idx, 0], 4) + y = round(kitti_result['location'][instance_idx, 1], 4) + z = round(kitti_result['location'][instance_idx, 2], 4) + rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) + score = round(kitti_result['score'][instance_idx], 4) + + # y: downwards; move box origin from bottom center (kitti) to + # true center (waymo) + y -= height / 2 + # frame transformation: kitti -> waymo + x, y, z = self.transform(T_k2w, x, y, z) + + # different conventions + heading = -(rotation_y + np.pi / 2) + while heading < -np.pi: + heading += 2 * np.pi + while heading > np.pi: + heading -= 2 * np.pi + + box = label_pb2.Label.Box() + box.center_x = x + box.center_y = y + box.center_z = z + box.length = length + box.width = width + box.height = height + box.heading = heading + + o = metrics_pb2.Object() + o.object.box.CopyFrom(box) + o.object.type = self.k2w_cls_map[cls] + o.score = score + + o.context_name = context_name + o.frame_timestamp_micros = frame_timestamp_micros + + return o + + objects = metrics_pb2.Objects() + + for instance_idx in range(len(kitti_result['name'])): + o = parse_one_object(instance_idx) + objects.objects.append(o) + + return objects + + def convert_one(self, file_idx): + """Convert action for single file. + + Args: + file_idx (int): Index of the file to be converted. + """ + file_pathname = self.waymo_tfrecord_pathnames[file_idx] + file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') + + for frame_num, frame_data in enumerate(file_data): + frame = open_dataset.Frame() + frame.ParseFromString(bytearray(frame_data.numpy())) + + filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' + + for camera in frame.context.camera_calibrations: + # FRONT = 1, see dataset.proto for details + if camera.name == 1: + T_front_cam_to_vehicle = np.array( + camera.extrinsic.transform).reshape(4, 4) + + T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam + + context_name = frame.context.name + frame_timestamp_micros = frame.timestamp_micros + + if filename in self.name2idx: + kitti_result = \ + self.kitti_result_files[self.name2idx[filename]] + objects = self.parse_objects(kitti_result, T_k2w, context_name, + frame_timestamp_micros) + else: + print(filename, 'not found.') + objects = metrics_pb2.Objects() + + with open( + join(self.waymo_results_save_dir, f'{filename}.bin'), + 'wb') as f: + f.write(objects.SerializeToString()) + + def convert(self): + """Convert action.""" + print('Start converting ...') + track_parallel_progress(self.convert_one, range(len(self)), + self.workers) + print('\nFinished ...') + + # combine all files into one .bin + pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) + combined = self.combine(pathnames) + + with open(self.waymo_results_final_path, 'wb') as f: + f.write(combined.SerializeToString()) + + def __len__(self): + """Length of the filename list.""" + return len(self.waymo_tfrecord_pathnames) + + def transform(self, T, x, y, z): + """Transform the coordinates with matrix T. + + Args: + T (np.ndarray): Transformation matrix. + x(float): Coordinate in x axis. + y(float): Coordinate in y axis. + z(float): Coordinate in z axis. + + Returns: + list: Coordinates after transformation. 
+ """ + pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) + pt_aft = np.matmul(T, pt_bef) + return pt_aft[:3].flatten().tolist() + + def combine(self, pathnames): + """Combine predictions in waymo format for each sample together. + + Args: + pathnames (str): Paths to save predictions. + + Returns: + :obj:`Objects`: Combined predictions in Objects proto. + """ + combined = metrics_pb2.Objects() + + for pathname in pathnames: + objects = metrics_pb2.Objects() + with open(pathname, 'rb') as f: + objects.ParseFromString(f.read()) + for o in objects.objects: + combined.objects.append(o) + + return combined diff --git a/mmcv/core/mask/__init__.py b/mmcv/core/mask/__init__.py new file mode 100644 index 0000000..02cbbc7 --- /dev/null +++ b/mmcv/core/mask/__init__.py @@ -0,0 +1,6 @@ +from .mask_target import mask_target +from .utils import encode_mask_results, split_combined_polys + +__all__ = [ + 'split_combined_polys', 'mask_target', 'encode_mask_results' +] diff --git a/mmcv/core/mask/mask_target.py b/mmcv/core/mask/mask_target.py new file mode 100644 index 0000000..e8f5461 --- /dev/null +++ b/mmcv/core/mask/mask_target.py @@ -0,0 +1,126 @@ +import numpy as np +import torch +from torch.nn.modules.utils import _pair + + +def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list, + cfg): + """Compute mask target for positive proposals in multiple images. + + Args: + pos_proposals_list (list[Tensor]): Positive proposals in multiple + images. + pos_assigned_gt_inds_list (list[Tensor]): Assigned GT indices for each + positive proposals. + gt_masks_list (list[:obj:`BaseInstanceMasks`]): Ground truth masks of + each image. + cfg (dict): Config dict that specifies the mask size. + + Returns: + list[Tensor]: Mask target of each image. + + Example: + >>> import mmcv + >>> import mmdet + >>> from mmcv.core.mask import BitmapMasks + >>> from mmcv.core.mask.mask_target import * + >>> H, W = 17, 18 + >>> cfg = mmcv.Config({'mask_size': (13, 14)}) + >>> rng = np.random.RandomState(0) + >>> # Positive proposals (tl_x, tl_y, br_x, br_y) for each image + >>> pos_proposals_list = [ + >>> torch.Tensor([ + >>> [ 7.2425, 5.5929, 13.9414, 14.9541], + >>> [ 7.3241, 3.6170, 16.3850, 15.3102], + >>> ]), + >>> torch.Tensor([ + >>> [ 4.8448, 6.4010, 7.0314, 9.7681], + >>> [ 5.9790, 2.6989, 7.4416, 4.8580], + >>> [ 0.0000, 0.0000, 0.1398, 9.8232], + >>> ]), + >>> ] + >>> # Corresponding class index for each proposal for each image + >>> pos_assigned_gt_inds_list = [ + >>> torch.LongTensor([7, 0]), + >>> torch.LongTensor([5, 4, 1]), + >>> ] + >>> # Ground truth mask for each true object for each image + >>> gt_masks_list = [ + >>> BitmapMasks(rng.rand(8, H, W), height=H, width=W), + >>> BitmapMasks(rng.rand(6, H, W), height=H, width=W), + >>> ] + >>> mask_targets = mask_target( + >>> pos_proposals_list, pos_assigned_gt_inds_list, + >>> gt_masks_list, cfg) + >>> assert mask_targets.shape == (5,) + cfg['mask_size'] + """ + cfg_list = [cfg for _ in range(len(pos_proposals_list))] + mask_targets = map(mask_target_single, pos_proposals_list, + pos_assigned_gt_inds_list, gt_masks_list, cfg_list) + mask_targets = list(mask_targets) + if len(mask_targets) > 0: + mask_targets = torch.cat(mask_targets) + return mask_targets + + +def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg): + """Compute mask target for each positive proposal in the image. + + Args: + pos_proposals (Tensor): Positive proposals. + pos_assigned_gt_inds (Tensor): Assigned GT inds of positive proposals. 
+ gt_masks (:obj:`BaseInstanceMasks`): GT masks in the format of Bitmap + or Polygon. + cfg (dict): Config dict that indicate the mask size. + + Returns: + Tensor: Mask target of each positive proposals in the image. + + Example: + >>> import mmcv + >>> import mmdet + >>> from mmcv.core.mask import BitmapMasks + >>> from mmcv.core.mask.mask_target import * # NOQA + >>> H, W = 32, 32 + >>> cfg = mmcv.Config({'mask_size': (7, 11)}) + >>> rng = np.random.RandomState(0) + >>> # Masks for each ground truth box (relative to the image) + >>> gt_masks_data = rng.rand(3, H, W) + >>> gt_masks = BitmapMasks(gt_masks_data, height=H, width=W) + >>> # Predicted positive boxes in one image + >>> pos_proposals = torch.FloatTensor([ + >>> [ 16.2, 5.5, 19.9, 20.9], + >>> [ 17.3, 13.6, 19.3, 19.3], + >>> [ 14.8, 16.4, 17.0, 23.7], + >>> [ 0.0, 0.0, 16.0, 16.0], + >>> [ 4.0, 0.0, 20.0, 16.0], + >>> ]) + >>> # For each predicted proposal, its assignment to a gt mask + >>> pos_assigned_gt_inds = torch.LongTensor([0, 1, 2, 1, 1]) + >>> mask_targets = mask_target_single( + >>> pos_proposals, pos_assigned_gt_inds, gt_masks, cfg) + >>> assert mask_targets.shape == (5,) + cfg['mask_size'] + """ + device = pos_proposals.device + mask_size = _pair(cfg.mask_size) + binarize = not cfg.get('soft_mask_target', False) + num_pos = pos_proposals.size(0) + if num_pos > 0: + proposals_np = pos_proposals.cpu().numpy() + maxh, maxw = gt_masks.height, gt_masks.width + proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw) + proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh) + pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy() + + mask_targets = gt_masks.crop_and_resize( + proposals_np, + mask_size, + device=device, + inds=pos_assigned_gt_inds, + binarize=binarize).to_ndarray() + + mask_targets = torch.from_numpy(mask_targets).float().to(device) + else: + mask_targets = pos_proposals.new_zeros((0, ) + mask_size) + + return mask_targets diff --git a/mmcv/core/mask/structures.py b/mmcv/core/mask/structures.py new file mode 100644 index 0000000..10d9155 --- /dev/null +++ b/mmcv/core/mask/structures.py @@ -0,0 +1,1037 @@ +from abc import ABCMeta, abstractmethod + +import cv2 +import numpy as np +import pycocotools.mask as maskUtils +import torch +from mmcv.ops.roi_align import roi_align +from mmcv.image import rescale_size, imrescale, imresize, imflip, impad, imtranslate, imshear, imrotate + + +class BaseInstanceMasks(metaclass=ABCMeta): + """Base class for instance masks.""" + + @abstractmethod + def rescale(self, scale, interpolation='nearest'): + """Rescale masks as large as possible while keeping the aspect ratio. + For details can refer to `mmcv.imrescale`. + + Args: + scale (tuple[int]): The maximum size (h, w) of rescaled mask. + interpolation (str): Same as :func:`mmcv.imrescale`. + + Returns: + BaseInstanceMasks: The rescaled masks. + """ + + @abstractmethod + def resize(self, out_shape, interpolation='nearest'): + """Resize masks to the given out_shape. + + Args: + out_shape: Target (h, w) of resized mask. + interpolation (str): See :func:`mmcv.imresize`. + + Returns: + BaseInstanceMasks: The resized masks. + """ + + @abstractmethod + def flip(self, flip_direction='horizontal'): + """Flip masks alone the given direction. + + Args: + flip_direction (str): Either 'horizontal' or 'vertical'. + + Returns: + BaseInstanceMasks: The flipped masks. + """ + + @abstractmethod + def pad(self, out_shape, pad_val): + """Pad masks to the given size of (h, w). 
+ + Args: + out_shape (tuple[int]): Target (h, w) of padded mask. + pad_val (int): The padded value. + + Returns: + BaseInstanceMasks: The padded masks. + """ + + @abstractmethod + def crop(self, bbox): + """Crop each mask by the given bbox. + + Args: + bbox (ndarray): Bbox in format [x1, y1, x2, y2], shape (4, ). + + Return: + BaseInstanceMasks: The cropped masks. + """ + + @abstractmethod + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device, + interpolation='bilinear', + binarize=True): + """Crop and resize masks by the given bboxes. + + This function is mainly used in mask targets computation. + It firstly align mask to bboxes by assigned_inds, then crop mask by the + assigned bbox and resize to the size of (mask_h, mask_w) + + Args: + bboxes (Tensor): Bboxes in format [x1, y1, x2, y2], shape (N, 4) + out_shape (tuple[int]): Target (h, w) of resized mask + inds (ndarray): Indexes to assign masks to each bbox, + shape (N,) and values should be between [0, num_masks - 1]. + device (str): Device of bboxes + interpolation (str): See `mmcv.imresize` + binarize (bool): if True fractional values are rounded to 0 or 1 + after the resize operation. if False and unsupported an error + will be raised. Defaults to True. + + Return: + BaseInstanceMasks: the cropped and resized masks. + """ + + @abstractmethod + def expand(self, expanded_h, expanded_w, top, left): + """see :class:`Expand`.""" + + @property + @abstractmethod + def areas(self): + """ndarray: areas of each instance.""" + + @abstractmethod + def to_ndarray(self): + """Convert masks to the format of ndarray. + + Return: + ndarray: Converted masks in the format of ndarray. + """ + + @abstractmethod + def to_tensor(self, dtype, device): + """Convert masks to the format of Tensor. + + Args: + dtype (str): Dtype of converted mask. + device (torch.device): Device of converted masks. + + Returns: + Tensor: Converted masks in the format of Tensor. + """ + + @abstractmethod + def translate(self, + out_shape, + offset, + direction='horizontal', + fill_val=0, + interpolation='bilinear'): + """Translate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + fill_val (int | float): Border value. Default 0. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + Translated masks. + """ + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. Default 0. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + ndarray: Sheared masks. + """ + + @abstractmethod + def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0): + """Rotate the masks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + fill_val (int | float): Border value. Default 0 for masks. 
+ + Returns: + Rotated masks. + """ + + +class BitmapMasks(BaseInstanceMasks): + """This class represents masks in the form of bitmaps. + + Args: + masks (ndarray): ndarray of masks in shape (N, H, W), where N is + the number of objects. + height (int): height of masks + width (int): width of masks + + Example: + >>> from mmcv.core.mask.structures import * # NOQA + >>> num_masks, H, W = 3, 32, 32 + >>> rng = np.random.RandomState(0) + >>> masks = (rng.rand(num_masks, H, W) > 0.1).astype(np.int) + >>> self = BitmapMasks(masks, height=H, width=W) + + >>> # demo crop_and_resize + >>> num_boxes = 5 + >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes) + >>> out_shape = (14, 14) + >>> inds = torch.randint(0, len(self), size=(num_boxes,)) + >>> device = 'cpu' + >>> interpolation = 'bilinear' + >>> new = self.crop_and_resize( + ... bboxes, out_shape, inds, device, interpolation) + >>> assert len(new) == num_boxes + >>> assert new.height, new.width == out_shape + """ + + def __init__(self, masks, height, width): + self.height = height + self.width = width + if len(masks) == 0: + self.masks = np.empty((0, self.height, self.width), dtype=np.uint8) + else: + assert isinstance(masks, (list, np.ndarray)) + if isinstance(masks, list): + assert isinstance(masks[0], np.ndarray) + assert masks[0].ndim == 2 # (H, W) + else: + assert masks.ndim == 3 # (N, H, W) + + self.masks = np.stack(masks).reshape(-1, height, width) + assert self.masks.shape[1] == self.height + assert self.masks.shape[2] == self.width + + def __getitem__(self, index): + """Index the BitmapMask. + + Args: + index (int | ndarray): Indices in the format of integer or ndarray. + + Returns: + :obj:`BitmapMasks`: Indexed bitmap masks. + """ + masks = self.masks[index].reshape(-1, self.height, self.width) + return BitmapMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation='nearest'): + """See :func:`BaseInstanceMasks.rescale`.""" + if len(self.masks) == 0: + new_w, new_h = rescale_size((self.width, self.height), scale) + rescaled_masks = np.empty((0, new_h, new_w), dtype=np.uint8) + else: + rescaled_masks = np.stack([ + imrescale(mask, scale, interpolation=interpolation) + for mask in self.masks + ]) + height, width = rescaled_masks.shape[1:] + return BitmapMasks(rescaled_masks, height, width) + + def resize(self, out_shape, interpolation='nearest'): + """See :func:`BaseInstanceMasks.resize`.""" + if len(self.masks) == 0: + resized_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + resized_masks = np.stack([ + imresize( + mask, out_shape[::-1], interpolation=interpolation) + for mask in self.masks + ]) + return BitmapMasks(resized_masks, *out_shape) + + def flip(self, flip_direction='horizontal'): + """See :func:`BaseInstanceMasks.flip`.""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + + if len(self.masks) == 0: + flipped_masks = self.masks + else: + flipped_masks = np.stack([ + imflip(mask, direction=flip_direction) + for mask in self.masks + ]) + return BitmapMasks(flipped_masks, self.height, self.width) + + def pad(self, out_shape, pad_val=0): + """See :func:`BaseInstanceMasks.pad`.""" + if len(self.masks) == 0: + padded_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + 
padded_masks = np.stack([ + impad(mask, shape=out_shape, pad_val=pad_val) + for mask in self.masks + ]) + return BitmapMasks(padded_masks, *out_shape) + + def crop(self, bbox): + """See :func:`BaseInstanceMasks.crop`.""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = np.empty((0, h, w), dtype=np.uint8) + else: + cropped_masks = self.masks[:, y1:y1 + h, x1:x1 + w] + return BitmapMasks(cropped_masks, h, w) + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear', + binarize=True): + """See :func:`BaseInstanceMasks.crop_and_resize`.""" + if len(self.masks) == 0: + empty_masks = np.empty((0, *out_shape), dtype=np.uint8) + return BitmapMasks(empty_masks, *out_shape) + + # convert bboxes to tensor + if isinstance(bboxes, np.ndarray): + bboxes = torch.from_numpy(bboxes).to(device=device) + if isinstance(inds, np.ndarray): + inds = torch.from_numpy(inds).to(device=device) + + num_bbox = bboxes.shape[0] + fake_inds = torch.arange( + num_bbox, device=device).to(dtype=bboxes.dtype)[:, None] + rois = torch.cat([fake_inds, bboxes], dim=1) # Nx5 + rois = rois.to(device=device) + if num_bbox > 0: + gt_masks_th = torch.from_numpy(self.masks).to(device).index_select( + 0, inds).to(dtype=rois.dtype) + targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape, + 1.0, 0, 'avg', True).squeeze(1) + if binarize: + resized_masks = (targets >= 0.5).cpu().numpy() + else: + resized_masks = targets.cpu().numpy() + else: + resized_masks = [] + return BitmapMasks(resized_masks, *out_shape) + + def expand(self, expanded_h, expanded_w, top, left): + """See :func:`BaseInstanceMasks.expand`.""" + if len(self.masks) == 0: + expanded_mask = np.empty((0, expanded_h, expanded_w), + dtype=np.uint8) + else: + expanded_mask = np.zeros((len(self), expanded_h, expanded_w), + dtype=np.uint8) + expanded_mask[:, top:top + self.height, + left:left + self.width] = self.masks + return BitmapMasks(expanded_mask, expanded_h, expanded_w) + + def translate(self, + out_shape, + offset, + direction='horizontal', + fill_val=0, + interpolation='bilinear'): + """Translate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + offset (int | float): The offset for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + fill_val (int | float): Border value. Default 0 for masks. + interpolation (str): Same as :func:`mmcv.imtranslate`. + + Returns: + BitmapMasks: Translated BitmapMasks. 
+ + Example: + >>> from mmcv.core.mask.structures import BitmapMasks + >>> self = BitmapMasks.random(dtype=np.uint8) + >>> out_shape = (32, 32) + >>> offset = 4 + >>> direction = 'horizontal' + >>> fill_val = 0 + >>> interpolation = 'bilinear' + >>> # Note, There seem to be issues when: + >>> # * out_shape is different than self's shape + >>> # * the mask dtype is not supported by cv2.AffineWarp + >>> new = self.translate(out_shape, offset, direction, fill_val, + >>> interpolation) + >>> assert len(new) == len(self) + >>> assert new.height, new.width == out_shape + """ + if len(self.masks) == 0: + translated_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + translated_masks = imtranslate( + self.masks.transpose((1, 2, 0)), + offset, + direction, + border_value=fill_val, + interpolation=interpolation) + if translated_masks.ndim == 2: + translated_masks = translated_masks[:, :, None] + translated_masks = translated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(translated_masks, *out_shape) + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + magnitude (int | float): The magnitude used for shear. + direction (str): The shear direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as in :func:`mmcv.imshear`. + + Returns: + BitmapMasks: The sheared masks. + """ + if len(self.masks) == 0: + sheared_masks = np.empty((0, *out_shape), dtype=np.uint8) + else: + sheared_masks = imshear( + self.masks.transpose((1, 2, 0)), + magnitude, + direction, + border_value=border_value, + interpolation=interpolation) + if sheared_masks.ndim == 2: + sheared_masks = sheared_masks[:, :, None] + sheared_masks = sheared_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(sheared_masks, *out_shape) + + def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0): + """Rotate the BitmapMasks. + + Args: + out_shape (tuple[int]): Shape for output mask, format (h, w). + angle (int | float): Rotation angle in degrees. Positive values + mean counter-clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the + rotation in source image. If not specified, the center of + the image will be used. + scale (int | float): Isotropic scale factor. + fill_val (int | float): Border value. Default 0 for masks. + + Returns: + BitmapMasks: Rotated BitmapMasks. 
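+
+        Example:
+            >>> # Illustrative sketch with arbitrary values (not part of
+            >>> # the original doctests).
+            >>> self = BitmapMasks.random(num_masks=2, height=32, width=32)
+            >>> rotated = self.rotate((32, 32), angle=30.)
+            >>> assert len(rotated) == 2
+            >>> assert rotated.height == 32 and rotated.width == 32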
+ """ + if len(self.masks) == 0: + rotated_masks = np.empty((0, *out_shape), dtype=self.masks.dtype) + else: + rotated_masks = imrotate( + self.masks.transpose((1, 2, 0)), + angle, + center=center, + scale=scale, + border_value=fill_val) + if rotated_masks.ndim == 2: + # case when only one mask, (h, w) + rotated_masks = rotated_masks[:, :, None] # (h, w, 1) + rotated_masks = rotated_masks.transpose( + (2, 0, 1)).astype(self.masks.dtype) + return BitmapMasks(rotated_masks, *out_shape) + + @property + def areas(self): + """See :py:attr:`BaseInstanceMasks.areas`.""" + return self.masks.sum((1, 2)) + + def to_ndarray(self): + """See :func:`BaseInstanceMasks.to_ndarray`.""" + return self.masks + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + return torch.tensor(self.masks, dtype=dtype, device=device) + + @classmethod + def random(cls, + num_masks=3, + height=32, + width=32, + dtype=np.uint8, + rng=None): + """Generate random bitmap masks for demo / testing purposes. + + Example: + >>> from mmcv.core.mask.structures import BitmapMasks + >>> self = BitmapMasks.random() + >>> print('self = {}'.format(self)) + self = BitmapMasks(num_masks=3, height=32, width=32) + """ + from mmcv.utils.util_random import ensure_rng + rng = ensure_rng(rng) + masks = (rng.rand(num_masks, height, width) > 0.1).astype(dtype) + self = cls(masks, height=height, width=width) + return self + + +class PolygonMasks(BaseInstanceMasks): + """This class represents masks in the form of polygons. + + Polygons is a list of three levels. The first level of the list + corresponds to objects, the second level to the polys that compose the + object, the third level to the poly coordinates + + Args: + masks (list[list[ndarray]]): The first level of the list + corresponds to objects, the second level to the polys that + compose the object, the third level to the poly coordinates + height (int): height of masks + width (int): width of masks + + Example: + >>> from mmcv.core.mask.structures import * # NOQA + >>> masks = [ + >>> [ np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0]) ] + >>> ] + >>> height, width = 16, 16 + >>> self = PolygonMasks(masks, height, width) + + >>> # demo translate + >>> new = self.translate((16, 16), 4., direction='horizontal') + >>> assert np.all(new.masks[0][0][1::2] == masks[0][0][1::2]) + >>> assert np.all(new.masks[0][0][0::2] == masks[0][0][0::2] + 4) + + >>> # demo crop_and_resize + >>> num_boxes = 3 + >>> bboxes = np.array([[0, 0, 30, 10.0]] * num_boxes) + >>> out_shape = (16, 16) + >>> inds = torch.randint(0, len(self), size=(num_boxes,)) + >>> device = 'cpu' + >>> interpolation = 'bilinear' + >>> new = self.crop_and_resize( + ... bboxes, out_shape, inds, device, interpolation) + >>> assert len(new) == num_boxes + >>> assert new.height, new.width == out_shape + """ + + def __init__(self, masks, height, width): + assert isinstance(masks, list) + if len(masks) > 0: + assert isinstance(masks[0], list) + assert isinstance(masks[0][0], np.ndarray) + + self.height = height + self.width = width + self.masks = masks + + def __getitem__(self, index): + """Index the polygon masks. + + Args: + index (ndarray | List): The indices. + + Returns: + :obj:`PolygonMasks`: The indexed polygon masks. 
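+
+        Example:
+            >>> # Illustrative sketch with arbitrary values (not part of
+            >>> # the original doctests).
+            >>> masks = [[np.array([0, 0, 10, 0, 10, 10., 0, 10, 0, 0])]]
+            >>> self = PolygonMasks(masks, 16, 16)
+            >>> sub = self[[0]]
+            >>> assert isinstance(sub, PolygonMasks) and len(sub) == 1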
+ """ + if isinstance(index, np.ndarray): + index = index.tolist() + if isinstance(index, list): + masks = [self.masks[i] for i in index] + else: + try: + masks = self.masks[index] + except Exception: + raise ValueError( + f'Unsupported input of type {type(index)} for indexing!') + if len(masks) and isinstance(masks[0], np.ndarray): + masks = [masks] # ensure a list of three levels + return PolygonMasks(masks, self.height, self.width) + + def __iter__(self): + return iter(self.masks) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += f'num_masks={len(self.masks)}, ' + s += f'height={self.height}, ' + s += f'width={self.width})' + return s + + def __len__(self): + """Number of masks.""" + return len(self.masks) + + def rescale(self, scale, interpolation=None): + """see :func:`BaseInstanceMasks.rescale`""" + new_w, new_h = rescale_size((self.width, self.height), scale) + if len(self.masks) == 0: + rescaled_masks = PolygonMasks([], new_h, new_w) + else: + rescaled_masks = self.resize((new_h, new_w)) + return rescaled_masks + + def resize(self, out_shape, interpolation=None): + """see :func:`BaseInstanceMasks.resize`""" + if len(self.masks) == 0: + resized_masks = PolygonMasks([], *out_shape) + else: + h_scale = out_shape[0] / self.height + w_scale = out_shape[1] / self.width + resized_masks = [] + for poly_per_obj in self.masks: + resized_poly = [] + for p in poly_per_obj: + p = p.copy() + p[0::2] *= w_scale + p[1::2] *= h_scale + resized_poly.append(p) + resized_masks.append(resized_poly) + resized_masks = PolygonMasks(resized_masks, *out_shape) + return resized_masks + + def flip(self, flip_direction='horizontal'): + """see :func:`BaseInstanceMasks.flip`""" + assert flip_direction in ('horizontal', 'vertical', 'diagonal') + if len(self.masks) == 0: + flipped_masks = PolygonMasks([], self.height, self.width) + else: + flipped_masks = [] + for poly_per_obj in self.masks: + flipped_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if flip_direction == 'horizontal': + p[0::2] = self.width - p[0::2] + elif flip_direction == 'vertical': + p[1::2] = self.height - p[1::2] + else: + p[0::2] = self.width - p[0::2] + p[1::2] = self.height - p[1::2] + flipped_poly_per_obj.append(p) + flipped_masks.append(flipped_poly_per_obj) + flipped_masks = PolygonMasks(flipped_masks, self.height, + self.width) + return flipped_masks + + def crop(self, bbox): + """see :func:`BaseInstanceMasks.crop`""" + assert isinstance(bbox, np.ndarray) + assert bbox.ndim == 1 + + # clip the boundary + bbox = bbox.copy() + bbox[0::2] = np.clip(bbox[0::2], 0, self.width) + bbox[1::2] = np.clip(bbox[1::2], 0, self.height) + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + + if len(self.masks) == 0: + cropped_masks = PolygonMasks([], h, w) + else: + cropped_masks = [] + for poly_per_obj in self.masks: + cropped_poly_per_obj = [] + for p in poly_per_obj: + # pycocotools will clip the boundary + p = p.copy() + p[0::2] -= bbox[0] + p[1::2] -= bbox[1] + cropped_poly_per_obj.append(p) + cropped_masks.append(cropped_poly_per_obj) + cropped_masks = PolygonMasks(cropped_masks, h, w) + return cropped_masks + + def pad(self, out_shape, pad_val=0): + """padding has no effect on polygons`""" + return PolygonMasks(self.masks, *out_shape) + + def expand(self, *args, **kwargs): + """TODO: Add expand for polygon""" + raise NotImplementedError + + def crop_and_resize(self, + bboxes, + out_shape, + inds, + device='cpu', + interpolation='bilinear', + binarize=True): + """see 
:func:`BaseInstanceMasks.crop_and_resize`""" + out_h, out_w = out_shape + if len(self.masks) == 0: + return PolygonMasks([], out_h, out_w) + + if not binarize: + raise ValueError('Polygons are always binary, ' + 'setting binarize=False is unsupported') + + resized_masks = [] + for i in range(len(bboxes)): + mask = self.masks[inds[i]] + bbox = bboxes[i, :] + x1, y1, x2, y2 = bbox + w = np.maximum(x2 - x1, 1) + h = np.maximum(y2 - y1, 1) + h_scale = out_h / max(h, 0.1) # avoid too large scale + w_scale = out_w / max(w, 0.1) + + resized_mask = [] + for p in mask: + p = p.copy() + # crop + # pycocotools will clip the boundary + p[0::2] -= bbox[0] + p[1::2] -= bbox[1] + + # resize + p[0::2] *= w_scale + p[1::2] *= h_scale + resized_mask.append(p) + resized_masks.append(resized_mask) + return PolygonMasks(resized_masks, *out_shape) + + def translate(self, + out_shape, + offset, + direction='horizontal', + fill_val=None, + interpolation=None): + """Translate the PolygonMasks. + + Example: + >>> self = PolygonMasks.random(dtype=np.int) + >>> out_shape = (self.height, self.width) + >>> new = self.translate(out_shape, 4., direction='horizontal') + >>> assert np.all(new.masks[0][0][1::2] == self.masks[0][0][1::2]) + >>> assert np.all(new.masks[0][0][0::2] == self.masks[0][0][0::2] + 4) # noqa: E501 + """ + assert fill_val is None or fill_val == 0, 'Here fill_val is not '\ + f'used, and defaultly should be None or 0. got {fill_val}.' + if len(self.masks) == 0: + translated_masks = PolygonMasks([], *out_shape) + else: + translated_masks = [] + for poly_per_obj in self.masks: + translated_poly_per_obj = [] + for p in poly_per_obj: + p = p.copy() + if direction == 'horizontal': + p[0::2] = np.clip(p[0::2] + offset, 0, out_shape[1]) + elif direction == 'vertical': + p[1::2] = np.clip(p[1::2] + offset, 0, out_shape[0]) + translated_poly_per_obj.append(p) + translated_masks.append(translated_poly_per_obj) + translated_masks = PolygonMasks(translated_masks, *out_shape) + return translated_masks + + def shear(self, + out_shape, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """See :func:`BaseInstanceMasks.shear`.""" + if len(self.masks) == 0: + sheared_masks = PolygonMasks([], *out_shape) + else: + sheared_masks = [] + if direction == 'horizontal': + shear_matrix = np.stack([[1, magnitude], + [0, 1]]).astype(np.float32) + elif direction == 'vertical': + shear_matrix = np.stack([[1, 0], [magnitude, + 1]]).astype(np.float32) + for poly_per_obj in self.masks: + sheared_poly = [] + for p in poly_per_obj: + p = np.stack([p[0::2], p[1::2]], axis=0) # [2, n] + new_coords = np.matmul(shear_matrix, p) # [2, n] + new_coords[0, :] = np.clip(new_coords[0, :], 0, + out_shape[1]) + new_coords[1, :] = np.clip(new_coords[1, :], 0, + out_shape[0]) + sheared_poly.append( + new_coords.transpose((1, 0)).reshape(-1)) + sheared_masks.append(sheared_poly) + sheared_masks = PolygonMasks(sheared_masks, *out_shape) + return sheared_masks + + def rotate(self, out_shape, angle, center=None, scale=1.0, fill_val=0): + """See :func:`BaseInstanceMasks.rotate`.""" + if len(self.masks) == 0: + rotated_masks = PolygonMasks([], *out_shape) + else: + rotated_masks = [] + rotate_matrix = cv2.getRotationMatrix2D(center, -angle, scale) + for poly_per_obj in self.masks: + rotated_poly = [] + for p in poly_per_obj: + p = p.copy() + coords = np.stack([p[0::2], p[1::2]], axis=1) # [n, 2] + # pad 1 to convert from format [x, y] to homogeneous + # coordinates format [x, y, 1] + coords = np.concatenate( + (coords, 
np.ones((coords.shape[0], 1), coords.dtype)), + axis=1) # [n, 3] + rotated_coords = np.matmul( + rotate_matrix[None, :, :], + coords[:, :, None])[..., 0] # [n, 2, 1] -> [n, 2] + rotated_coords[:, 0] = np.clip(rotated_coords[:, 0], 0, + out_shape[1]) + rotated_coords[:, 1] = np.clip(rotated_coords[:, 1], 0, + out_shape[0]) + rotated_poly.append(rotated_coords.reshape(-1)) + rotated_masks.append(rotated_poly) + rotated_masks = PolygonMasks(rotated_masks, *out_shape) + return rotated_masks + + def to_bitmap(self): + """convert polygon masks to bitmap masks.""" + bitmap_masks = self.to_ndarray() + return BitmapMasks(bitmap_masks, self.height, self.width) + + @property + def areas(self): + """Compute areas of masks. + + This func is modified from `detectron2 + `_. + The function only works with Polygons using the shoelace formula. + + Return: + ndarray: areas of each instance + """ # noqa: W501 + area = [] + for polygons_per_obj in self.masks: + area_per_obj = 0 + for p in polygons_per_obj: + area_per_obj += self._polygon_area(p[0::2], p[1::2]) + area.append(area_per_obj) + return np.asarray(area) + + def _polygon_area(self, x, y): + """Compute the area of a component of a polygon. + + Using the shoelace formula: + https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + + Args: + x (ndarray): x coordinates of the component + y (ndarray): y coordinates of the component + + Return: + float: the are of the component + """ # noqa: 501 + return 0.5 * np.abs( + np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + def to_ndarray(self): + """Convert masks to the format of ndarray.""" + if len(self.masks) == 0: + return np.empty((0, self.height, self.width), dtype=np.uint8) + bitmap_masks = [] + for poly_per_obj in self.masks: + bitmap_masks.append( + polygon_to_bitmap(poly_per_obj, self.height, self.width)) + return np.stack(bitmap_masks) + + def to_tensor(self, dtype, device): + """See :func:`BaseInstanceMasks.to_tensor`.""" + if len(self.masks) == 0: + return torch.empty((0, self.height, self.width), + dtype=dtype, + device=device) + ndarray_masks = self.to_ndarray() + return torch.tensor(ndarray_masks, dtype=dtype, device=device) + + @classmethod + def random(cls, + num_masks=3, + height=32, + width=32, + n_verts=5, + dtype=np.float32, + rng=None): + """Generate random polygon masks for demo / testing purposes. + + Adapted from [1]_ + + References: + .. [1] https://gitlab.kitware.com/computer-vision/kwimage/-/blob/928cae35ca8/kwimage/structs/polygon.py#L379 # noqa: E501 + + Example: + >>> from mmcv.core.mask.structures import PolygonMasks + >>> self = PolygonMasks.random() + >>> print('self = {}'.format(self)) + """ + from mmcv.utils.util_random import ensure_rng + rng = ensure_rng(rng) + + def _gen_polygon(n, irregularity, spikeyness): + """Creates the polygon by sampling points on a circle around the + centre. Random noise is added by varying the angular spacing + between sequential points, and by varying the radial distance of + each point from the centre. + + Based on original code by Mike Ounsworth + + Args: + n (int): number of vertices + irregularity (float): [0,1] indicating how much variance there + is in the angular spacing of vertices. [0,1] will map to + [0, 2pi/numberOfVerts] + spikeyness (float): [0,1] indicating how much variance there is + in each vertex from the circle of radius aveRadius. [0,1] + will map to [0, aveRadius] + + Returns: + a list of vertices, in CCW order. 
+            """
+            from scipy.stats import truncnorm
+            # Generate around the unit circle
+            cx, cy = (0.0, 0.0)
+            radius = 1
+
+            tau = np.pi * 2
+
+            irregularity = np.clip(irregularity, 0, 1) * 2 * np.pi / n
+            spikeyness = np.clip(spikeyness, 1e-9, 1)
+
+            # generate n angle steps
+            lower = (tau / n) - irregularity
+            upper = (tau / n) + irregularity
+            angle_steps = rng.uniform(lower, upper, n)
+
+            # normalize the steps so that point 0 and point n+1 are the same
+            k = angle_steps.sum() / (2 * np.pi)
+            angles = (angle_steps / k).cumsum() + rng.uniform(0, tau)
+
+            # Convert high and low values to be wrt the standard normal range
+            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html
+            low = 0
+            high = 2 * radius
+            mean = radius
+            std = spikeyness
+            a = (low - mean) / std
+            b = (high - mean) / std
+            tnorm = truncnorm(a=a, b=b, loc=mean, scale=std)
+
+            # now generate the points
+            radii = tnorm.rvs(n, random_state=rng)
+            x_pts = cx + radii * np.cos(angles)
+            y_pts = cy + radii * np.sin(angles)
+
+            points = np.hstack([x_pts[:, None], y_pts[:, None]])
+
+            # Scale to 0-1 space
+            points = points - points.min(axis=0)
+            points = points / points.max(axis=0)
+
+            # Randomly place within 0-1 space
+            points = points * (rng.rand() * .8 + .2)
+            min_pt = points.min(axis=0)
+            max_pt = points.max(axis=0)
+
+            high = (1 - max_pt)
+            low = (0 - min_pt)
+            offset = (rng.rand(2) * (high - low)) + low
+            points = points + offset
+            return points
+
+        def _order_vertices(verts):
+            """
+            References:
+                https://stackoverflow.com/questions/1709283/how-can-i-sort-a-coordinate-list-for-a-rectangle-counterclockwise
+            """
+            mlat = verts.T[0].sum() / len(verts)
+            mlng = verts.T[1].sum() / len(verts)
+
+            tau = np.pi * 2
+            angle = (np.arctan2(mlat - verts.T[0], verts.T[1] - mlng) +
+                     tau) % tau
+            sortx = angle.argsort()
+            verts = verts.take(sortx, axis=0)
+            return verts
+
+        # Generate a random exterior for each requested mask
+        masks = []
+        for _ in range(num_masks):
+            exterior = _order_vertices(_gen_polygon(n_verts, 0.9, 0.9))
+            exterior = (exterior * [(width, height)]).astype(dtype)
+            masks.append([exterior.ravel()])
+
+        self = cls(masks, height, width)
+        return self
+
+
+def polygon_to_bitmap(polygons, height, width):
+    """Convert masks from the form of polygons to bitmaps.
+
+    Args:
+        polygons (list[ndarray]): masks in polygon representation
+        height (int): mask height
+        width (int): mask width
+
+    Return:
+        ndarray: the converted masks in bitmap representation
+    """
+    rles = maskUtils.frPyObjects(polygons, height, width)
+    rle = maskUtils.merge(rles)
+    # use the builtin bool: np.bool was deprecated and later removed in NumPy
+    bitmap_mask = maskUtils.decode(rle).astype(bool)
+    return bitmap_mask
diff --git a/mmcv/core/mask/utils.py b/mmcv/core/mask/utils.py
new file mode 100644
index 0000000..cc671b1
--- /dev/null
+++ b/mmcv/core/mask/utils.py
@@ -0,0 +1,63 @@
+from mmcv.utils import slice_list
+import numpy as np
+import pycocotools.mask as mask_util
+
+
+def split_combined_polys(polys, poly_lens, polys_per_mask):
+    """Split the combined 1-D polys into masks.
+
+    A mask is represented as a list of polys, and a poly is represented as
+    a 1-D array. In the dataset, all masks are concatenated into a single
+    1-D tensor. Here we need to split the tensor back into the original
+    representations.
+ + Args: + polys (list): a list (length = image num) of 1-D tensors + poly_lens (list): a list (length = image num) of poly length + polys_per_mask (list): a list (length = image num) of poly number + of each mask + + Returns: + list: a list (length = image num) of list (length = mask num) of \ + list (length = poly num) of numpy array. + """ + mask_polys_list = [] + for img_id in range(len(polys)): + polys_single = polys[img_id] + polys_lens_single = poly_lens[img_id].tolist() + polys_per_mask_single = polys_per_mask[img_id].tolist() + + split_polys = slice_list(polys_single, polys_lens_single) + mask_polys = slice_list(split_polys, polys_per_mask_single) + mask_polys_list.append(mask_polys) + return mask_polys_list + + +# TODO: move this function to more proper place +def encode_mask_results(mask_results): + """Encode bitmap mask to RLE code. + + Args: + mask_results (list | tuple[list]): bitmap mask results. + In mask scoring rcnn, mask_results is a tuple of (segm_results, + segm_cls_score). + + Returns: + list | tuple: RLE encoded mask. + """ + if isinstance(mask_results, tuple): # mask scoring + cls_segms, cls_mask_scores = mask_results + else: + cls_segms = mask_results + num_classes = len(cls_segms) + encoded_mask_results = [[] for _ in range(num_classes)] + for i in range(len(cls_segms)): + for cls_segm in cls_segms[i]: + encoded_mask_results[i].append( + mask_util.encode( + np.array( + cls_segm[:, :, np.newaxis], order='F', + dtype='uint8'))[0]) # encoded with RLE + if isinstance(mask_results, tuple): + return encoded_mask_results, cls_mask_scores + else: + return encoded_mask_results diff --git a/mmcv/core/points/__init__.py b/mmcv/core/points/__init__.py new file mode 100644 index 0000000..73d2d83 --- /dev/null +++ b/mmcv/core/points/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints +from .cam_points import CameraPoints +from .depth_points import DepthPoints +from .lidar_points import LiDARPoints + +__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] + + +def get_points_type(points_type): + """Get the class of points according to coordinate type. + + Args: + points_type (str): The type of points coordinate. + The valid value are "CAMERA", "LIDAR", or "DEPTH". + + Returns: + class: Points type. + """ + if points_type == 'CAMERA': + points_cls = CameraPoints + elif points_type == 'LIDAR': + points_cls = LiDARPoints + elif points_type == 'DEPTH': + points_cls = DepthPoints + else: + raise ValueError('Only "points_type" of "CAMERA", "LIDAR", or "DEPTH"' + f' are supported, got {points_type}') + + return points_cls diff --git a/mmcv/core/points/base_points.py b/mmcv/core/points/base_points.py new file mode 100644 index 0000000..31b8cec --- /dev/null +++ b/mmcv/core/points/base_points.py @@ -0,0 +1,436 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch +import warnings +from abc import abstractmethod + + +class BasePoints(object): + """Base class for Points. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int): Number of the dimension of a point. + Each row is (x, y, z). Default to 3. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. Default to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). 
+ attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Default to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device('cpu') + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, points_dim)).to( + dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == \ + points_dim, tensor.size() + + self.tensor = tensor + self.points_dim = points_dim + self.attribute_dims = attribute_dims + self.rotation_axis = 0 + + @property + def coord(self): + """torch.Tensor: Coordinates of each point with size (N, 3).""" + return self.tensor[:, :3] + + @coord.setter + def coord(self, tensor): + """Set the coordinates of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + self.tensor[:, :3] = tensor + + @property + def height(self): + """torch.Tensor: A vector with height of each point.""" + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['height']] + else: + return None + + @height.setter + def height(self, tensor): + """Set the height of each point.""" + try: + tensor = tensor.reshape(self.shape[0]) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'height' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['height']] = tensor + else: + # add height attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) + self.attribute_dims.update(dict(height=attr_dim)) + self.points_dim += 1 + + @property + def color(self): + """torch.Tensor: A vector with color of each point.""" + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims['color']] + else: + return None + + @color.setter + def color(self, tensor): + """Set the color of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f'got unexpected shape {tensor.shape}') + if tensor.max() >= 256 or tensor.min() < 0: + warnings.warn('point got color value beyond [0, 255]') + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and \ + 'color' in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims['color']] = tensor + else: + # add color attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor], dim=1) + self.attribute_dims.update( + dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) + self.points_dim += 3 + + @property + def shape(self): + """torch.Shape: 
Shape of points.""" + return self.tensor.shape + + def shuffle(self): + """Shuffle the points. + + Returns: + torch.Tensor: The shuffled index. + """ + idx = torch.randperm(self.__len__(), device=self.tensor.device) + self.tensor = self.tensor[idx] + return idx + + def rotate(self, rotation, axis=None): + """Rotate points with the given rotation matrix or angle. + + Args: + rotation (float, np.ndarray, torch.Tensor): Rotation matrix + or angle. + axis (int): Axis to rotate at. Defaults to None. + """ + if not isinstance(rotation, torch.Tensor): + rotation = self.tensor.new_tensor(rotation) + assert rotation.shape == torch.Size([3, 3]) or \ + rotation.numel() == 1, f'invalid rotation shape {rotation.shape}' + + if axis is None: + axis = self.rotation_axis + + if rotation.numel() == 1: + rot_sin = torch.sin(rotation) + rot_cos = torch.cos(rotation) + if axis == 1: + rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin], + [0, 1, 0], + [rot_sin, 0, rot_cos]]) + elif axis == 2 or axis == -1: + rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0], + [rot_sin, rot_cos, 0], + [0, 0, 1]]) + elif axis == 0: + rot_mat_T = rotation.new_tensor([[0, rot_cos, -rot_sin], + [0, rot_sin, rot_cos], + [1, 0, 0]]) + else: + raise ValueError('axis should in range') + rot_mat_T = rot_mat_T.T + elif rotation.numel() == 9: + rot_mat_T = rotation + else: + raise NotImplementedError + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + + return rot_mat_T + + @abstractmethod + def flip(self, bev_direction='horizontal'): + """Flip the points in BEV along given BEV direction.""" + pass + + def translate(self, trans_vector): + """Translate points with the given translation vector. + + Args: + trans_vector (np.ndarray, torch.Tensor): Translation + vector of size 3 or nx3. + """ + if not isinstance(trans_vector, torch.Tensor): + trans_vector = self.tensor.new_tensor(trans_vector) + trans_vector = trans_vector.squeeze(0) + if trans_vector.dim() == 1: + assert trans_vector.shape[0] == 3 + elif trans_vector.dim() == 2: + assert trans_vector.shape[0] == self.tensor.shape[0] and \ + trans_vector.shape[1] == 3 + else: + raise NotImplementedError( + f'Unsupported translation vector of shape {trans_vector.shape}' + ) + self.tensor[:, :3] += trans_vector + + def in_range_3d(self, point_range): + """Check whether the points are in the given range. + + Args: + point_range (list | torch.Tensor): The range of point + (x_min, y_min, z_min, x_max, y_max, z_max) + + Note: + In the original implementation of SECOND, checking whether + a box in the range checks whether the points are in a convex + polygon, we try to reduce the burden for simpler cases. + + Returns: + torch.Tensor: A binary vector indicating whether each point is \ + inside the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 1] > point_range[1]) + & (self.tensor[:, 2] > point_range[2]) + & (self.tensor[:, 0] < point_range[3]) + & (self.tensor[:, 1] < point_range[4]) + & (self.tensor[:, 2] < point_range[5])) + return in_range_flags + + @abstractmethod + def in_range_bev(self, point_range): + """Check whether the points are in the given range. + + Args: + point_range (list | torch.Tensor): The range of point + in order of (x_min, y_min, x_max, y_max). + + Returns: + torch.Tensor: Indicating whether each point is inside \ + the reference range. + """ + pass + + @abstractmethod + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Box mode. 
+ rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted box of the same type \ + in the `dst` mode. + """ + pass + + def scale(self, scale_factor): + """Scale the points with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the points. + """ + self.tensor[:, :3] *= scale_factor + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_points = points[3]`: + return a `Points` that contains only one point. + 2. `new_points = points[2:10]`: + return a slice of points. + 3. `new_points = points[vector]`: + where vector is a torch.BoolTensor with `length = len(points)`. + Nonzero elements in the vector will be selected. + 4. `new_points = points[3:11, vector]`: + return a slice of points and attribute dims. + 5. `new_points = points[4:12, 2]`: + return a slice of points with single attribute. + Note that the returned Points might share storage with this Points, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BasePoints`: A new object of \ + :class:`BasePoints` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type( + self.tensor[item].view(1, -1), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + elif isinstance(item, tuple) and len(item) == 2: + if isinstance(item[1], slice): + start = 0 if item[1].start is None else item[1].start + stop = self.tensor.shape[1] if \ + item[1].stop is None else item[1].stop + step = 1 if item[1].step is None else item[1].step + item = list(item) + item[1] = list(range(start, stop, step)) + item = tuple(item) + elif isinstance(item[1], int): + item = list(item) + item[1] = [item[1]] + item = tuple(item) + p = self.tensor[item[0], item[1]] + + keep_dims = list( + set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) + if self.attribute_dims is not None: + attribute_dims = self.attribute_dims.copy() + for key in self.attribute_dims.keys(): + cur_attribute_dims = attribute_dims[key] + if isinstance(cur_attribute_dims, int): + cur_attribute_dims = [cur_attribute_dims] + intersect_attr = list( + set(cur_attribute_dims).intersection(set(keep_dims))) + if len(intersect_attr) == 1: + attribute_dims[key] = intersect_attr[0] + elif len(intersect_attr) > 1: + attribute_dims[key] = intersect_attr + else: + attribute_dims.pop(key) + else: + attribute_dims = None + elif isinstance(item, (slice, np.ndarray, torch.Tensor)): + p = self.tensor[item] + attribute_dims = self.attribute_dims + else: + raise NotImplementedError(f'Invalid slice {item}!') + + assert p.dim() == 2, \ + f'Indexing on Points with {item} failed to return a matrix!' + return original_type( + p, points_dim=p.shape[1], attribute_dims=attribute_dims) + + def __len__(self): + """int: Number of points in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' + + @classmethod + def cat(cls, points_list): + """Concatenate a list of Points into a single Points. + + Args: + points_list (list[:obj:`BasePoints`]): List of points. + + Returns: + :obj:`BasePoints`: The concatenated Points. 
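+
+        Example:
+            >>> # Illustrative sketch using the LiDAR subclass (arbitrary
+            >>> # values, not part of the original doctests).
+            >>> from mmcv.core.points import LiDARPoints
+            >>> a = LiDARPoints(torch.rand(4, 3))
+            >>> b = LiDARPoints(torch.rand(2, 3))
+            >>> merged = LiDARPoints.cat([a, b])
+            >>> assert len(merged) == 6 and merged.points_dim == 3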
+ """ + assert isinstance(points_list, (list, tuple)) + if len(points_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(points, cls) for points in points_list) + + # use torch.cat (v.s. layers.cat) + # so the returned points never share storage with input + cat_points = cls( + torch.cat([p.tensor for p in points_list], dim=0), + points_dim=points_list[0].tensor.shape[1], + attribute_dims=points_list[0].attribute_dims) + return cat_points + + def to(self, device): + """Convert current points to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BasePoints`: A new boxes object on the \ + specific device. + """ + original_type = type(self) + return original_type( + self.tensor.to(device), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + def clone(self): + """Clone the Points. + + Returns: + :obj:`BasePoints`: Box object with the same properties \ + as self. + """ + original_type = type(self) + return original_type( + self.tensor.clone(), + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) + + @property + def device(self): + """str: The device of the points are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a point as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A point of shape (4,). + """ + yield from self.tensor + + def new_point(self, data): + """Create a new point object with data. + + The new point and its tensor has the similar properties \ + as self and self.tensor, respectively. + + Args: + data (torch.Tensor | numpy.array | list): Data to be copied. + + Returns: + :obj:`BasePoints`: A new point object with ``data``, \ + the object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) \ + if not isinstance(data, torch.Tensor) else data.to(self.device) + original_type = type(self) + return original_type( + new_tensor, + points_dim=self.points_dim, + attribute_dims=self.attribute_dims) diff --git a/mmcv/core/points/cam_points.py b/mmcv/core/points/cam_points.py new file mode 100644 index 0000000..ba83cf0 --- /dev/null +++ b/mmcv/core/points/cam_points.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints + + +class CameraPoints(BasePoints): + """Points of instances in CAM coordinates. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int): Number of the dimension of a point. + Each row is (x, y, z). Default to 3. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. Default to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Default to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + super(CameraPoints, self).__init__( + tensor, points_dim=points_dim, attribute_dims=attribute_dims) + self.rotation_axis = 1 + + def flip(self, bev_direction='horizontal'): + """Flip the boxes in BEV along given BEV direction.""" + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 2] = -self.tensor[:, 2] + + def in_range_bev(self, point_range): + """Check whether the points are in the given range. 
+ + Args: + point_range (list | torch.Tensor): The range of point + in order of (x_min, y_min, x_max, y_max). + + Returns: + torch.Tensor: Indicating whether each point is inside \ + the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 2] > point_range[1]) + & (self.tensor[:, 0] < point_range[2]) + & (self.tensor[:, 2] < point_range[3])) + return in_range_flags + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type \ + in the `dst` mode. + """ + from mmcv.core.bbox import Coord3DMode + return Coord3DMode.convert_point( + point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat) diff --git a/mmcv/core/points/depth_points.py b/mmcv/core/points/depth_points.py new file mode 100644 index 0000000..1b12299 --- /dev/null +++ b/mmcv/core/points/depth_points.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints + + +class DepthPoints(BasePoints): + """Points of instances in DEPTH coordinates. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int): Number of the dimension of a point. + Each row is (x, y, z). Default to 3. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. Default to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Default to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + super(DepthPoints, self).__init__( + tensor, points_dim=points_dim, attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction='horizontal'): + """Flip the boxes in BEV along given BEV direction.""" + if bev_direction == 'horizontal': + self.tensor[:, 0] = -self.tensor[:, 0] + elif bev_direction == 'vertical': + self.tensor[:, 1] = -self.tensor[:, 1] + + def in_range_bev(self, point_range): + """Check whether the points are in the given range. + + Args: + point_range (list | torch.Tensor): The range of point + in order of (x_min, y_min, x_max, y_max). + + Returns: + torch.Tensor: Indicating whether each point is inside \ + the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 1] > point_range[1]) + & (self.tensor[:, 0] < point_range[2]) + & (self.tensor[:, 1] < point_range[3])) + return in_range_flags + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. 
+ + Returns: + :obj:`BasePoints`: The converted point of the same type \ + in the `dst` mode. + """ + from mmcv.core.bbox import Coord3DMode + return Coord3DMode.convert_point( + point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat) diff --git a/mmcv/core/points/lidar_points.py b/mmcv/core/points/lidar_points.py new file mode 100644 index 0000000..bbfddd9 --- /dev/null +++ b/mmcv/core/points/lidar_points.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints + + +class LiDARPoints(BasePoints): + """Points of instances in LIDAR coordinates. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int): Number of the dimension of a point. + Each row is (x, y, z). Default to 3. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. Default to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Default to None. + rotation_axis (int): Default rotation axis for points rotation. + """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + super(LiDARPoints, self).__init__( + tensor, points_dim=points_dim, attribute_dims=attribute_dims) + self.rotation_axis = 2 + + def flip(self, bev_direction='horizontal'): + """Flip the boxes in BEV along given BEV direction.""" + if bev_direction == 'horizontal': + self.tensor[:, 1] = -self.tensor[:, 1] + elif bev_direction == 'vertical': + self.tensor[:, 0] = -self.tensor[:, 0] + + def in_range_bev(self, point_range): + """Check whether the points are in the given range. + + Args: + point_range (list | torch.Tensor): The range of point + in order of (x_min, y_min, x_max, y_max). + + Returns: + torch.Tensor: Indicating whether each point is inside \ + the reference range. + """ + in_range_flags = ((self.tensor[:, 0] > point_range[0]) + & (self.tensor[:, 1] > point_range[1]) + & (self.tensor[:, 0] < point_range[2]) + & (self.tensor[:, 1] < point_range[3])) + return in_range_flags + + def convert_to(self, dst, rt_mat=None): + """Convert self to ``dst`` mode. + + Args: + dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + :obj:`BasePoints`: The converted point of the same type \ + in the `dst` mode. 
+ """ + from mmcv.core.bbox import Coord3DMode + return Coord3DMode.convert_point( + point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat) diff --git a/mmcv/core/post_processing/__init__.py b/mmcv/core/post_processing/__init__.py new file mode 100644 index 0000000..5335741 --- /dev/null +++ b/mmcv/core/post_processing/__init__.py @@ -0,0 +1,9 @@ +# from .merge_augs import (merge_aug_bboxes, merge_aug_masks, +# merge_aug_scores, merge_aug_bboxes_3d) +from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms, circle_nms + +# __all__ = [ +# 'merge_aug_bboxes', +# 'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms', +# 'aligned_3d_nms', 'merge_aug_bboxes_3d', 'circle_nms' +# ] diff --git a/mmcv/core/post_processing/bbox_nms.py b/mmcv/core/post_processing/bbox_nms.py new file mode 100644 index 0000000..1b3c77a --- /dev/null +++ b/mmcv/core/post_processing/bbox_nms.py @@ -0,0 +1,170 @@ +import torch +from mmcv.ops.nms import batched_nms + +from mmcv.core.bbox.iou_calculators import bbox_overlaps + + +def multiclass_nms(multi_bboxes, + multi_scores, + score_thr, + nms_cfg, + max_num=-1, + score_factors=None, + return_inds=False): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class), where the last column + contains scores of the background class, but this will be ignored. + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int, optional): if there are more than max_num bboxes after + NMS, only top max_num will be kept. Default to -1. + score_factors (Tensor, optional): The factors multiplied to scores + before applying NMS. Default to None. + return_inds (bool, optional): Whether return the indices of kept + bboxes. Default to False. + + Returns: + tuple: (dets, labels, indices (optional)), tensors of shape (k, 5), + (k), and (k). Dets are boxes with scores. Labels are 0-based. + """ + num_classes = multi_scores.size(1) - 1 + # exclude background category + if multi_bboxes.shape[1] > 4: + bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4) + else: + bboxes = multi_bboxes[:, None].expand( + multi_scores.size(0), num_classes, 4) + + scores = multi_scores[:, :-1] + + labels = torch.arange(num_classes, dtype=torch.long) + labels = labels.view(1, -1).expand_as(scores) + + bboxes = bboxes.reshape(-1, 4) + scores = scores.reshape(-1) + labels = labels.reshape(-1) + + if not torch.onnx.is_in_onnx_export(): + # NonZero not supported in TensorRT + # remove low scoring boxes + valid_mask = scores > score_thr + # multiply score_factor after threshold to preserve more bboxes, improve + # mAP by 1% for YOLOv3 + if score_factors is not None: + # expand the shape to match original shape of score + score_factors = score_factors.view(-1, 1).expand( + multi_scores.size(0), num_classes) + score_factors = score_factors.reshape(-1) + scores = scores * score_factors + + if not torch.onnx.is_in_onnx_export(): + # NonZero not supported in TensorRT + inds = valid_mask.nonzero(as_tuple=False).squeeze(1) + bboxes, scores, labels = bboxes[inds], scores[inds], labels[inds] + else: + # TensorRT NMS plugin has invalid output filled with -1 + # add dummy data to make detection output correct. 
+ bboxes = torch.cat([bboxes, bboxes.new_zeros(1, 4)], dim=0) + scores = torch.cat([scores, scores.new_zeros(1)], dim=0) + labels = torch.cat([labels, labels.new_zeros(1)], dim=0) + + if bboxes.numel() == 0: + if torch.onnx.is_in_onnx_export(): + raise RuntimeError('[ONNX Error] Can not record NMS ' + 'as it has not been executed this time') + dets = torch.cat([bboxes, scores[:, None]], -1) + if return_inds: + return dets, labels, inds + else: + return dets, labels + + dets, keep = batched_nms(bboxes, scores, labels, nms_cfg) + + if max_num > 0: + dets = dets[:max_num] + keep = keep[:max_num] + + if return_inds: + return dets, labels[keep], keep + else: + return dets, labels[keep] + + +def fast_nms(multi_bboxes, + multi_scores, + multi_coeffs, + score_thr, + iou_thr, + top_k, + max_num=-1): + """Fast NMS in `YOLACT `_. + + Fast NMS allows already-removed detections to suppress other detections so + that every instance can be decided to be kept or discarded in parallel, + which is not possible in traditional NMS. This relaxation allows us to + implement Fast NMS entirely in standard GPU-accelerated matrix operations. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class+1), where the last column + contains scores of the background class, but this will be ignored. + multi_coeffs (Tensor): shape (n, #class*coeffs_dim). + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + iou_thr (float): IoU threshold to be considered as conflicted. + top_k (int): if there are more than top_k bboxes before NMS, + only top top_k will be kept. + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. If -1, keep all the bboxes. + Default: -1. + + Returns: + tuple: (dets, labels, coefficients), tensors of shape (k, 5), (k, 1), + and (k, coeffs_dim). Dets are boxes with scores. + Labels are 0-based. + """ + + scores = multi_scores[:, :-1].t() # [#class, n] + scores, idx = scores.sort(1, descending=True) + + idx = idx[:, :top_k].contiguous() + scores = scores[:, :top_k] # [#class, topk] + num_classes, num_dets = idx.size() + boxes = multi_bboxes[idx.view(-1), :].view(num_classes, num_dets, 4) + coeffs = multi_coeffs[idx.view(-1), :].view(num_classes, num_dets, -1) + + iou = bbox_overlaps(boxes, boxes) # [#class, topk, topk] + iou.triu_(diagonal=1) + iou_max, _ = iou.max(dim=1) + + # Now just filter out the ones higher than the threshold + keep = iou_max <= iou_thr + + # Second thresholding introduces 0.2 mAP gain at negligible time cost + keep *= scores > score_thr + + # Assign each kept detection to its corresponding class + classes = torch.arange( + num_classes, device=boxes.device)[:, None].expand_as(keep) + classes = classes[keep] + + boxes = boxes[keep] + coeffs = coeffs[keep] + scores = scores[keep] + + # Only keep the top max_num highest scores across all classes + scores, idx = scores.sort(0, descending=True) + if max_num > 0: + idx = idx[:max_num] + scores = scores[:max_num] + + classes = classes[idx] + boxes = boxes[idx] + coeffs = coeffs[idx] + + cls_dets = torch.cat([boxes, scores[:, None]], dim=1) + return cls_dets, classes, coeffs diff --git a/mmcv/core/post_processing/box3d_nms.py b/mmcv/core/post_processing/box3d_nms.py new file mode 100644 index 0000000..8bede1b --- /dev/null +++ b/mmcv/core/post_processing/box3d_nms.py @@ -0,0 +1,220 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
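The 2D `multiclass_nms` from bbox_nms.py above can be exercised on CPU with synthetic boxes. The sketch below assumes the repo's compiled `mmcv.ops` are importable; the thresholds and class count are illustrative only.

import torch
from mmcv.core.post_processing.bbox_nms import multiclass_nms

# 100 synthetic boxes with per-class scores for 3 classes (+1 background column).
boxes = torch.rand(100, 4) * 100
boxes[:, 2:] += boxes[:, :2]                        # make (x1, y1, x2, y2) well-formed
scores = torch.rand(100, 4).softmax(dim=-1)         # last column is treated as background

dets, labels = multiclass_nms(
    boxes, scores,
    score_thr=0.05,
    nms_cfg=dict(type='nms', iou_threshold=0.5),
    max_num=50)
print(dets.shape, labels.shape)                     # (k, 5) boxes+score, (k,) 0-based labels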
+import numba +import numpy as np +import torch + +from mmcv.ops.iou3d_det.iou3d_utils import nms_gpu, nms_normal_gpu + + +def box3d_multiclass_nms(mlvl_bboxes, + mlvl_bboxes_for_nms, + mlvl_scores, + score_thr, + max_num, + cfg, + mlvl_dir_scores=None, + mlvl_attr_scores=None, + mlvl_bboxes2d=None): + """Multi-class nms for 3D boxes. + + Args: + mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M). + M is the dimensions of boxes. + mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape + (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes. + mlvl_scores (torch.Tensor): Multi-level boxes with shape + (N, C + 1). N is the number of boxes. C is the number of classes. + score_thr (float): Score thredhold to filter boxes with low + confidence. + max_num (int): Maximum number of boxes will be kept. + cfg (dict): Configuration dict of NMS. + mlvl_dir_scores (torch.Tensor, optional): Multi-level scores + of direction classifier. Defaults to None. + mlvl_attr_scores (torch.Tensor, optional): Multi-level scores + of attribute classifier. Defaults to None. + mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding + boxes. Defaults to None. + + Returns: + tuple[torch.Tensor]: Return results after nms, including 3D \ + bounding boxes, scores, labels, direction scores, attribute \ + scores (optional) and 2D bounding boxes (optional). + """ + # do multi class nms + # the fg class id range: [0, num_classes-1] + num_classes = mlvl_scores.shape[1] - 1 + bboxes = [] + scores = [] + labels = [] + dir_scores = [] + attr_scores = [] + bboxes2d = [] + for i in range(0, num_classes): + # get bboxes and scores of this class + cls_inds = mlvl_scores[:, i] > score_thr + if not cls_inds.any(): + continue + + _scores = mlvl_scores[cls_inds, i] + _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :] + + if cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + + selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) + _mlvl_bboxes = mlvl_bboxes[cls_inds, :] + bboxes.append(_mlvl_bboxes[selected]) + scores.append(_scores[selected]) + cls_label = mlvl_bboxes.new_full((len(selected), ), + i, + dtype=torch.long) + labels.append(cls_label) + + if mlvl_dir_scores is not None: + _mlvl_dir_scores = mlvl_dir_scores[cls_inds] + dir_scores.append(_mlvl_dir_scores[selected]) + if mlvl_attr_scores is not None: + _mlvl_attr_scores = mlvl_attr_scores[cls_inds] + attr_scores.append(_mlvl_attr_scores[selected]) + if mlvl_bboxes2d is not None: + _mlvl_bboxes2d = mlvl_bboxes2d[cls_inds] + bboxes2d.append(_mlvl_bboxes2d[selected]) + + if bboxes: + bboxes = torch.cat(bboxes, dim=0) + scores = torch.cat(scores, dim=0) + labels = torch.cat(labels, dim=0) + if mlvl_dir_scores is not None: + dir_scores = torch.cat(dir_scores, dim=0) + if mlvl_attr_scores is not None: + attr_scores = torch.cat(attr_scores, dim=0) + if mlvl_bboxes2d is not None: + bboxes2d = torch.cat(bboxes2d, dim=0) + if bboxes.shape[0] > max_num: + _, inds = scores.sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds, :] + labels = labels[inds] + scores = scores[inds] + if mlvl_dir_scores is not None: + dir_scores = dir_scores[inds] + if mlvl_attr_scores is not None: + attr_scores = attr_scores[inds] + if mlvl_bboxes2d is not None: + bboxes2d = bboxes2d[inds] + else: + bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1))) + scores = mlvl_scores.new_zeros((0, )) + labels = mlvl_scores.new_zeros((0, ), dtype=torch.long) + if mlvl_dir_scores is not None: + dir_scores = mlvl_scores.new_zeros((0, )) + 
if mlvl_attr_scores is not None: + attr_scores = mlvl_scores.new_zeros((0, )) + if mlvl_bboxes2d is not None: + bboxes2d = mlvl_scores.new_zeros((0, 4)) + + results = (bboxes, scores, labels) + + if mlvl_dir_scores is not None: + results = results + (dir_scores, ) + if mlvl_attr_scores is not None: + results = results + (attr_scores, ) + if mlvl_bboxes2d is not None: + results = results + (bboxes2d, ) + + return results + + +def aligned_3d_nms(boxes, scores, classes, thresh): + """3d nms for aligned boxes. + + Args: + boxes (torch.Tensor): Aligned box with shape [n, 6]. + scores (torch.Tensor): Scores of each box. + classes (torch.Tensor): Class of each box. + thresh (float): Iou threshold for nms. + + Returns: + torch.Tensor: Indices of selected boxes. + """ + x1 = boxes[:, 0] + y1 = boxes[:, 1] + z1 = boxes[:, 2] + x2 = boxes[:, 3] + y2 = boxes[:, 4] + z2 = boxes[:, 5] + area = (x2 - x1) * (y2 - y1) * (z2 - z1) + zero = boxes.new_zeros(1, ) + + score_sorted = torch.argsort(scores) + pick = [] + while (score_sorted.shape[0] != 0): + last = score_sorted.shape[0] + i = score_sorted[-1] + pick.append(i) + + xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]]) + yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]]) + zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]]) + xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]]) + yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]]) + zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]]) + classes1 = classes[i] + classes2 = classes[score_sorted[:last - 1]] + inter_l = torch.max(zero, xx2 - xx1) + inter_w = torch.max(zero, yy2 - yy1) + inter_h = torch.max(zero, zz2 - zz1) + + inter = inter_l * inter_w * inter_h + iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter) + iou = iou * (classes1 == classes2).float() + score_sorted = score_sorted[torch.nonzero( + iou <= thresh, as_tuple=False).flatten()] + + indices = boxes.new_tensor(pick, dtype=torch.long) + return indices + + +@numba.jit(nopython=True) +def circle_nms(dets, thresh, post_max_size=83): + """Circular NMS. + + An object is only counted as positive if no other center + with a higher confidence exists within a radius r using a + bird-eye view distance metric. + + Args: + dets (torch.Tensor): Detection results with the shape of [N, 3]. + thresh (float): Value of threshold. + post_max_size (int): Max number of prediction to be kept. Defaults + to 83 + + Returns: + torch.Tensor: Indexes of the detections to be kept. 
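`aligned_3d_nms` above is pure PyTorch, so a tiny hand-crafted case is enough to see its class-aware suppression (the `box3d_multiclass_nms` path is not exercised here because it relies on the GPU `nms_gpu` op).

import torch
from mmcv.core.post_processing.box3d_nms import aligned_3d_nms

# Two heavily overlapping axis-aligned boxes of the same class and one distant box
# of another class, in (x1, y1, z1, x2, y2, z2) form.
boxes = torch.tensor([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0],
                      [0.1, 0.1, 0.0, 2.0, 2.0, 2.0],
                      [5.0, 5.0, 5.0, 6.0, 6.0, 6.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
classes = torch.tensor([0, 0, 1])

keep = aligned_3d_nms(boxes, scores, classes, thresh=0.25)
print(keep)   # expected: indices 0 and 2 survive, the duplicate box 1 is suppressed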
+ """ + x1 = dets[:, 0] + y1 = dets[:, 1] + scores = dets[:, 2] + order = scores.argsort()[::-1].astype(np.int32) # highest->lowest + ndets = dets.shape[0] + suppressed = np.zeros((ndets), dtype=np.int32) + keep = [] + for _i in range(ndets): + i = order[_i] # start with highest score box + if suppressed[ + i] == 1: # if any box have enough iou with this, remove it + continue + keep.append(i) + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + # calculate center distance between i and j box + dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2 + + # ovr = inter / areas[j] + if dist <= thresh: + suppressed[j] = 1 + return keep[:post_max_size] diff --git a/mmcv/core/post_processing/merge_augs.py b/mmcv/core/post_processing/merge_augs.py new file mode 100644 index 0000000..e96dc3b --- /dev/null +++ b/mmcv/core/post_processing/merge_augs.py @@ -0,0 +1,241 @@ +import copy +import warnings + +import numpy as np +import torch +from mmcv import ConfigDict +from mmcv.ops import nms + +from mmcv.ops.iou3d_det.iou3d_utils import nms_gpu, nms_normal_gpu +from ..bbox.transforms import bbox_mapping_back, bbox3d2result, bbox3d_mapping_back +from ..bbox.structures.utils import xywhr2xyxyr + +def merge_aug_proposals(aug_proposals, img_metas, cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). Note that they are not rescaled to the + original image size. + + img_metas (list[dict]): list of image info dict where each dict has: + 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmcv/datasets/pipelines/formatting.py:Collect`. + + cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + + cfg = copy.deepcopy(cfg) + + # deprecate arguments warning + if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg: + warnings.warn( + 'In rpn_proposal or test_cfg, ' + 'nms_thr has been moved to a dict named nms as ' + 'iou_threshold, max_num has been renamed as max_per_img, ' + 'name of original arguments and the way to specify ' + 'iou_threshold of NMS will be deprecated.') + if 'nms' not in cfg: + cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr)) + if 'max_num' in cfg: + if 'max_per_img' in cfg: + assert cfg.max_num == cfg.max_per_img, f'You set max_num and ' \ + f'max_per_img at the same time, but get {cfg.max_num} ' \ + f'and {cfg.max_per_img} respectively' \ + f'Please delete max_num which will be deprecated.' + else: + cfg.max_per_img = cfg.max_num + if 'nms_thr' in cfg: + assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \ + f'iou_threshold in nms and ' \ + f'nms_thr at the same time, but get ' \ + f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \ + f' respectively. Please delete the nms_thr ' \ + f'which will be deprecated.' 
+ + recovered_proposals = [] + for proposals, img_info in zip(aug_proposals, img_metas): + img_shape = img_info['img_shape'] + scale_factor = img_info['scale_factor'] + flip = img_info['flip'] + flip_direction = img_info['flip_direction'] + _proposals = proposals.clone() + _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], img_shape, + scale_factor, flip, + flip_direction) + recovered_proposals.append(_proposals) + aug_proposals = torch.cat(recovered_proposals, dim=0) + merged_proposals, _ = nms(aug_proposals[:, :4].contiguous(), + aug_proposals[:, -1].contiguous(), + cfg.nms.iou_threshold) + scores = merged_proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(cfg.max_per_img, merged_proposals.shape[0]) + order = order[:num] + merged_proposals = merged_proposals[order, :] + return merged_proposals + + +def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. + + Returns: + tuple: (bboxes, scores) + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.stack(recovered_bboxes).mean(dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.stack(aug_scores).mean(dim=0) + return bboxes, scores + + +def merge_aug_scores(aug_scores): + """Merge augmented bbox scores.""" + if isinstance(aug_scores[0], torch.Tensor): + return torch.mean(torch.stack(aug_scores), dim=0) + else: + return np.mean(aug_scores, axis=0) + + +def merge_aug_masks(aug_masks, img_metas, rcnn_test_cfg, weights=None): + """Merge augmented mask prediction. + + Args: + aug_masks (list[ndarray]): shape (n, #class, h, w) + img_shapes (list[ndarray]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. + + Returns: + tuple: (bboxes, scores) + """ + recovered_masks = [] + for mask, img_info in zip(aug_masks, img_metas): + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + if flip: + if flip_direction == 'horizontal': + mask = mask[:, :, :, ::-1] + elif flip_direction == 'vertical': + mask = mask[:, :, ::-1, :] + elif flip_direction == 'diagonal': + mask = mask[:, :, :, ::-1] + mask = mask[:, :, ::-1, :] + else: + raise ValueError( + f"Invalid flipping direction '{flip_direction}'") + recovered_masks.append(mask) + + if weights is None: + merged_masks = np.mean(recovered_masks, axis=0) + else: + merged_masks = np.average( + np.array(recovered_masks), axis=0, weights=np.array(weights)) + return merged_masks + +def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg): + """Merge augmented detection 3D bboxes and scores. + + Args: + aug_results (list[dict]): The dict of detection results. + The dict contains the following keys + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + img_metas (list[dict]): Meta information of each sample. + test_cfg (dict): Test config. + + Returns: + dict: Bounding boxes results in cpu mode, containing merged results. 
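`merge_aug_scores` above simply averages per-augmentation predictions; a minimal sketch, importing from the module file since the package `__init__` keeps these exports commented out.

import torch
from mmcv.core.post_processing.merge_augs import merge_aug_scores

# Class scores predicted for the same image under two test-time augmentations.
scores_aug1 = torch.tensor([[0.8, 0.1, 0.1]])
scores_aug2 = torch.tensor([[0.6, 0.3, 0.1]])

merged = merge_aug_scores([scores_aug1, scores_aug2])
print(merged)   # tensor([[0.7000, 0.2000, 0.1000]])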
+ + - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox. + - scores_3d (torch.Tensor): Merged detection scores. + - labels_3d (torch.Tensor): Merged predicted box labels. + """ + + assert len(aug_results) == len(img_metas), \ + '"aug_results" should have the same length as "img_metas", got len(' \ + f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}' + + recovered_bboxes = [] + recovered_scores = [] + recovered_labels = [] + + for bboxes, img_info in zip(aug_results, img_metas): + scale_factor = img_info[0]['pcd_scale_factor'] + pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip'] + pcd_vertical_flip = img_info[0]['pcd_vertical_flip'] + recovered_scores.append(bboxes['scores_3d']) + recovered_labels.append(bboxes['labels_3d']) + bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor, + pcd_horizontal_flip, pcd_vertical_flip) + recovered_bboxes.append(bboxes) + + aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes) + aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev) + aug_scores = torch.cat(recovered_scores, dim=0) + aug_labels = torch.cat(recovered_labels, dim=0) + + # TODO: use a more elegent way to deal with nms + if test_cfg.use_rotate_nms: + nms_func = nms_gpu + else: + nms_func = nms_normal_gpu + + merged_bboxes = [] + merged_scores = [] + merged_labels = [] + + # Apply multi-class nms when merge bboxes + if len(aug_labels) == 0: + return bbox3d2result(aug_bboxes, aug_scores, aug_labels) + + for class_id in range(torch.max(aug_labels).item() + 1): + class_inds = (aug_labels == class_id) + bboxes_i = aug_bboxes[class_inds] + bboxes_nms_i = aug_bboxes_for_nms[class_inds, :] + scores_i = aug_scores[class_inds] + labels_i = aug_labels[class_inds] + if len(bboxes_nms_i) == 0: + continue + selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr) + + merged_bboxes.append(bboxes_i[selected, :]) + merged_scores.append(scores_i[selected]) + merged_labels.append(labels_i[selected]) + + merged_bboxes = merged_bboxes[0].cat(merged_bboxes) + merged_scores = torch.cat(merged_scores, dim=0) + merged_labels = torch.cat(merged_labels, dim=0) + + _, order = merged_scores.sort(0, descending=True) + num = min(test_cfg.max_num, len(aug_bboxes)) + order = order[:num] + + merged_bboxes = merged_bboxes[order] + merged_scores = merged_scores[order] + merged_labels = merged_labels[order] + + return bbox3d2result(merged_bboxes, merged_scores, merged_labels) + diff --git a/mmcv/core/utils/__init__.py b/mmcv/core/utils/__init__.py new file mode 100644 index 0000000..b127388 --- /dev/null +++ b/mmcv/core/utils/__init__.py @@ -0,0 +1,9 @@ +from .dist_utils import DistOptimizerHook, allreduce_grads, reduce_mean +from .misc import flip_tensor, mask2ndarray, multi_apply, unmap, add_prefix +from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius + +__all__ = [ + 'allreduce_grads', 'DistOptimizerHook', 'reduce_mean', 'multi_apply', + 'unmap', 'mask2ndarray', 'flip_tensor', 'add_prefix', + 'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian' +] diff --git a/mmcv/core/utils/dist_utils.py b/mmcv/core/utils/dist_utils.py new file mode 100644 index 0000000..5fe7775 --- /dev/null +++ b/mmcv/core/utils/dist_utils.py @@ -0,0 +1,69 @@ +import warnings +from collections import OrderedDict + +import torch.distributed as dist +from mmcv.runner import OptimizerHook +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + 
bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + world_size = dist.get_world_size() + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +class DistOptimizerHook(OptimizerHook): + """Deprecated optimizer hook for distributed training.""" + + def __init__(self, *args, **kwargs): + warnings.warn('"DistOptimizerHook" is deprecated, please switch to' + '"mmcv.runner.OptimizerHook".') + super().__init__(*args, **kwargs) + + +def reduce_mean(tensor): + """"Obtain the mean of tensor on different GPUs.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor diff --git a/mmcv/core/utils/gaussian.py b/mmcv/core/utils/gaussian.py new file mode 100644 index 0000000..a07963e --- /dev/null +++ b/mmcv/core/utils/gaussian.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +def gaussian_2d(shape, sigma=1): + """Generate gaussian map. + + Args: + shape (list[int]): Shape of the map. + sigma (float): Sigma to generate gaussian map. + Defaults to 1. + + Returns: + np.ndarray: Generated gaussian map. + """ + m, n = [(ss - 1.) / 2. for ss in shape] + y, x = np.ogrid[-m:m + 1, -n:n + 1] + + h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def draw_heatmap_gaussian(heatmap, center, radius, k=1): + """Get gaussian masked heatmap. + + Args: + heatmap (torch.Tensor): Heatmap to be masked. + center (torch.Tensor): Center coord of the heatmap. + radius (int): Radius of gausian. + K (int): Multiple of masked_gaussian. Defaults to 1. + + Returns: + torch.Tensor: Masked heatmap. + """ + diameter = 2 * radius + 1 + gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) + + x, y = int(center[0]), int(center[1]) + + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = torch.from_numpy( + gaussian[radius - top:radius + bottom, + radius - left:radius + right]).to(heatmap.device, + torch.float32) + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: + torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + return heatmap + + +def gaussian_radius(det_size, min_overlap=0.5): + """Get radius of gaussian. 
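A common use of `reduce_mean` above is normalizing a loss by the global count of positive samples so every rank divides by the same number. Without an initialized process group it returns the tensor unchanged, so the sketch below also runs on a single CPU.

import torch
from mmcv.core.utils import reduce_mean

num_pos = torch.tensor(12.0)                       # per-GPU positive-sample count
avg_factor = max(reduce_mean(num_pos).item(), 1.0) # identical on every rank under DDP
loss = torch.rand(4).sum() / avg_factor
print(avg_factor, loss.item())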
+ + Args: + det_size (tuple[torch.Tensor]): Size of the detection result. + min_overlap (float): Gaussian_overlap. Defaults to 0.5. + + Returns: + torch.Tensor: Computed radius. + """ + height, width = det_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + return min(r1, r2, r3) diff --git a/mmcv/core/utils/misc.py b/mmcv/core/utils/misc.py new file mode 100644 index 0000000..52e1897 --- /dev/null +++ b/mmcv/core/utils/misc.py @@ -0,0 +1,102 @@ +from functools import partial + +import numpy as np +import torch +from six.moves import map, zip + +from ..mask.structures import BitmapMasks, PolygonMasks + + +def multi_apply(func, *args, **kwargs): + """Apply function to a list of arguments. + + Note: + This function applies the ``func`` to multiple inputs and + map the multiple outputs of the ``func`` into different + list. Each list contains the same type of outputs corresponding + to different inputs. + + Args: + func (Function): A function that will be applied to a list of + arguments + + Returns: + tuple(list): A tuple containing multiple list, each list contains \ + a kind of returned results by the function + """ + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) + + +def unmap(data, count, inds, fill=0): + """Unmap a subset of item (data) back to the original set of items (of size + count)""" + if data.dim() == 1: + ret = data.new_full((count, ), fill) + ret[inds.type(torch.bool)] = data + else: + new_size = (count, ) + data.size()[1:] + ret = data.new_full(new_size, fill) + ret[inds.type(torch.bool), :] = data + return ret + + +def mask2ndarray(mask): + """Convert Mask to ndarray.. + + Args: + mask (:obj:`BitmapMasks` or :obj:`PolygonMasks` or + torch.Tensor or np.ndarray): The mask to be converted. + + Returns: + np.ndarray: Ndarray mask of shape (n, h, w) that has been converted + """ + if isinstance(mask, (BitmapMasks, PolygonMasks)): + mask = mask.to_ndarray() + elif isinstance(mask, torch.Tensor): + mask = mask.detach().cpu().numpy() + elif not isinstance(mask, np.ndarray): + raise TypeError(f'Unsupported {type(mask)} data type') + return mask + + +def flip_tensor(src_tensor, flip_direction): + """flip tensor base on flip_direction. + + Args: + src_tensor (Tensor): input feature map, shape (B, C, H, W). + flip_direction (str): The flipping direction. Options are + 'horizontal', 'vertical', 'diagonal'. + + Returns: + out_tensor (Tensor): Flipped tensor. + """ + assert src_tensor.ndim == 4 + valid_directions = ['horizontal', 'vertical', 'diagonal'] + assert flip_direction in valid_directions + if flip_direction == 'horizontal': + out_tensor = torch.flip(src_tensor, [3]) + elif flip_direction == 'vertical': + out_tensor = torch.flip(src_tensor, [2]) + else: + out_tensor = torch.flip(src_tensor, [2, 3]) + return out_tensor + +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + + dict: The dict with keys updated with ``prefix``. 
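The two gaussian helpers above are typically combined CenterPoint-style: compute a radius from the object footprint, clamp it to an integer, then splat a peak onto a heatmap. A small sketch with made-up sizes:

import torch
from mmcv.core.utils import gaussian_radius, draw_heatmap_gaussian

heatmap = torch.zeros(128, 128)                          # hypothetical 128x128 BEV heatmap
box_h, box_w = torch.tensor(20.0), torch.tensor(32.0)    # object size in heatmap cells

radius = gaussian_radius((box_h, box_w), min_overlap=0.1)
radius = max(2, int(radius))                             # clamp to a usable integer radius
center = torch.tensor([64, 64])

draw_heatmap_gaussian(heatmap, center, radius)           # writes in place via the slice view
print(heatmap.max(), heatmap[64, 64])                    # peak of 1.0 at the center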
+ """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f'{prefix}.{name}'] = value + + return outputs diff --git a/mmcv/core/visualization/__init__.py b/mmcv/core/visualization/__init__.py new file mode 100644 index 0000000..af64628 --- /dev/null +++ b/mmcv/core/visualization/__init__.py @@ -0,0 +1,4 @@ +from .image import (color_val_matplotlib, imshow_det_bboxes, + imshow_gt_det_bboxes, imshow, color_val) + +__all__ = ['imshow_det_bboxes', 'imshow_gt_det_bboxes', 'color_val_matplotlib'] diff --git a/mmcv/core/visualization/image.py b/mmcv/core/visualization/image.py new file mode 100644 index 0000000..a001853 --- /dev/null +++ b/mmcv/core/visualization/image.py @@ -0,0 +1,372 @@ +import matplotlib.pyplot as plt +import numpy as np +import cv2 +import pycocotools.mask as mask_util +from matplotlib.collections import PatchCollection +from matplotlib.patches import Polygon +from enum import Enum +from mmcv.utils import concat_list, is_str +from mmcv.image import imread, imwrite +from mmcv.image import bgr2rgb, rgb2bgr +from ..utils import mask2ndarray + +EPS = 1e-2 + + +class Color(Enum): + """An enum that defines common colors. + + Contains red, green, blue, cyan, yellow, magenta, white and black. + """ + red = (0, 0, 255) + green = (0, 255, 0) + blue = (255, 0, 0) + cyan = (255, 255, 0) + yellow = (0, 255, 255) + magenta = (255, 0, 255) + white = (255, 255, 255) + black = (0, 0, 0) + + +def color_val(color): + """Convert various input to color tuples. + + Args: + color (:obj:`Color`/str/tuple/int/ndarray): Color inputs + + Returns: + tuple[int]: A tuple of 3 integers indicating BGR channels. + """ + if is_str(color): + return Color[color].value + elif isinstance(color, Color): + return color.value + elif isinstance(color, tuple): + assert len(color) == 3 + for channel in color: + assert 0 <= channel <= 255 + return color + elif isinstance(color, int): + assert 0 <= color <= 255 + return color, color, color + elif isinstance(color, np.ndarray): + assert color.ndim == 1 and color.size == 3 + assert np.all((color >= 0) & (color <= 255)) + color = color.astype(np.uint8) + return tuple(color) + else: + raise TypeError(f'Invalid type for color: {type(color)}') + + + +def color_val_matplotlib(color): + """Convert various input in BGR order to normalized RGB matplotlib color + tuples, + + Args: + color (:obj:`Color`/str/tuple/int/ndarray): Color inputs + + Returns: + tuple[float]: A tuple of 3 normalized floats indicating RGB channels. + """ + color = color_val(color) + color = [color / 255 for color in color[::-1]] + return tuple(color) + +def imshow(img, win_name='', wait_time=0): + """Show an image. + + Args: + img (str or ndarray): The image to be displayed. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + """ + cv2.imshow(win_name, imread(img)) + if wait_time == 0: # prevent from hanging if windows was closed + while True: + ret = cv2.waitKey(1) + + closed = cv2.getWindowProperty(win_name, cv2.WND_PROP_VISIBLE) < 1 + # if user closed window or if some key pressed + if closed or ret != -1: + break + else: + ret = cv2.waitKey(wait_time) + + +def imshow_det_bboxes(img, + bboxes, + labels, + segms=None, + class_names=None, + score_thr=0, + bbox_color='green', + text_color='green', + mask_color=None, + thickness=2, + font_size=13, + win_name='', + show=True, + wait_time=0, + out_file=None): + """Draw bboxes and class labels (with scores) on an image. + + Args: + img (str or ndarray): The image to be displayed. 
+ bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or + (n, 5). + labels (ndarray): Labels of bboxes. + segms (ndarray or None): Masks, shaped (n,h,w) or None + class_names (list[str]): Names of each classes. + score_thr (float): Minimum score of bboxes to be shown. Default: 0 + bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines. + The tuple of color should be in BGR order. Default: 'green' + text_color (str or tuple(int) or :obj:`Color`):Color of texts. + The tuple of color should be in BGR order. Default: 'green' + mask_color (str or tuple(int) or :obj:`Color`, optional): + Color of masks. The tuple of color should be in BGR order. + Default: None + thickness (int): Thickness of lines. Default: 2 + font_size (int): Font size of texts. Default: 13 + show (bool): Whether to show the image. Default: True + win_name (str): The window name. Default: '' + wait_time (float): Value of waitKey param. Default: 0. + out_file (str, optional): The filename to write the image. + Default: None + + Returns: + ndarray: The image with bboxes drawn on it. + """ + assert bboxes.ndim == 2, \ + f' bboxes ndim should be 2, but its ndim is {bboxes.ndim}.' + assert labels.ndim == 1, \ + f' labels ndim should be 1, but its ndim is {labels.ndim}.' + assert bboxes.shape[0] == labels.shape[0], \ + 'bboxes.shape[0] and labels.shape[0] should have the same length.' + assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5, \ + f' bboxes.shape[1] should be 4 or 5, but its {bboxes.shape[1]}.' + img = imread(img).astype(np.uint8) + + if score_thr > 0: + assert bboxes.shape[1] == 5 + scores = bboxes[:, -1] + inds = scores > score_thr + bboxes = bboxes[inds, :] + labels = labels[inds] + if segms is not None: + segms = segms[inds, ...] + + mask_colors = [] + if labels.shape[0] > 0: + if mask_color is None: + # random color + np.random.seed(42) + mask_colors = [ + np.random.randint(0, 256, (1, 3), dtype=np.uint8) + for _ in range(max(labels) + 1) + ] + else: + # specify color + mask_colors = [ + np.array(color_val(mask_color)[::-1], dtype=np.uint8) + ] * ( + max(labels) + 1) + + bbox_color = color_val_matplotlib(bbox_color) + text_color = color_val_matplotlib(text_color) + + img = bgr2rgb(img) + width, height = img.shape[1], img.shape[0] + img = np.ascontiguousarray(img) + + fig = plt.figure(win_name, frameon=False) + plt.title(win_name) + canvas = fig.canvas + dpi = fig.get_dpi() + # add a small EPS to avoid precision lost due to matplotlib's truncation + # (https://github.com/matplotlib/matplotlib/issues/15363) + fig.set_size_inches((width + EPS) / dpi, (height + EPS) / dpi) + + # remove white edges by set subplot margin + plt.subplots_adjust(left=0, right=1, bottom=0, top=1) + ax = plt.gca() + ax.axis('off') + + polygons = [] + color = [] + for i, (bbox, label) in enumerate(zip(bboxes, labels)): + bbox_int = bbox.astype(np.int32) + poly = [[bbox_int[0], bbox_int[1]], [bbox_int[0], bbox_int[3]], + [bbox_int[2], bbox_int[3]], [bbox_int[2], bbox_int[1]]] + np_poly = np.array(poly).reshape((4, 2)) + polygons.append(Polygon(np_poly)) + color.append(bbox_color) + label_text = class_names[ + label] if class_names is not None else f'class {label}' + if len(bbox) > 4: + label_text += f'|{bbox[-1]:.02f}' + ax.text( + bbox_int[0], + bbox_int[1], + f'{label_text}', + bbox={ + 'facecolor': 'black', + 'alpha': 0.8, + 'pad': 0.7, + 'edgecolor': 'none' + }, + color=text_color, + fontsize=font_size, + verticalalignment='top', + horizontalalignment='left') + if segms is not None: + color_mask = 
mask_colors[labels[i]] + mask = segms[i].astype(bool) + img[mask] = img[mask] * 0.5 + color_mask * 0.5 + + plt.imshow(img) + + p = PatchCollection( + polygons, facecolor='none', edgecolors=color, linewidths=thickness) + ax.add_collection(p) + + stream, _ = canvas.print_to_buffer() + buffer = np.frombuffer(stream, dtype='uint8') + img_rgba = buffer.reshape(height, width, 4) + rgb, alpha = np.split(img_rgba, [3], axis=2) + img = rgb.astype('uint8') + img = rgb2bgr(img) + + if show: + # We do not use cv2 for display because in some cases, opencv will + # conflict with Qt, it will output a warning: Current thread + # is not the object's thread. You can refer to + # https://github.com/opencv/opencv-python/issues/46 for details + if wait_time == 0: + plt.show() + else: + plt.show(block=False) + plt.pause(wait_time) + if out_file is not None: + imwrite(img, out_file) + + plt.close() + + return img + + +def imshow_gt_det_bboxes(img, + annotation, + result, + class_names=None, + score_thr=0, + gt_bbox_color=(255, 102, 61), + gt_text_color=(255, 102, 61), + gt_mask_color=(255, 102, 61), + det_bbox_color=(72, 101, 241), + det_text_color=(72, 101, 241), + det_mask_color=(72, 101, 241), + thickness=2, + font_size=13, + win_name='', + show=True, + wait_time=0, + out_file=None): + """General visualization GT and result function. + + Args: + img (str or ndarray): The image to be displayed.) + annotation (dict): Ground truth annotations where contain keys of + 'gt_bboxes' and 'gt_labels' or 'gt_masks' + result (tuple[list] or list): The detection result, can be either + (bbox, segm) or just bbox. + class_names (list[str]): Names of each classes. + score_thr (float): Minimum score of bboxes to be shown. Default: 0 + gt_bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines. + The tuple of color should be in BGR order. Default: (255, 102, 61) + gt_text_color (str or tuple(int) or :obj:`Color`):Color of texts. + The tuple of color should be in BGR order. Default: (255, 102, 61) + gt_mask_color (str or tuple(int) or :obj:`Color`, optional): + Color of masks. The tuple of color should be in BGR order. + Default: (255, 102, 61) + det_bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines. + The tuple of color should be in BGR order. Default: (72, 101, 241) + det_text_color (str or tuple(int) or :obj:`Color`):Color of texts. + The tuple of color should be in BGR order. Default: (72, 101, 241) + det_mask_color (str or tuple(int) or :obj:`Color`, optional): + Color of masks. The tuple of color should be in BGR order. + Default: (72, 101, 241) + thickness (int): Thickness of lines. Default: 2 + font_size (int): Font size of texts. Default: 13 + win_name (str): The window name. Default: '' + show (bool): Whether to show the image. Default: True + wait_time (float): Value of waitKey param. Default: 0. + out_file (str, optional): The filename to write the image. + Default: None + + Returns: + ndarray: The image with bboxes or masks drawn on it. 
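`imshow_det_bboxes` above can render to a file without opening a window; the sketch below uses synthetic boxes and hypothetical class names, and needs matplotlib and OpenCV installed.

import numpy as np
from mmcv.core.visualization import imshow_det_bboxes

# Two hypothetical detections drawn onto a blank image and written to disk (show=False).
img = np.full((480, 640, 3), 255, dtype=np.uint8)
bboxes = np.array([[ 50.,  60., 200., 220., 0.92],
                   [300., 100., 500., 380., 0.75]], dtype=np.float32)
labels = np.array([0, 1])

imshow_det_bboxes(
    img, bboxes, labels,
    class_names=['car', 'pedestrian'],
    score_thr=0.3,
    show=False,
    out_file='demo_dets.jpg')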
+ """ + assert 'gt_bboxes' in annotation + assert 'gt_labels' in annotation + assert isinstance( + result, + (tuple, list)), f'Expected tuple or list, but get {type(result)}' + + gt_masks = annotation.get('gt_masks', None) + if gt_masks is not None: + gt_masks = mask2ndarray(gt_masks) + + img = imread(img) + + img = imshow_det_bboxes( + img, + annotation['gt_bboxes'], + annotation['gt_labels'], + gt_masks, + class_names=class_names, + bbox_color=gt_bbox_color, + text_color=gt_text_color, + mask_color=gt_mask_color, + thickness=thickness, + font_size=font_size, + win_name=win_name, + show=False) + + if isinstance(result, tuple): + bbox_result, segm_result = result + if isinstance(segm_result, tuple): + segm_result = segm_result[0] # ms rcnn + else: + bbox_result, segm_result = result, None + + bboxes = np.vstack(bbox_result) + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_result) + ] + labels = np.concatenate(labels) + + segms = None + if segm_result is not None and len(labels) > 0: # non empty + segms = concat_list(segm_result) + segms = mask_util.decode(segms) + segms = segms.transpose(2, 0, 1) + + img = imshow_det_bboxes( + img, + bboxes, + labels, + segms=segms, + class_names=class_names, + score_thr=score_thr, + bbox_color=det_bbox_color, + text_color=det_text_color, + mask_color=det_mask_color, + thickness=thickness, + font_size=font_size, + win_name=win_name, + show=show, + wait_time=wait_time, + out_file=out_file) + return img diff --git a/mmcv/core/visualizer/__init__.py b/mmcv/core/visualizer/__init__.py new file mode 100644 index 0000000..bbf1e60 --- /dev/null +++ b/mmcv/core/visualizer/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .show_result import (show_multi_modality_result, show_result, + show_seg_result) + +__all__ = ['show_result', 'show_seg_result', 'show_multi_modality_result'] diff --git a/mmcv/core/visualizer/image_vis.py b/mmcv/core/visualizer/image_vis.py new file mode 100644 index 0000000..60034f1 --- /dev/null +++ b/mmcv/core/visualizer/image_vis.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import cv2 +import numpy as np +import torch +from matplotlib import pyplot as plt + + +def project_pts_on_img(points, + raw_img, + lidar2img_rt, + max_distance=70, + thickness=-1): + """Project the 3D points cloud on 2D image. + + Args: + points (numpy.array): 3D points cloud (x, y, z) to visualize. + raw_img (numpy.array): The numpy array of image. + lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix + according to the camera intrinsic parameters. + max_distance (float): the max distance of the points cloud. + Default: 70. + thickness (int, optional): The thickness of 2D points. Default: -1. 
+ """ + img = raw_img.copy() + num_points = points.shape[0] + pts_4d = np.concatenate([points[:, :3], np.ones((num_points, 1))], axis=-1) + pts_2d = pts_4d @ lidar2img_rt.T + + # cam_points is Tensor of Nx4 whose last column is 1 + # transform camera coordinate to image coordinate + pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=99999) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + + fov_inds = ((pts_2d[:, 0] < img.shape[1]) + & (pts_2d[:, 0] >= 0) + & (pts_2d[:, 1] < img.shape[0]) + & (pts_2d[:, 1] >= 0)) + + imgfov_pts_2d = pts_2d[fov_inds, :3] # u, v, d + + cmap = plt.cm.get_cmap('hsv', 256) + cmap = np.array([cmap(i) for i in range(256)])[:, :3] * 255 + for i in range(imgfov_pts_2d.shape[0]): + depth = imgfov_pts_2d[i, 2] + color = cmap[np.clip(int(max_distance * 10 / depth), 0, 255), :] + cv2.circle( + img, + center=(int(np.round(imgfov_pts_2d[i, 0])), + int(np.round(imgfov_pts_2d[i, 1]))), + radius=1, + color=tuple(color), + thickness=thickness, + ) + cv2.imshow('project_pts_img', img.astype(np.uint8)) + cv2.waitKey(100) + + +def plot_rect3d_on_img(img, + num_rects, + rect_corners, + color=(0, 255, 0), + thickness=1): + """Plot the boundary lines of 3D rectangular on 2D images. + + Args: + img (numpy.array): The numpy array of image. + num_rects (int): Number of 3D rectangulars. + rect_corners (numpy.array): Coordinates of the corners of 3D + rectangulars. Should be in the shape of [num_rect, 8, 2]. + color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7), + (4, 5), (4, 7), (2, 6), (5, 6), (6, 7)) + for i in range(num_rects): + corners = rect_corners[i].astype(np.int) + for start, end in line_indices: + cv2.line(img, (corners[start, 0], corners[start, 1]), + (corners[end, 0], corners[end, 1]), color, thickness, + cv2.LINE_AA) + + return img.astype(np.uint8) + + +def draw_lidar_bbox3d_on_img(bboxes3d, + raw_img, + lidar2img_rt, + img_metas, + color=(0, 255, 0), + thickness=1): + """Project the 3D bbox on 2D plane and draw on input image. + + Args: + bboxes3d (:obj:`LiDARInstance3DBoxes`): + 3d bbox in lidar coordinate system to visualize. + raw_img (numpy.array): The numpy array of image. + lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix + according to the camera intrinsic parameters. + img_metas (dict): Useless here. + color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + img = raw_img.copy() + corners_3d = bboxes3d.corners + num_bbox = corners_3d.shape[0] + pts_4d = np.concatenate( + [corners_3d.reshape(-1, 3), + np.ones((num_bbox * 8, 1))], axis=-1) + lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4) + if isinstance(lidar2img_rt, torch.Tensor): + lidar2img_rt = lidar2img_rt.cpu().numpy() + pts_2d = pts_4d @ lidar2img_rt.T + + pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5) + pts_2d[:, 0] /= pts_2d[:, 2] + pts_2d[:, 1] /= pts_2d[:, 2] + imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2) + + return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) + + +# TODO: remove third parameter in all functions here in favour of img_metas +def draw_depth_bbox3d_on_img(bboxes3d, + raw_img, + calibs, + img_metas, + color=(0, 255, 0), + thickness=1): + """Project the 3D bbox on 2D plane and draw on input image. 
+ + Args: + bboxes3d (:obj:`DepthInstance3DBoxes`, shape=[M, 7]): + 3d bbox in depth coordinate system to visualize. + raw_img (numpy.array): The numpy array of image. + calibs (dict): Camera calibration information, Rt and K. + img_metas (dict): Used in coordinates transformation. + color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + from mmcv.core.bbox import points_cam2img + from mmcv.models import apply_3d_transformation + + img = raw_img.copy() + img_metas = copy.deepcopy(img_metas) + corners_3d = bboxes3d.corners + num_bbox = corners_3d.shape[0] + points_3d = corners_3d.reshape(-1, 3) + + # first reverse the data transformations + xyz_depth = apply_3d_transformation( + points_3d, 'DEPTH', img_metas, reverse=True) + + # project to 2d to get image coords (uv) + uv_origin = points_cam2img(xyz_depth, + xyz_depth.new_tensor(img_metas['depth2img'])) + uv_origin = (uv_origin - 1).round() + imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() + + return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) + + +def draw_camera_bbox3d_on_img(bboxes3d, + raw_img, + cam2img, + img_metas, + color=(0, 255, 0), + thickness=1): + """Project the 3D bbox on 2D plane and draw on input image. + + Args: + bboxes3d (:obj:`CameraInstance3DBoxes`, shape=[M, 7]): + 3d bbox in camera coordinate system to visualize. + raw_img (numpy.array): The numpy array of image. + cam2img (dict): Camera intrinsic matrix, + denoted as `K` in depth bbox coordinate system. + img_metas (dict): Useless here. + color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). + thickness (int, optional): The thickness of bboxes. Default: 1. + """ + from mmcv.core.bbox import points_cam2img + + img = raw_img.copy() + cam2img = copy.deepcopy(cam2img) + corners_3d = bboxes3d.corners + num_bbox = corners_3d.shape[0] + points_3d = corners_3d.reshape(-1, 3) + if not isinstance(cam2img, torch.Tensor): + cam2img = torch.from_numpy(np.array(cam2img)) + cam2img = cam2img.reshape(3, 3).float().cpu() + + # project to 2d to get image coords (uv) + uv_origin = points_cam2img(points_3d, cam2img) + uv_origin = (uv_origin - 1).round() + imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() + + return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) diff --git a/mmcv/core/visualizer/open3d_vis.py b/mmcv/core/visualizer/open3d_vis.py new file mode 100644 index 0000000..0790ee4 --- /dev/null +++ b/mmcv/core/visualizer/open3d_vis.py @@ -0,0 +1,443 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import numpy as np +import torch + +try: + import open3d as o3d + from open3d import geometry +except ImportError: + raise ImportError( + 'Please run "pip install open3d" to install open3d first.') + + +def _draw_points(points, + vis, + points_size=2, + point_color=(0.5, 0.5, 0.5), + mode='xyz'): + """Draw points on visualizer. + + Args: + points (numpy.array | torch.tensor, shape=[N, 3+C]): + points to visualize. + vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. + points_size (int): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float]): the color of points. + Default: (0.5, 0.5, 0.5). + mode (str): indicate type of the input points, avaliable mode + ['xyz', 'xyzrgb']. Default: 'xyz'. + + Returns: + tuple: points, color of each point. 
+ """ + vis.get_render_option().point_size = points_size # set points size + if isinstance(points, torch.Tensor): + points = points.cpu().numpy() + + points = points.copy() + pcd = geometry.PointCloud() + if mode == 'xyz': + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + points_colors = np.tile(np.array(point_color), (points.shape[0], 1)) + elif mode == 'xyzrgb': + pcd.points = o3d.utility.Vector3dVector(points[:, :3]) + points_colors = points[:, 3:6] + # normalize to [0, 1] for open3d drawing + if not ((points_colors >= 0.0) & (points_colors <= 1.0)).all(): + points_colors /= 255.0 + else: + raise NotImplementedError + + pcd.colors = o3d.utility.Vector3dVector(points_colors) + vis.add_geometry(pcd) + + return pcd, points_colors + + +def _draw_bboxes(bbox3d, + vis, + points_colors, + pcd=None, + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox on visualizer and change the color of points inside bbox3d. + + Args: + bbox3d (numpy.array | torch.tensor, shape=[M, 7]): + 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. + vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. + points_colors (numpy.array): color of each points. + pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None. + bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). + points_in_box_color (tuple[float]): + the color of points inside bbox3d. Default: (1, 0, 0). + rot_axis (int): rotation axis of bbox. Default: 2. + center_mode (bool): indicate the center of bbox is bottom center + or gravity center. avaliable mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str): indicate type of the input points, avaliable mode + ['xyz', 'xyzrgb']. Default: 'xyz'. + """ + if isinstance(bbox3d, torch.Tensor): + bbox3d = bbox3d.cpu().numpy() + bbox3d = bbox3d.copy() + + in_box_color = np.array(points_in_box_color) + for i in range(len(bbox3d)): + center = bbox3d[i, 0:3] + dim = bbox3d[i, 3:6] + yaw = np.zeros(3) + yaw[rot_axis] = -bbox3d[i, 6] + rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) + + if center_mode == 'lidar_bottom': + center[rot_axis] += dim[ + rot_axis] / 2 # bottom center to gravity center + elif center_mode == 'camera_bottom': + center[rot_axis] -= dim[ + rot_axis] / 2 # bottom center to gravity center + box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) + + line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) + line_set.paint_uniform_color(bbox_color) + # draw bboxes on visualizer + vis.add_geometry(line_set) + + # change the color of points which are in box + if pcd is not None and mode == 'xyz': + indices = box3d.get_point_indices_within_bounding_box(pcd.points) + points_colors[indices] = in_box_color + + # update points colors + if pcd is not None: + pcd.colors = o3d.utility.Vector3dVector(points_colors) + vis.update_geometry(pcd) + + +def show_pts_boxes(points, + bbox3d=None, + show=True, + save_path=None, + points_size=2, + point_color=(0.5, 0.5, 0.5), + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox and points on visualizer. + + Args: + points (numpy.array | torch.tensor, shape=[N, 3+C]): + points to visualize. + bbox3d (numpy.array | torch.tensor, shape=[M, 7]): + 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None. + show (bool): whether to show the visualization results. Default: True. + save_path (str): path to save visualized results. 
Default: None. + points_size (int): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float]): the color of points. + Default: (0.5, 0.5, 0.5). + bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). + points_in_box_color (tuple[float]): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int): rotation axis of bbox. Default: 2. + center_mode (bool): indicate the center of bbox is bottom center + or gravity center. avaliable mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str): indicate type of the input points, avaliable mode + ['xyz', 'xyzrgb']. Default: 'xyz'. + """ + # TODO: support score and class info + assert 0 <= rot_axis <= 2 + + # init visualizer + vis = o3d.visualization.Visualizer() + vis.create_window() + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[0, 0, 0]) # create coordinate frame + vis.add_geometry(mesh_frame) + + # draw points + pcd, points_colors = _draw_points(points, vis, points_size, point_color, + mode) + + # draw boxes + if bbox3d is not None: + _draw_bboxes(bbox3d, vis, points_colors, pcd, bbox_color, + points_in_box_color, rot_axis, center_mode, mode) + + if show: + vis.run() + + if save_path is not None: + vis.capture_screen_image(save_path) + + vis.destroy_window() + + +def _draw_bboxes_ind(bbox3d, + vis, + indices, + points_colors, + pcd=None, + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + """Draw bbox on visualizer and change the color or points inside bbox3d + with indices. + + Args: + bbox3d (numpy.array | torch.tensor, shape=[M, 7]): + 3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. + vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. + indices (numpy.array | torch.tensor, shape=[N, M]): + indicate which bbox3d that each point lies in. + points_colors (numpy.array): color of each points. + pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None. + bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). + points_in_box_color (tuple[float]): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int): rotation axis of bbox. Default: 2. + center_mode (bool): indicate the center of bbox is bottom center + or gravity center. avaliable mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str): indicate type of the input points, avaliable mode + ['xyz', 'xyzrgb']. Default: 'xyz'. 
+    """
+    if isinstance(bbox3d, torch.Tensor):
+        bbox3d = bbox3d.cpu().numpy()
+    if isinstance(indices, torch.Tensor):
+        indices = indices.cpu().numpy()
+    bbox3d = bbox3d.copy()
+
+    in_box_color = np.array(points_in_box_color)
+    for i in range(len(bbox3d)):
+        center = bbox3d[i, 0:3]
+        dim = bbox3d[i, 3:6]
+        yaw = np.zeros(3)
+        # TODO: fix problem of current coordinate system
+        # dim[0], dim[1] = dim[1], dim[0]  # for current coordinate
+        # yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi)
+        yaw[rot_axis] = -bbox3d[i, 6]
+        rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)
+        if center_mode == 'lidar_bottom':
+            center[rot_axis] += dim[
+                rot_axis] / 2  # bottom center to gravity center
+        elif center_mode == 'camera_bottom':
+            center[rot_axis] -= dim[
+                rot_axis] / 2  # bottom center to gravity center
+        box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)
+
+        line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d)
+        line_set.paint_uniform_color(bbox_color)
+        # draw bboxes on visualizer
+        vis.add_geometry(line_set)
+
+        # change the color of points which are in box
+        if pcd is not None and mode == 'xyz':
+            # use the builtin `bool`; `np.bool` was removed in NumPy >= 1.24
+            points_colors[indices[:, i].astype(bool)] = in_box_color
+
+    # update points colors
+    if pcd is not None:
+        pcd.colors = o3d.utility.Vector3dVector(points_colors)
+        vis.update_geometry(pcd)
+
+
+def show_pts_index_boxes(points,
+                         bbox3d=None,
+                         show=True,
+                         indices=None,
+                         save_path=None,
+                         points_size=2,
+                         point_color=(0.5, 0.5, 0.5),
+                         bbox_color=(0, 1, 0),
+                         points_in_box_color=(1, 0, 0),
+                         rot_axis=2,
+                         center_mode='lidar_bottom',
+                         mode='xyz'):
+    """Draw bbox and points on visualizer with indices that indicate which
+    bbox3d that each point lies in.
+
+    Args:
+        points (numpy.array | torch.tensor, shape=[N, 3+C]):
+            points to visualize.
+        bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
+            3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None.
+        show (bool): whether to show the visualization results. Default: True.
+        indices (numpy.array | torch.tensor, shape=[N, M]):
+            indicate which bbox3d that each point lies in. Default: None.
+        save_path (str): path to save visualized results. Default: None.
+        points_size (int): the size of points to show on visualizer.
+            Default: 2.
+        point_color (tuple[float]): the color of points.
+            Default: (0.5, 0.5, 0.5).
+        bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
+        points_in_box_color (tuple[float]):
+            the color of points which are in bbox3d. Default: (1, 0, 0).
+        rot_axis (int): rotation axis of bbox. Default: 2.
+        center_mode (bool): indicate the center of bbox is bottom center
+            or gravity center. available mode
+            ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
+        mode (str): indicate type of the input points, available mode
+            ['xyz', 'xyzrgb']. Default: 'xyz'.
+ """ + # TODO: support score and class info + assert 0 <= rot_axis <= 2 + + # init visualizer + vis = o3d.visualization.Visualizer() + vis.create_window() + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[0, 0, 0]) # create coordinate frame + vis.add_geometry(mesh_frame) + + # draw points + pcd, points_colors = _draw_points(points, vis, points_size, point_color, + mode) + + # draw boxes + if bbox3d is not None: + _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color, + points_in_box_color, rot_axis, center_mode, mode) + + if show: + vis.run() + + if save_path is not None: + vis.capture_screen_image(save_path) + + vis.destroy_window() + + +class Visualizer(object): + r"""Online visualizer implemented with Open3d. + + Args: + points (numpy.array, shape=[N, 3+C]): Points to visualize. The Points + cloud is in mode of Coord3DMode.DEPTH (please refer to + core.structures.coord_3d_mode). + bbox3d (numpy.array, shape=[M, 7]): 3d bbox (x, y, z, dx, dy, dz, yaw) + to visualize. The 3d bbox is in mode of Box3DMode.DEPTH with + gravity_center (please refer to core.structures.box_3d_mode). + Default: None. + save_path (str): path to save visualized results. Default: None. + points_size (int): the size of points to show on visualizer. + Default: 2. + point_color (tuple[float]): the color of points. + Default: (0.5, 0.5, 0.5). + bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0). + points_in_box_color (tuple[float]): + the color of points which are in bbox3d. Default: (1, 0, 0). + rot_axis (int): rotation axis of bbox. Default: 2. + center_mode (bool): indicate the center of bbox is bottom center + or gravity center. avaliable mode + ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. + mode (str): indicate type of the input points, avaliable mode + ['xyz', 'xyzrgb']. Default: 'xyz'. + """ + + def __init__(self, + points, + bbox3d=None, + save_path=None, + points_size=2, + point_color=(0.5, 0.5, 0.5), + bbox_color=(0, 1, 0), + points_in_box_color=(1, 0, 0), + rot_axis=2, + center_mode='lidar_bottom', + mode='xyz'): + super(Visualizer, self).__init__() + assert 0 <= rot_axis <= 2 + + # init visualizer + self.o3d_visualizer = o3d.visualization.Visualizer() + self.o3d_visualizer.create_window() + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[0, 0, 0]) # create coordinate frame + self.o3d_visualizer.add_geometry(mesh_frame) + + self.points_size = points_size + self.point_color = point_color + self.bbox_color = bbox_color + self.points_in_box_color = points_in_box_color + self.rot_axis = rot_axis + self.center_mode = center_mode + self.mode = mode + self.seg_num = 0 + + # draw points + if points is not None: + self.pcd, self.points_colors = _draw_points( + points, self.o3d_visualizer, points_size, point_color, mode) + + # draw boxes + if bbox3d is not None: + _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, + self.pcd, bbox_color, points_in_box_color, rot_axis, + center_mode, mode) + + def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None): + """Add bounding box to visualizer. + + Args: + bbox3d (numpy.array, shape=[M, 7]): + 3D bbox (x, y, z, dx, dy, dz, yaw) to be visualized. + The 3d bbox is in mode of Box3DMode.DEPTH with + gravity_center (please refer to core.structures.box_3d_mode). + bbox_color (tuple[float]): the color of bbox. Defaule: None. + points_in_box_color (tuple[float]): the color of points which + are in bbox3d. Defaule: None. 
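+
+        Example (illustrative sketch only; inputs are random placeholders):
+            >>> points = np.random.rand(100, 3)
+            >>> visualizer = Visualizer(points)
+            >>> boxes = np.array([[1.0, 1.0, 0.0, 0.5, 0.5, 0.5, 0.0]])
+            >>> visualizer.add_bboxes(bbox3d=boxes)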
+ """ + if bbox_color is None: + bbox_color = self.bbox_color + if points_in_box_color is None: + points_in_box_color = self.points_in_box_color + _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd, + bbox_color, points_in_box_color, self.rot_axis, + self.center_mode, self.mode) + + def add_seg_mask(self, seg_mask_colors): + """Add segmentation mask to visualizer via per-point colorization. + + Args: + seg_mask_colors (numpy.array, shape=[N, 6]): + The segmentation mask whose first 3 dims are point coordinates + and last 3 dims are converted colors. + """ + # we can't draw the colors on existing points + # in case gt and pred mask would overlap + # instead we set a large offset along x-axis for each seg mask + self.seg_num += 1 + offset = (np.array(self.pcd.points).max(0) - + np.array(self.pcd.points).min(0))[0] * 1.2 * self.seg_num + mesh_frame = geometry.TriangleMesh.create_coordinate_frame( + size=1, origin=[offset, 0, 0]) # create coordinate frame for seg + self.o3d_visualizer.add_geometry(mesh_frame) + seg_points = copy.deepcopy(seg_mask_colors) + seg_points[:, 0] += offset + _draw_points( + seg_points, self.o3d_visualizer, self.points_size, mode='xyzrgb') + + def show(self, save_path=None): + """Visualize the points cloud. + + Args: + save_path (str): path to save image. Default: None. + """ + + self.o3d_visualizer.run() + + if save_path is not None: + self.o3d_visualizer.capture_screen_image(save_path) + + self.o3d_visualizer.destroy_window() + return diff --git a/mmcv/core/visualizer/show_result.py b/mmcv/core/visualizer/show_result.py new file mode 100644 index 0000000..eb50be1 --- /dev/null +++ b/mmcv/core/visualizer/show_result.py @@ -0,0 +1,272 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import trimesh +from os import path as osp + +from mmcv.utils import mkdir_or_exist +from mmcv.image import imwrite + +from .image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img, + draw_lidar_bbox3d_on_img) + + +def _write_obj(points, out_filename): + """Write points into ``obj`` format for meshlab visualization. + + Args: + points (np.ndarray): Points in shape (N, dim). + out_filename (str): Filename to be saved. + """ + N = points.shape[0] + fout = open(out_filename, 'w') + for i in range(N): + if points.shape[1] == 6: + c = points[i, 3:].astype(int) + fout.write( + 'v %f %f %f %d %d %d\n' % + (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) + + else: + fout.write('v %f %f %f\n' % + (points[i, 0], points[i, 1], points[i, 2])) + fout.close() + + +def _write_oriented_bbox(scene_bbox, out_filename): + """Export oriented (around Z axis) scene bbox to meshes. + + Args: + scene_bbox(list[ndarray] or ndarray): xyz pos of center and + 3 lengths (dx,dy,dz) and heading angle around Z axis. + Y forward, X right, Z upward. heading angle of positive X is 0, + heading angle of positive Y is 90 degrees. + out_filename(str): Filename. 
+ """ + + def heading2rotmat(heading_angle): + rotmat = np.zeros((3, 3)) + rotmat[2, 2] = 1 + cosval = np.cos(heading_angle) + sinval = np.sin(heading_angle) + rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]]) + return rotmat + + def convert_oriented_box_to_trimesh_fmt(box): + ctr = box[:3] + lengths = box[3:6] + trns = np.eye(4) + trns[0:3, 3] = ctr + trns[3, 3] = 1.0 + trns[0:3, 0:3] = heading2rotmat(box[6]) + box_trimesh_fmt = trimesh.creation.box(lengths, trns) + return box_trimesh_fmt + + if len(scene_bbox) == 0: + scene_bbox = np.zeros((1, 7)) + scene = trimesh.scene.Scene() + for box in scene_bbox: + scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box)) + + mesh_list = trimesh.util.concatenate(scene.dump()) + # save to obj file + trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='obj') + + return + + +def show_result(points, + gt_bboxes, + pred_bboxes, + out_dir, + filename, + show=True, + snapshot=False): + """Convert results into format that is directly readable for meshlab. + + Args: + points (np.ndarray): Points. + gt_bboxes (np.ndarray): Ground truth boxes. + pred_bboxes (np.ndarray): Predicted boxes. + out_dir (str): Path of output directory + filename (str): Filename of the current frame. + show (bool): Visualize the results online. Defaults to False. + snapshot (bool): Whether to save the online results. Defaults to False. + """ + result_path = osp.join(out_dir, filename) + mkdir_or_exist(result_path) + + if show: + from .open3d_vis import Visualizer + + vis = Visualizer(points) + if pred_bboxes is not None: + vis.add_bboxes(bbox3d=pred_bboxes) + if gt_bboxes is not None: + vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1)) + show_path = osp.join(result_path, + f'{filename}_online.png') if snapshot else None + vis.show(show_path) + + if points is not None: + _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) + + if gt_bboxes is not None: + # bottom center to gravity center + gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2 + # the positive direction for yaw in meshlab is clockwise + gt_bboxes[:, 6] *= -1 + _write_oriented_bbox(gt_bboxes, + osp.join(result_path, f'{filename}_gt.obj')) + + if pred_bboxes is not None: + # bottom center to gravity center + pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2 + # the positive direction for yaw in meshlab is clockwise + pred_bboxes[:, 6] *= -1 + _write_oriented_bbox(pred_bboxes, + osp.join(result_path, f'{filename}_pred.obj')) + + +def show_seg_result(points, + gt_seg, + pred_seg, + out_dir, + filename, + palette, + ignore_index=None, + show=True, + snapshot=False): + """Convert results into format that is directly readable for meshlab. + + Args: + points (np.ndarray): Points. + gt_seg (np.ndarray): Ground truth segmentation mask. + pred_seg (np.ndarray): Predicted segmentation mask. + out_dir (str): Path of output directory + filename (str): Filename of the current frame. + palette (np.ndarray): Mapping between class labels and colors. + ignore_index (int, optional): The label index to be ignored, e.g. \ + unannotated points. Defaults to None. + show (bool, optional): Visualize the results online. Defaults to False. + snapshot (bool, optional): Whether to save the online results. \ + Defaults to False. 
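+
+    Example (illustrative call with random placeholder data and an
+        arbitrary output directory):
+        >>> points = np.random.rand(100, 3)
+        >>> gt_seg = np.random.randint(0, 3, 100)
+        >>> pred_seg = np.random.randint(0, 3, 100)
+        >>> palette = np.array([[0, 255, 0], [255, 0, 0], [0, 0, 255]])
+        >>> show_seg_result(points, gt_seg, pred_seg, './viz', 'frame_0000',
+        ...                 palette, show=False)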
+ """ + # we need 3D coordinates to visualize segmentation mask + if gt_seg is not None or pred_seg is not None: + assert points is not None, \ + '3D coordinates are required for segmentation visualization' + + # filter out ignored points + if gt_seg is not None and ignore_index is not None: + if points is not None: + points = points[gt_seg != ignore_index] + if pred_seg is not None: + pred_seg = pred_seg[gt_seg != ignore_index] + gt_seg = gt_seg[gt_seg != ignore_index] + + if gt_seg is not None: + gt_seg_color = palette[gt_seg] + gt_seg_color = np.concatenate([points[:, :3], gt_seg_color], axis=1) + if pred_seg is not None: + pred_seg_color = palette[pred_seg] + pred_seg_color = np.concatenate([points[:, :3], pred_seg_color], + axis=1) + + result_path = osp.join(out_dir, filename) + mkdir_or_exist(result_path) + + # online visualization of segmentation mask + # we show three masks in a row, scene_points, gt_mask, pred_mask + if show: + from .open3d_vis import Visualizer + mode = 'xyzrgb' if points.shape[1] == 6 else 'xyz' + vis = Visualizer(points, mode=mode) + if gt_seg is not None: + vis.add_seg_mask(gt_seg_color) + if pred_seg is not None: + vis.add_seg_mask(pred_seg_color) + show_path = osp.join(result_path, + f'{filename}_online.png') if snapshot else None + vis.show(show_path) + + if points is not None: + _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) + + if gt_seg is not None: + _write_obj(gt_seg_color, osp.join(result_path, f'{filename}_gt.obj')) + + if pred_seg is not None: + _write_obj(pred_seg_color, osp.join(result_path, + f'{filename}_pred.obj')) + + +def show_multi_modality_result(img, + gt_bboxes, + pred_bboxes, + proj_mat, + out_dir, + filename, + box_mode='lidar', + img_metas=None, + show=True, + gt_bbox_color=(61, 102, 255), + pred_bbox_color=(241, 101, 72)): + """Convert multi-modality detection results into 2D results. + + Project the predicted 3D bbox to 2D image plane and visualize them. + + Args: + img (np.ndarray): The numpy array of image in cv2 fashion. + gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes. + pred_bboxes (:obj:`BaseInstance3DBoxes`): Predicted boxes. + proj_mat (numpy.array, shape=[4, 4]): The projection matrix + according to the camera intrinsic parameters. + out_dir (str): Path of output directory. + filename (str): Filename of the current frame. + box_mode (str): Coordinate system the boxes are in. Should be one of + 'depth', 'lidar' and 'camera'. Defaults to 'lidar'. + img_metas (dict): Used in projecting depth bbox. + show (bool): Visualize the results online. Defaults to False. + gt_bbox_color (str or tuple(int)): Color of bbox lines. + The tuple of color should be in BGR order. Default: (255, 102, 61) + pred_bbox_color (str or tuple(int)): Color of bbox lines. + The tuple of color should be in BGR order. 
Default: (72, 101, 241) + """ + if box_mode == 'depth': + draw_bbox = draw_depth_bbox3d_on_img + elif box_mode == 'lidar': + draw_bbox = draw_lidar_bbox3d_on_img + elif box_mode == 'camera': + draw_bbox = draw_camera_bbox3d_on_img + else: + raise NotImplementedError(f'unsupported box mode {box_mode}') + + result_path = osp.join(out_dir, filename) + mkdir_or_exist(result_path) + + if show: + show_img = img.copy() + if gt_bboxes is not None: + show_img = draw_bbox( + gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color) + if pred_bboxes is not None: + show_img = draw_bbox( + pred_bboxes, + show_img, + proj_mat, + img_metas, + color=pred_bbox_color) + mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0) + + if img is not None: + imwrite(img, osp.join(result_path, f'{filename}_img.png')) + + if gt_bboxes is not None: + gt_img = draw_bbox( + gt_bboxes, img, proj_mat, img_metas, color=gt_bbox_color) + imwrite(gt_img, osp.join(result_path, f'{filename}_gt.png')) + + if pred_bboxes is not None: + pred_img = draw_bbox( + pred_bboxes, img, proj_mat, img_metas, color=pred_bbox_color) + imwrite(pred_img, osp.join(result_path, f'{filename}_pred.png')) diff --git a/mmcv/core/voxel/__init__.py b/mmcv/core/voxel/__init__.py new file mode 100644 index 0000000..8d69543 --- /dev/null +++ b/mmcv/core/voxel/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import build_voxel_generator +from .voxel_generator import VoxelGenerator + +__all__ = ['build_voxel_generator', 'VoxelGenerator'] diff --git a/mmcv/core/voxel/builder.py b/mmcv/core/voxel/builder.py new file mode 100644 index 0000000..d7fe494 --- /dev/null +++ b/mmcv/core/voxel/builder.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . import voxel_generator +from mmcv.utils import obj_from_dict + +def build_voxel_generator(cfg, **kwargs): + """Builder of voxel generator.""" + if isinstance(cfg, voxel_generator.VoxelGenerator): + return cfg + elif isinstance(cfg, dict): + return obj_from_dict( + cfg, voxel_generator, default_args=kwargs) + else: + raise TypeError('Invalid type {} for building a sampler'.format( + type(cfg))) diff --git a/mmcv/core/voxel/voxel_generator.py b/mmcv/core/voxel/voxel_generator.py new file mode 100644 index 0000000..615b749 --- /dev/null +++ b/mmcv/core/voxel/voxel_generator.py @@ -0,0 +1,280 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numba +import numpy as np + + +class VoxelGenerator(object): + """Voxel generator in numpy implementation. + + Args: + voxel_size (list[float]): Size of a single voxel + point_cloud_range (list[float]): Range of points + max_num_points (int): Maximum number of points in a single voxel + max_voxels (int, optional): Maximum number of voxels. + Defaults to 20000. 
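+
+    Example (illustrative values only):
+        >>> generator = VoxelGenerator(
+        ...     voxel_size=[0.5, 0.5, 0.5],
+        ...     point_cloud_range=[0, -40, -3, 70.4, 40, 1],
+        ...     max_num_points=35)
+        >>> points = np.random.rand(100, 4).astype(np.float32)
+        >>> # voxels: [M, 35, 4], coors: [M, 3] (zyx order), num_points: [M]
+        >>> voxels, coors, num_points = generator.generate(points)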
+ """ + + def __init__(self, + voxel_size, + point_cloud_range, + max_num_points, + max_voxels=20000): + + point_cloud_range = np.array(point_cloud_range, dtype=np.float32) + # [0, -40, -3, 70.4, 40, 1] + voxel_size = np.array(voxel_size, dtype=np.float32) + grid_size = (point_cloud_range[3:] - + point_cloud_range[:3]) / voxel_size + grid_size = np.round(grid_size).astype(np.int64) + + self._voxel_size = voxel_size + self._point_cloud_range = point_cloud_range + self._max_num_points = max_num_points + self._max_voxels = max_voxels + self._grid_size = grid_size + + def generate(self, points): + """Generate voxels given points.""" + return points_to_voxel(points, self._voxel_size, + self._point_cloud_range, self._max_num_points, + True, self._max_voxels) + + @property + def voxel_size(self): + """list[float]: Size of a single voxel.""" + return self._voxel_size + + @property + def max_num_points_per_voxel(self): + """int: Maximum number of points per voxel.""" + return self._max_num_points + + @property + def point_cloud_range(self): + """list[float]: Range of point cloud.""" + return self._point_cloud_range + + @property + def grid_size(self): + """np.ndarray: The size of grids.""" + return self._grid_size + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + indent = ' ' * (len(repr_str) + 1) + repr_str += f'(voxel_size={self._voxel_size},\n' + repr_str += indent + 'point_cloud_range=' + repr_str += f'{self._point_cloud_range.tolist()},\n' + repr_str += indent + f'max_num_points={self._max_num_points},\n' + repr_str += indent + f'max_voxels={self._max_voxels},\n' + repr_str += indent + f'grid_size={self._grid_size.tolist()}' + repr_str += ')' + return repr_str + + +def points_to_voxel(points, + voxel_size, + coors_range, + max_points=35, + reverse_index=True, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. + + Args: + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \ + points[:, 3:] contain other information such as reflectivity. + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size + coors_range (list[float | tuple[float] | ndarray]): Voxel range. \ + format: xyzxyz, minmax + max_points (int): Indicate maximum points contained in a voxel. + reverse_index (bool): Whether return reversed coordinates. \ + if points has xyz format and reverse_index is True, output \ + coordinates will be zyx format, but points in features always \ + xyz format. + max_voxels (int): Maximum number of voxels this function creates. \ + For second, 20000 is a good choice. Points should be shuffled for \ + randomness before this function because max_voxels drops points. + + Returns: + tuple[np.ndarray]: + voxels: [M, max_points, ndim] float tensor. only contain points. + coordinates: [M, 3] int32 tensor. + num_points_per_voxel: [M] int32 tensor. + """ + if not isinstance(voxel_size, np.ndarray): + voxel_size = np.array(voxel_size, dtype=points.dtype) + if not isinstance(coors_range, np.ndarray): + coors_range = np.array(coors_range, dtype=points.dtype) + voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size + voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) + if reverse_index: + voxelmap_shape = voxelmap_shape[::-1] + # don't create large array in jit(nopython=True) code. 
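+    # The buffers below are pre-allocated at their maximum possible size
+    # (max_voxels) and filled in by the numba kernels; only the first
+    # `voxel_num` entries are kept afterwards. `coor_to_voxelidx` maps each
+    # grid cell to its voxel index, with -1 marking still-empty cells.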
+ num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32) + coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) + voxels = np.zeros( + shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype) + coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) + if reverse_index: + voxel_num = _points_to_voxel_reverse_kernel( + points, voxel_size, coors_range, num_points_per_voxel, + coor_to_voxelidx, voxels, coors, max_points, max_voxels) + + else: + voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range, + num_points_per_voxel, + coor_to_voxelidx, voxels, coors, + max_points, max_voxels) + + coors = coors[:voxel_num] + voxels = voxels[:voxel_num] + num_points_per_voxel = num_points_per_voxel[:voxel_num] + + return voxels, coors, num_points_per_voxel + + +@numba.jit(nopython=True) +def _points_to_voxel_reverse_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. + + Args: + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \ + points[:, 3:] contain other information such as reflectivity. + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size \ + coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \ + format: xyzxyz, minmax + num_points_per_voxel (int): Number of points per voxel. + coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), \ + which has the same shape as the complete voxel map. It indicates \ + the index of each corresponding voxel. + voxels (np.ndarray): Created empty voxels. + coors (np.ndarray): Created coordinates of each voxel. + max_points (int): Indicate maximum points contained in a voxel. + max_voxels (int): Maximum number of voxels this function create. \ + for second, 20000 is a good choice. Points should be shuffled for \ + randomness before this function because max_voxels drops points. + + Returns: + tuple[np.ndarray]: + voxels: Shape [M, max_points, ndim], only contain points. + coordinates: Shape [M, 3]. + num_points_per_voxel: Shape [M]. + """ + # put all computations to one loop. + # we shouldn't create large array in main jit code, otherwise + # reduce performance + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + ndim_minus_1 = ndim - 1 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # np.round(grid_size) + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[ndim_minus_1 - j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num + + +@numba.jit(nopython=True) +def _points_to_voxel_kernel(points, + voxel_size, + coors_range, + num_points_per_voxel, + coor_to_voxelidx, + voxels, + coors, + max_points=35, + max_voxels=20000): + """convert kitti points(N, >=3) to voxels. 
+ + Args: + points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \ + points[:, 3:] contain other information such as reflectivity. + voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size. + coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \ + format: xyzxyz, minmax + num_points_per_voxel (int): Number of points per voxel. + coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), \ + which has the same shape as the complete voxel map. It indicates \ + the index of each corresponding voxel. + voxels (np.ndarray): Created empty voxels. + coors (np.ndarray): Created coordinates of each voxel. + max_points (int): Indicate maximum points contained in a voxel. + max_voxels (int): Maximum number of voxels this function create. \ + for second, 20000 is a good choice. Points should be shuffled for \ + randomness before this function because max_voxels drops points. + + Returns: + tuple[np.ndarray]: + voxels: Shape [M, max_points, ndim], only contain points. + coordinates: Shape [M, 3]. + num_points_per_voxel: Shape [M]. + """ + N = points.shape[0] + # ndim = points.shape[1] - 1 + ndim = 3 + grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size + # grid_size = np.round(grid_size).astype(np.int64)(np.int32) + grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) + + # lower_bound = coors_range[:3] + # upper_bound = coors_range[3:] + coor = np.zeros(shape=(3, ), dtype=np.int32) + voxel_num = 0 + failed = False + for i in range(N): + failed = False + for j in range(ndim): + c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) + if c < 0 or c >= grid_size[j]: + failed = True + break + coor[j] = c + if failed: + continue + voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] + if voxelidx == -1: + voxelidx = voxel_num + if voxel_num >= max_voxels: + continue + voxel_num += 1 + coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx + coors[voxelidx] = coor + num = num_points_per_voxel[voxelidx] + if num < max_points: + voxels[voxelidx, num] = points[i] + num_points_per_voxel[voxelidx] += 1 + return voxel_num diff --git a/mmcv/datasets/B2D_dataset.py b/mmcv/datasets/B2D_dataset.py new file mode 100644 index 0000000..530c8eb --- /dev/null +++ b/mmcv/datasets/B2D_dataset.py @@ -0,0 +1,504 @@ +import copy +import numpy as np +from mmcv.datasets import DATASETS +from os import path as osp +import torch +from pyquaternion import Quaternion +from mmcv.utils import save_tensor +from mmcv.parallel import DataContainer as DC +import random +from .custom_3d import Custom3DDataset +from .pipelines import Compose +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from mmcv.fileio.io import load, dump +from mmcv.utils import track_iter_progress, mkdir_or_exist +import tempfile +from .nuscenes_styled_eval_utils import DetectionMetrics, EvalBoxes, DetectionBox,center_distance,accumulate,DetectionMetricDataList,calc_ap, calc_tp, quaternion_yaw +import json + +@DATASETS.register_module() +class B2D_Dataset(Custom3DDataset): + + + def __init__(self, queue_length=4, bev_size=(200, 200),overlap_test=False,with_velocity=True,sample_interval=5,name_mapping= None,eval_cfg = None ,*args, **kwargs): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.with_velocity = with_velocity + if name_mapping is not None: + self.NameMapping = name_mapping + else: + self.NameMapping = { + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + 
"vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + "traffic.speed_limit.30": 'speed_limit', + "traffic.speed_limit.40": 'speed_limit', + "traffic.speed_limit.50": 'speed_limit', + "traffic.speed_limit.60": 'speed_limit', + "traffic.traffic_light": 'traffic_light', + "traffic.stop": 'stop', + } + if eval_cfg is not None: + self.eval_cfg = eval_cfg + else: + self.eval_cfg = { + "dist_ths": [0.5, 1.0, 2.0, 4.0], + "dist_th_tp": 2.0, + "min_recall": 0.1, + "min_precision": 0.1, + "mean_ap_weight": 5, + "class_names":['car','van','bicycle'], + "tp_metrics":['trans_err', 'scale_err', 'orient_err', 'vel_err'], + "err_name_maping":{'trans_err': 'mATE','scale_err': 'mASE','orient_err': 'mAOE','vel_err': 'mAVE','attr_err': 'mAAE'} + } + self.sample_interval = sample_interval + + + def invert_pose(self, pose): + inv_pose = np.eye(4) + inv_pose[:3, :3] = np.transpose(pose[:3, :3]) + inv_pose[:3, -1] = - inv_pose[:3, :3] @ pose[:3, -1] + return inv_pose + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + queue = [] + index_list = list(range(index-self.queue_length*self.sample_interval, index,self.sample_interval)) + random.shuffle(index_list) + index_list = sorted(index_list[1:]) + index_list.append(index) + for i in index_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + queue.append(example) + return self.union2one(queue) + + + def union2one(self, queue): + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_scene_token = None + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if metas_map[i]['folder'] != prev_scene_token: + metas_map[i]['prev_bev_exists'] = False + prev_scene_token = metas_map[i]['folder'] + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev_exists'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. 
+ + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + for i in range(len(info['gt_names'])): + if info['gt_names'][i] in self.NameMapping.keys(): + info['gt_names'][i] = self.NameMapping[info['gt_names'][i]] + + input_dict = dict( + folder=info['folder'], + scene_token=info['folder'], + frame_idx=info['frame_idx'], + ego_yaw=np.nan_to_num(info['ego_yaw'],nan=90), + ego_translation=info['ego_translation'], + sensors=info['sensors'], + gt_ids=info['gt_ids'], + gt_boxes=info['gt_boxes'], + gt_names=info['gt_names'], + ego_vel = info['ego_vel'], + ego_accel = info['ego_accel'], + ego_rotation_rate = info['ego_rotation_rate'], + ) + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego'] + for sensor_type, cam_info in info['sensors'].items(): + if not 'CAM' in sensor_type: + continue + image_paths.append(osp.join(self.data_root,cam_info['data_path'])) + cam2ego = cam_info['cam2ego'] + intrinsic = cam_info['intrinsic'] + intrinsic_pad = np.eye(4) + intrinsic_pad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2cam = self.invert_pose(cam2ego) @ lidar2ego + lidar2img = intrinsic_pad @ lidar2cam + lidar2img_rts.append(lidar2img) + cam_intrinsics.append(intrinsic_pad) + lidar2cam_rts.append(lidar2cam) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + yaw = input_dict['ego_yaw'] + rotation = list(Quaternion(axis=[0, 0, 1], radians=yaw)) + if yaw < 0: + yaw += 2*np.pi + yaw_in_degree = yaw / np.pi * 180 + can_bus = np.zeros(18) + can_bus[:3] = input_dict['ego_translation'] + can_bus[3:7] = rotation + can_bus[7:10] = input_dict['ego_vel'] + can_bus[10:13] = input_dict['ego_accel'] + can_bus[13:16] = input_dict['ego_rotation_rate'] + can_bus[16] = yaw + can_bus[17] = yaw_in_degree + input_dict['can_bus'] = can_bus + + return input_dict + + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. 
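+
+        Note:
+            Each row of ``info['gt_boxes']`` carries
+            ``(x, y, z, dx, dy, dz, yaw)`` plus two velocity components
+            ``(vx, vy)``; the velocity columns are dropped when
+            ``self.with_velocity`` is False.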
+ """ + info = self.data_infos[index] + # filter out bbox containing no points + mask = (info['num_points'] >= -1) + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_labels_3d = [] + + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + if not self.with_velocity: + gt_bboxes_3d = gt_bboxes_3d[:,0:7] + + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d) + return anns_results + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + result_path = result_files['pts_bbox'] + with open(result_path) as f: + result_data = json.load(f) + pred_boxes = EvalBoxes.deserialize(result_data['results'], DetectionBox) + meta = result_data['meta'] + + + gt_boxes = self.load_gt() + + metric_data_list = DetectionMetricDataList() + for class_name in self.eval_cfg['class_names']: + for dist_th in self.eval_cfg['dist_ths']: + md = accumulate(gt_boxes, pred_boxes, class_name, center_distance, dist_th) + metric_data_list.set(class_name, dist_th, md) + metrics = DetectionMetrics(self.eval_cfg) + + for class_name in self.eval_cfg['class_names']: + # Compute APs. + for dist_th in self.eval_cfg['dist_ths']: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.eval_cfg['min_recall'], self.eval_cfg['min_precision']) + metrics.add_label_ap(class_name, dist_th, ap) + + # Compute TP metrics. 
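+            # TP errors (translation / scale / orientation / velocity) are
+            # accumulated only at the single matching threshold
+            # `dist_th_tp` (2.0 m by default), following the nuScenes-style
+            # evaluation protocol.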
+ for metric_name in self.eval_cfg['tp_metrics']: + metric_data = metric_data_list[(class_name, self.eval_cfg['dist_th_tp'])] + tp = calc_tp(metric_data, self.eval_cfg['min_recall'], metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + metrics_summary = metrics.serialize() + metrics_summary['meta'] = meta.copy() + print('mAP: %.4f' % (metrics_summary['mean_ap'])) + err_name_mapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + } + for tp_name, tp_val in metrics_summary['tp_errors'].items(): + print('%s: %.4f' % (err_name_mapping[tp_name], tp_val)) + print('NDS: %.4f' % (metrics_summary['nd_score'])) + #print('Eval time: %.1fs' % metrics_summary['eval_time']) + + # Print per-class metrics. + print() + print('Per-class results:') + print('Object Class\tAP\tATE\tASE\tAOE\tAVE') + class_aps = metrics_summary['mean_dist_aps'] + class_tps = metrics_summary['label_tp_errors'] + for class_name in class_aps.keys(): + print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' + % (class_name, class_aps[class_name], + class_tps[class_name]['trans_err'], + class_tps[class_name]['scale_err'], + class_tps[class_name]['orient_err'], + class_tps[class_name]['vel_err'])) + + detail = dict() + metric_prefix = 'bbox_NuScenes' + for name in self.eval_cfg['class_names']: + for k, v in metrics_summary['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics_summary['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics_summary['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix,self.eval_cfg['err_name_maping'][k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics_summary['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics_summary['mean_ap'] + + + return detail + + + def load_gt(self): + all_annotations = EvalBoxes() + for i in range(len(self.data_infos)): + sample_boxes = [] + sample_data = self.data_infos[i] + + gt_boxes = sample_data['gt_boxes'] + + for j in range(gt_boxes.shape[0]): + class_name = self.NameMapping[sample_data['gt_names'][j]] + if not class_name in self.eval_cfg['class_range'].keys(): + continue + range_x, range_y = self.eval_cfg['class_range'][class_name] + if abs(gt_boxes[j,0]) > range_x or abs(gt_boxes[j,1]) > range_y: + continue + sample_boxes.append(DetectionBox( + sample_token=sample_data['folder']+'_'+str(sample_data['frame_idx']), + translation=gt_boxes[j,0:3], + size=gt_boxes[j,3:6], + rotation=list(Quaternion(axis=[0, 0, 1], radians=-gt_boxes[j,6]-np.pi/2)), + velocity=gt_boxes[j,7:9], + num_pts=int(sample_data['num_points'][j]), + detection_name=self.NameMapping[sample_data['gt_names'][j]], + detection_score=-1.0, + attribute_name=self.NameMapping[sample_data['gt_names'][j]] + )) + all_annotations.add_boxes(sample_data['folder']+'_'+str(sample_data['frame_idx']), sample_boxes) + return all_annotations + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
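+
+        Note:
+            The dumped ``results_nusc.json`` mirrors the nuScenes submission
+            layout: a ``meta`` dict (the dataset modality) and a ``results``
+            dict keyed by sample token (``folder + '_' + frame_idx``), each
+            entry holding the list of predicted box dicts.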
+ """ + + + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(track_iter_progress(results)): + #pdb.set_trace() + annos = [] + box3d = det['boxes_3d'] + scores = det['scores_3d'] + labels = det['labels_3d'] + box_gravity_center = box3d.gravity_center + box_dims = box3d.dims + box_yaw = box3d.yaw.numpy() + box_yaw = -box_yaw - np.pi / 2 + sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx']) + + + + for i in range(len(box3d)): + #import pdb;pdb.set_trace() + quat = list(Quaternion(axis=[0, 0, 1], radians=box_yaw[i])) + velocity = [box3d.tensor[i, 7].item(),box3d.tensor[i, 8].item()] + name = mapped_class_names[labels[i]] + nusc_anno = dict( + sample_token=sample_token, + translation=box_gravity_center[i].tolist(), + size=box_dims[i].tolist(), + rotation=quat, + velocity=velocity, + detection_name=name, + detection_score=scores[i].item(), + attribute_name=name) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + # assert len(results) == len(self), ( + # 'The length of results is not equal to the dataset len: {} != {}'. 
+ # format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + diff --git a/mmcv/datasets/B2D_e2e_dataset.py b/mmcv/datasets/B2D_e2e_dataset.py new file mode 100644 index 0000000..9f5b4e0 --- /dev/null +++ b/mmcv/datasets/B2D_e2e_dataset.py @@ -0,0 +1,855 @@ +import copy +import numpy as np +import os +from os import path as osp +import torch +import random +import json, pickle +import tempfile +import cv2 +from pyquaternion import Quaternion +from mmcv.datasets import DATASETS +from mmcv.utils import save_tensor +from mmcv.parallel import DataContainer as DC +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from mmcv.fileio.io import load, dump +from mmcv.utils import track_iter_progress, mkdir_or_exist +from mmcv.datasets.pipelines import to_tensor +from .custom_3d import Custom3DDataset +from .pipelines import Compose +from .nuscenes_styled_eval_utils import DetectionMetrics, EvalBoxes, DetectionBox,center_distance,accumulate,DetectionMetricDataList,calc_ap, calc_tp, quaternion_yaw +from prettytable import PrettyTable + + + +@DATASETS.register_module() +class B2D_E2E_Dataset(Custom3DDataset): + def __init__(self, queue_length=4, bev_size=(200, 200),overlap_test=False,with_velocity=True,sample_interval=5,name_mapping= None,eval_cfg = None, map_root =None,map_file=None,past_frames=4, future_frames=4,predict_frames=12,planning_frames=6,patch_size = [102.4, 102.4],point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] ,occ_receptive_field=3,occ_n_future=6,occ_filter_invalid_sample=False,occ_filter_by_valid_flag=False,eval_mod=None,*args, **kwargs): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.bev_size = (200, 200) + self.overlap_test = overlap_test + self.with_velocity = with_velocity + self.NameMapping = name_mapping + self.eval_cfg = eval_cfg + self.sample_interval = sample_interval + self.past_frames = past_frames + self.future_frames = future_frames + self.predict_frames = predict_frames + self.planning_frames = planning_frames + self.map_root = map_root + self.map_file = map_file + self.point_cloud_range = np.array(point_cloud_range) + self.patch_size = patch_size + self.occ_receptive_field = occ_receptive_field # past + current + self.occ_n_future = occ_n_future # future only + self.occ_filter_invalid_sample = occ_filter_invalid_sample + self.occ_filter_by_valid_flag = occ_filter_by_valid_flag + self.occ_only_total_frames = 7 # NOTE: hardcode, not influenced by planning + self.eval_mod = eval_mod + self.map_element_class = {'Broken':0, 'Solid':1, 'SolidSolid':2,'Center':3,'TrafficLight':4,'StopSign':5} + with open(self.map_file,'rb') as f: + self.map_infos = pickle.load(f) + + def invert_pose(self, pose): + inv_pose = np.eye(4) + inv_pose[:3, :3] = np.transpose(pose[:3, :3]) + inv_pose[:3, -1] = - inv_pose[:3, :3] @ pose[:3, -1] + return inv_pose + + def prepare_train_data(self, index): + """ + Training data preparation. 
+ Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + queue = [] + index_list = list(range(index-self.queue_length*self.sample_interval, index,self.sample_interval)) + random.shuffle(index_list) + index_list = sorted(index_list[1:]) + index_list.append(index) + for i in index_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + queue.append(example) + return self.union2one(queue) + + def union2one(self, queue): + imgs_list = [each['img'].data for each in queue] + gt_labels_3d_list = [each['gt_labels_3d'].data for each in queue] + gt_sdc_label_list = [each['gt_sdc_label'].data for each in queue] + gt_inds_list = [to_tensor(each['gt_inds']) for each in queue] + gt_bboxes_3d_list = [each['gt_bboxes_3d'].data for each in queue] + gt_past_traj_list = [to_tensor(each['gt_past_traj']) for each in queue] + gt_past_traj_mask_list = [ to_tensor(each['gt_past_traj_mask']) for each in queue] + gt_sdc_bbox_list = [each['gt_sdc_bbox'].data for each in queue] + l2g_r_mat_list = [to_tensor(each['l2g_r_mat']) for each in queue] + l2g_t_list = [to_tensor(each['l2g_t']) for each in queue] + timestamp_list = [to_tensor(each['timestamp']) for each in queue] + gt_fut_traj = to_tensor(queue[-1]['gt_fut_traj']) + gt_fut_traj_mask = to_tensor(queue[-1]['gt_fut_traj_mask']) + if 'gt_future_boxes' in queue[-1]: + gt_future_boxes_list = queue[-1]['gt_future_boxes'] + else: + gt_future_boxes_list = None + if 'gt_future_labels' in queue[-1]: + gt_future_labels_list = [to_tensor(each) for each in queue[-1]['gt_future_labels']] + else: + gt_future_labels_list = None + + metas_map = {} + prev_scene_token = None + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if metas_map[i]['folder'] != prev_scene_token: + metas_map[i]['prev_bev_exists'] = False + prev_scene_token = metas_map[i]['folder'] + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev_exists'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + queue['gt_labels_3d'] = DC(gt_labels_3d_list) + queue['gt_sdc_label'] = DC(gt_sdc_label_list) + queue['gt_inds'] = DC(gt_inds_list) + queue['gt_bboxes_3d'] = DC(gt_bboxes_3d_list, cpu_only=True) + queue['gt_sdc_bbox'] = DC(gt_sdc_bbox_list, cpu_only=True) + queue['l2g_r_mat'] = DC(l2g_r_mat_list) + queue['l2g_t'] = DC(l2g_t_list) + queue['timestamp'] = DC(timestamp_list) + queue['gt_fut_traj'] = DC(gt_fut_traj) + queue['gt_fut_traj_mask'] = DC(gt_fut_traj_mask) + queue['gt_past_traj'] = DC(gt_past_traj_list) + queue['gt_past_traj_mask'] = DC(gt_past_traj_mask_list) + if gt_future_boxes_list is not None: + queue['gt_future_boxes'] = DC(gt_future_boxes_list, cpu_only=True) + if gt_future_labels_list is not None: + 
queue['gt_future_labels'] = DC(gt_future_labels_list) + + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + + for i in range(len(info['gt_names'])): + if info['gt_names'][i] in self.NameMapping.keys(): + info['gt_names'][i] = self.NameMapping[info['gt_names'][i]] + + + gt_masks,gt_labels,gt_bboxes = self.get_map_info(index) + + + input_dict = dict( + folder=info['folder'], + scene_token=info['folder'], + frame_idx=info['frame_idx'], + ego_yaw=np.nan_to_num(info['ego_yaw'],nan=np.pi/2), + ego_translation=info['ego_translation'], + sensors=info['sensors'], + world2lidar=info['sensors']['LIDAR_TOP']['world2lidar'], + gt_ids=info['gt_ids'], + gt_boxes=info['gt_boxes'], + gt_names=info['gt_names'], + ego_vel = info['ego_vel'], + ego_accel = info['ego_accel'], + ego_rotation_rate = info['ego_rotation_rate'], + npc2world = info['npc2world'], + gt_lane_labels=gt_labels, + gt_lane_bboxes=gt_bboxes, + gt_lane_masks=gt_masks, + timestamp=info['frame_idx']/10 + + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego'] + for sensor_type, cam_info in info['sensors'].items(): + if not 'CAM' in sensor_type: + continue + image_paths.append(osp.join(self.data_root,cam_info['data_path'])) + # obtain lidar to image transformation matrix + cam2ego = cam_info['cam2ego'] + intrinsic = cam_info['intrinsic'] + intrinsic_pad = np.eye(4) + intrinsic_pad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2cam = self.invert_pose(cam2ego) @ lidar2ego + lidar2img = intrinsic_pad @ lidar2cam + lidar2img_rts.append(lidar2img) + cam_intrinsics.append(intrinsic_pad) + lidar2cam_rts.append(lidar2cam) + ego2world = np.eye(4) + ego2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=input_dict['ego_yaw']).rotation_matrix + ego2world[0:3,3] = input_dict['ego_translation'] + lidar2global = ego2world @ lidar2ego + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + l2g_r_mat=lidar2global[0:3,0:3], + l2g_t=lidar2global[0:3,3] + + )) + + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + yaw = input_dict['ego_yaw'] + rotation = list(Quaternion(axis=[0, 0, 1], radians=yaw)) + if yaw < 0: + yaw += 2*np.pi + yaw_in_degree = yaw / np.pi * 180 + + can_bus = np.zeros(18) + can_bus[:3] = input_dict['ego_translation'] + can_bus[3:7] = rotation + can_bus[7:10] = input_dict['ego_vel'] + can_bus[10:13] = input_dict['ego_accel'] + can_bus[13:16] = input_dict['ego_rotation_rate'] + can_bus[16] = yaw + can_bus[17] = yaw_in_degree + input_dict['can_bus'] = can_bus + all_frames = [] + for adj_idx in range(index-self.occ_receptive_field+1,index+self.occ_n_future+1): + if adj_idx<0 or adj_idx>=len(self.data_infos): + all_frames.append(-1) + elif self.data_infos[adj_idx]['folder'] != 
self.data_infos[index]['folder']: + all_frames.append(-1) + else: + all_frames.append(adj_idx) + + future_frames = all_frames[self.occ_receptive_field-1:] + input_dict['occ_has_invalid_frame'] = (-1 in all_frames[:self.occ_only_total_frames]) + input_dict['occ_img_is_valid'] = np.array(all_frames) >= 0 + occ_future_ann_infos = [] + for future_frame in future_frames: + if future_frame >= 0: + occ_future_ann_infos.append( + self.get_ann_boxes_only(future_frame), + ) + else: + occ_future_ann_infos.append(None) + input_dict['occ_future_ann_infos'] = occ_future_ann_infos + + input_dict.update(self.occ_get_transforms(future_frames)) + sdc_planning, sdc_planning_mask = self.get_ego_future_xy(index,self.sample_interval,self.planning_frames) + input_dict['sdc_planning'] = sdc_planning + input_dict['sdc_planning_mask'] = sdc_planning_mask + command = info['command_near'] + if command < 0: + command = 4 + command -= 1 + input_dict['command'] = command + + return input_dict + + + def get_map_info(self, index): + + gt_masks = [] + gt_labels = [] + gt_bboxes = [] + + ann_info = self.data_infos[index] + town_name = ann_info['town_name'] + map_info = self.map_infos[town_name] + lane_points = map_info['lane_points'] + lane_sample_points = map_info['lane_sample_points'] + lane_types = map_info['lane_types'] + trigger_volumes_points = map_info['trigger_volumes_points'] + trigger_volumes_sample_points = map_info['trigger_volumes_sample_points'] + trigger_volumes_types = map_info['trigger_volumes_types'] + world2lidar = np.array(ann_info['sensors']['LIDAR_TOP']['world2lidar']) + ego_xy = np.linalg.inv(world2lidar)[0:2,3] + + #1st search + max_distance = 100 + chosed_idx = [] + for idx in range(len(lane_sample_points)): + single_sample_points = lane_sample_points[idx] + distance = np.linalg.norm((single_sample_points[:,0:2]-ego_xy),axis=-1) + if np.min(distance) < max_distance: + chosed_idx.append(idx) + + for idx in chosed_idx: + if not lane_types[idx] in self.map_element_class.keys(): + continue + points = lane_points[idx] + points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1) + points_in_ego = (world2lidar @ points.T).T + #print(points_in_ego) + mask = (points_in_ego[:,0]>self.point_cloud_range[0]) & (points_in_ego[:,0]self.point_cloud_range[1]) & (points_in_ego[:,1] 1: + gt_mask = np.zeros(self.bev_size,dtype=np.uint8) + normalized_points = np.zeros_like(points_in_ego_range) + normalized_points[:,0] = (points_in_ego_range[:,0] + self.patch_size[0]/2)*(self.bev_size[0]/self.patch_size[0]) + normalized_points[:,1] = (points_in_ego_range[:,1] + self.patch_size[1]/2)*(self.bev_size[1]/self.patch_size[1]) + cv2.polylines(gt_mask, [normalized_points.astype(np.int32)], False, color=1, thickness=2) + gt_label = self.map_element_class[lane_types[idx]] + gt_masks.append(gt_mask) + gt_labels.append(gt_label) + ys, xs = np.where(gt_mask==1) + gt_bboxes.append([min(xs), min(ys), max(xs), max(ys)]) + + for idx in range(len(trigger_volumes_points)): + if not trigger_volumes_types[idx] in self.map_element_class.keys(): + continue + points = trigger_volumes_points[idx] + points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1) + points_in_ego = (world2lidar @ points.T).T + mask = (points_in_ego[:,0]>self.point_cloud_range[0]) & (points_in_ego[:,0]self.point_cloud_range[1]) & (points_in_ego[:,1]= -1) + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_inds = info['gt_ids'] + gt_labels_3d = [] + + for cat in gt_names_3d: + if cat in self.CLASSES: + 
gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + if not self.with_velocity: + gt_bboxes_3d = gt_bboxes_3d[:,0:7] + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + ego_future_track, ego_future_mask = self.get_ego_future_xy(index,self.sample_interval,self.predict_frames) + past_track, past_mask = self.get_past_or_future_xy(index,self.sample_interval,self.past_frames,past_or_future='past',local_xy=True) + predict_track, predict_mask = self.get_past_or_future_xy(index,self.sample_interval,self.predict_frames,past_or_future='future',local_xy=False) + mask = (past_mask.sum((1,2))>0).astype(np.int) + future_track = predict_track[:,0:self.future_frames,:]*mask[:,None,None] + future_mask = predict_mask[:,0:self.future_frames,:]*mask[:,None,None] + full_past_track = np.concatenate([past_track,future_track],axis=1) + full_past_mask = np.concatenate([past_mask,future_mask],axis=1) + gt_sdc_bbox, gt_sdc_label =self.generate_sdc_info(index) + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d, + gt_inds=gt_inds, + gt_fut_traj=predict_track, + gt_fut_traj_mask=predict_mask, + gt_past_traj=full_past_track, + gt_past_traj_mask=full_past_mask, + gt_sdc_bbox=gt_sdc_bbox, + gt_sdc_label=gt_sdc_label, + gt_sdc_fut_traj=ego_future_track[:,:,0:2], + gt_sdc_fut_traj_mask=ego_future_mask, + ) + return anns_results + + def get_ann_boxes_only(self, index): + + info = self.data_infos[index] + for i in range(len(info['gt_names'])): + if info['gt_names'][i] in self.NameMapping.keys(): + info['gt_names'][i] = self.NameMapping[info['gt_names'][i]] + gt_bboxes_3d = info['gt_boxes'] + gt_names_3d = info['gt_names'] + gt_inds = info['gt_ids'] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + if not self.with_velocity: + gt_bboxes_3d = gt_bboxes_3d[:,0:7] + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + boxes_annos = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_inds=gt_inds, + ) + return boxes_annos + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. 
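+
+        Example (illustrative sketch only; ``dataset`` is assumed to be an
+        already constructed instance of this class)::
+
+            sample = dataset[0]
+            # test mode returns a single frame from ``prepare_test_data``;
+            # training mode returns the temporal queue assembled by
+            # ``prepare_train_data``, resampling the index if a frame is invalid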
+ """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def generate_sdc_info(self,idx): + + info = self.data_infos[idx] + ego_size = info['ego_size'] + ego_vel = info['ego_vel'] + psudo_sdc_bbox = np.array([0.0, 0.0, 0.0, ego_size[0], ego_size[1], ego_size[2], -np.pi, ego_vel[1], ego_vel[0] ]) + if not self.with_velocity: + psudo_sdc_bbox = psudo_sdc_bbox[0:7] + gt_bboxes_3d = np.array([psudo_sdc_bbox]).astype(np.float32) + gt_names_3d = ['car'] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + gt_labels_3d = DC(to_tensor(gt_labels_3d)) + gt_bboxes_3d = DC(gt_bboxes_3d, cpu_only=True) + + return gt_bboxes_3d, gt_labels_3d + + def get_past_or_future_xy(self,idx,sample_rate,frames,past_or_future,local_xy=False): + + assert past_or_future in ['past','future'] + if past_or_future == 'past': + adj_idx_list = range(idx-sample_rate,idx-(frames+1)*sample_rate,-sample_rate) + else: + adj_idx_list = range(idx+sample_rate,idx+(frames+1)*sample_rate,sample_rate) + + cur_frame = self.data_infos[idx] + box_ids = cur_frame['gt_ids'] + adj_track = np.zeros((len(box_ids),frames,2)) + adj_mask = np.zeros((len(box_ids),frames,2)) + world2lidar_ego_cur = cur_frame['sensors']['LIDAR_TOP']['world2lidar'] + for i in range(len(box_ids)): + box_id = box_ids[i] + cur_box2lidar = world2lidar_ego_cur @ cur_frame['npc2world'][i] + cur_xy = cur_box2lidar[0:2,3] + for j in range(len(adj_idx_list)): + adj_idx = adj_idx_list[j] + if adj_idx <0 or adj_idx>=len(self.data_infos): + break + adj_frame = self.data_infos[adj_idx] + if adj_frame['folder'] != cur_frame ['folder']: + break + if len(np.where(adj_frame['gt_ids']==box_id)[0])==0: + continue + assert len(np.where(adj_frame['gt_ids']==box_id)[0]) == 1 , np.where(adj_frame['gt_ids']==box_id)[0] + adj_idx = np.where(adj_frame['gt_ids']==box_id)[0][0] + adj_box2lidar = world2lidar_ego_cur @ adj_frame['npc2world'][adj_idx] + adj_xy = adj_box2lidar[0:2,3] + if local_xy: + adj_xy -= cur_xy + adj_track[i,j,:] = adj_xy + adj_mask[i,j,:] = 1 + return adj_track, adj_mask + + def get_ego_future_xy(self,idx,sample_rate,frames): + + adj_idx_list = range(idx+sample_rate,idx+(frames+1)*sample_rate,sample_rate) + cur_frame = self.data_infos[idx] + adj_track = np.zeros((1,frames,3)) + adj_mask = np.zeros((1,frames,2)) + world2lidar_ego_cur = cur_frame['sensors']['LIDAR_TOP']['world2lidar'] + for j in range(len(adj_idx_list)): + adj_idx = adj_idx_list[j] + if adj_idx <0 or adj_idx>=len(self.data_infos): + break + adj_frame = self.data_infos[adj_idx] + if adj_frame['folder'] != cur_frame ['folder']: + break + world2lidar_ego_adj = adj_frame['sensors']['LIDAR_TOP']['world2lidar'] + adj2cur_lidar = world2lidar_ego_cur @ np.linalg.inv(world2lidar_ego_adj) + xy = adj2cur_lidar[0:2,3] + yaw = np.arctan2(adj2cur_lidar[1,0],adj2cur_lidar[0,0]) + yaw = -yaw -np.pi + while yaw > np.pi: + yaw -= np.pi*2 + while yaw < -np.pi: + yaw += np.pi*2 + adj_track[0,j,0:2] = xy + adj_track[0,j,2] = yaw + adj_mask[0,j,:] = 1 + + return adj_track, adj_mask + + def 
occ_get_transforms(self, indices, data_type=torch.float32): + + l2e_r_mats = [] + l2e_t_vecs = [] + e2g_r_mats = [] + e2g_t_vecs = [] + + for index in indices: + if index == -1: + l2e_r_mats.append(None) + l2e_t_vecs.append(None) + e2g_r_mats.append(None) + e2g_t_vecs.append(None) + else: + info = self.data_infos[index] + lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego'] + l2e_r = lidar2ego[0:3,0:3] + l2e_t = lidar2ego[0:3,3] + ego2global = np.linalg.inv(info['world2ego']) + e2g_r = ego2global[0:3,0:3] + e2g_t = ego2global[0:3,3] + l2e_r_mats.append(torch.tensor(l2e_r).to(data_type)) + l2e_t_vecs.append(torch.tensor(l2e_t).to(data_type)) + e2g_r_mats.append(torch.tensor(e2g_r).to(data_type)) + e2g_t_vecs.append(torch.tensor(e2g_t).to(data_type)) + res = { + 'occ_l2e_r_mats': l2e_r_mats, + 'occ_l2e_t_vecs': l2e_t_vecs, + 'occ_e2g_r_mats': e2g_r_mats, + 'occ_e2g_t_vecs': e2g_t_vecs, + } + + return res + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + + # NOTE:Curremtly we only support evaluation on detection and planning + + result_files, tmp_dir = self.format_results(results['bbox_results'], jsonfile_prefix) + result_path = result_files + with open(result_path) as f: + result_data = json.load(f) + pred_boxes = EvalBoxes.deserialize(result_data['results'], DetectionBox) + meta = result_data['meta'] + + gt_boxes = self.load_gt() + + metric_data_list = DetectionMetricDataList() + for class_name in self.eval_cfg['class_names']: + for dist_th in self.eval_cfg['dist_ths']: + md = accumulate(gt_boxes, pred_boxes, class_name, center_distance, dist_th) + metric_data_list.set(class_name, dist_th, md) + metrics = DetectionMetrics(self.eval_cfg) + + for class_name in self.eval_cfg['class_names']: + # Compute APs. + for dist_th in self.eval_cfg['dist_ths']: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.eval_cfg['min_recall'], self.eval_cfg['min_precision']) + metrics.add_label_ap(class_name, dist_th, ap) + + # Compute TP metrics. 
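+            # The loop below follows the nuScenes-style protocol: each
+            # true-positive metric (translation / scale / orientation /
+            # velocity error) is evaluated only at the single matching
+            # threshold ``dist_th_tp`` and surfaces later in the summary as
+            # mATE / mASE / mAOE / mAVE.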
+ for metric_name in self.eval_cfg['tp_metrics']: + metric_data = metric_data_list[(class_name, self.eval_cfg['dist_th_tp'])] + tp = calc_tp(metric_data, self.eval_cfg['min_recall'], metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + metrics_summary = metrics.serialize() + metrics_summary['meta'] = meta.copy() + print('mAP: %.4f' % (metrics_summary['mean_ap'])) + err_name_mapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + } + for tp_name, tp_val in metrics_summary['tp_errors'].items(): + print('%s: %.4f' % (err_name_mapping[tp_name], tp_val)) + print('NDS: %.4f' % (metrics_summary['nd_score'])) + #print('Eval time: %.1fs' % metrics_summary['eval_time']) + + # Print per-class metrics. + print() + print('Per-class results:') + print('Object Class\tAP\tATE\tASE\tAOE\tAVE') + class_aps = metrics_summary['mean_dist_aps'] + class_tps = metrics_summary['label_tp_errors'] + for class_name in class_aps.keys(): + print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' + % (class_name, class_aps[class_name], + class_tps[class_name]['trans_err'], + class_tps[class_name]['scale_err'], + class_tps[class_name]['orient_err'], + class_tps[class_name]['vel_err'])) + + detail = dict() + metric_prefix = 'bbox_NuScenes' + for name in self.eval_cfg['class_names']: + for k, v in metrics_summary['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics_summary['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics_summary['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix,self.eval_cfg['err_name_maping'][k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics_summary['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics_summary['mean_ap'] + + if 'planning_results_computed' in results.keys(): + planning_results_computed = results['planning_results_computed'] + planning_tab = PrettyTable() + planning_tab.field_names = [ + "metrics", "0.5s", "1.0s", "1.5s", "2.0s", "2.5s", "3.0s"] + for key in planning_results_computed.keys(): + value = planning_results_computed[key] + row_value = [] + row_value.append(key) + for i in range(len(value)): + row_value.append('%.4f' % float(value[i])) + planning_tab.add_row(row_value) + print(planning_tab) + + + return detail + + def load_gt(self): + all_annotations = EvalBoxes() + for i in range(len(self.data_infos)): + sample_boxes = [] + sample_data = self.data_infos[i] + + gt_boxes = sample_data['gt_boxes'] + + for j in range(gt_boxes.shape[0]): + class_name = self.NameMapping[sample_data['gt_names'][j]] + if not class_name in self.eval_cfg['class_range'].keys(): + continue + range_x, range_y = self.eval_cfg['class_range'][class_name] + if abs(gt_boxes[j,0]) > range_x or abs(gt_boxes[j,1]) > range_y: + continue + sample_boxes.append(DetectionBox( + sample_token=sample_data['folder']+'_'+str(sample_data['frame_idx']), + translation=gt_boxes[j,0:3], + size=gt_boxes[j,3:6], + rotation=list(Quaternion(axis=[0, 0, 1], radians=-gt_boxes[j,6]-np.pi/2)), + velocity=gt_boxes[j,7:9], + num_pts=int(sample_data['num_points'][j]), + detection_name=self.NameMapping[sample_data['gt_names'][j]], + detection_score=-1.0, + attribute_name=self.NameMapping[sample_data['gt_names'][j]] + )) + all_annotations.add_boxes(sample_data['folder']+'_'+str(sample_data['frame_idx']), sample_boxes) + return 
all_annotations + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. + """ + + + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(track_iter_progress(results)): + #pdb.set_trace() + annos = [] + box3d = det['boxes_3d'] + scores = det['scores_3d'] + labels = det['labels_3d'] + box_gravity_center = box3d.gravity_center + box_dims = box3d.dims + box_yaw = box3d.yaw.numpy() + box_yaw = -box_yaw - np.pi / 2 + sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx']) + + + + for i in range(len(box3d)): + #import pdb;pdb.set_trace() + quat = list(Quaternion(axis=[0, 0, 1], radians=box_yaw[i])) + velocity = [box3d.tensor[i, 7].item(),box3d.tensor[i, 8].item()] + name = mapped_class_names[labels[i]] + nusc_anno = dict( + sample_token=sample_token, + translation=box_gravity_center[i].tolist(), + size=box_dims[i].tolist(), + rotation=quat, + velocity=velocity, + detection_name=name, + detection_score=scores[i].item(), + attribute_name=name) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + # assert len(results) == len(self), ( + # 'The length of results is not equal to the dataset len: {} != {}'. 
+ # format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + diff --git a/mmcv/datasets/B2D_vad_dataset.py b/mmcv/datasets/B2D_vad_dataset.py new file mode 100644 index 0000000..f32cf25 --- /dev/null +++ b/mmcv/datasets/B2D_vad_dataset.py @@ -0,0 +1,1037 @@ +import copy +import numpy as np +import os +from os import path as osp +import torch +import random +import json, pickle +import tempfile +import cv2 +import pyquaternion +from pyquaternion import Quaternion +import mmcv +from mmcv.datasets import DATASETS +from mmcv.utils import save_tensor +from mmcv.parallel import DataContainer as DC +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from mmcv.fileio.io import load, dump +from mmcv.utils import track_iter_progress, mkdir_or_exist +from mmcv.datasets.pipelines import to_tensor +from .custom_3d import Custom3DDataset +from .pipelines import Compose +from mmcv.datasets.map_utils.struct import LiDARInstanceLines +from shapely.geometry import LineString +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .vad_custom_nuscenes_eval import NuScenesEval_custom +from nuscenes.eval.common.utils import center_distance +import random +from nuscenes.utils.data_classes import Box as NuScenesBox +from mmcv.core.bbox.structures.nuscenes_box import CustomNuscenesBox +from shapely import affinity, ops +from shapely.geometry import LineString, box, MultiPolygon, MultiLineString +from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer +from nuscenes.eval.detection.constants import DETECTION_NAMES +from mmcv.datasets.map_utils.mean_ap import eval_map +from mmcv.datasets.map_utils.mean_ap import format_res_gt_by_classes +from .nuscenes_styled_eval_utils import DetectionMetrics, EvalBoxes, DetectionBox,center_distance,accumulate,DetectionMetricDataList,calc_ap, calc_tp, quaternion_yaw + +@DATASETS.register_module() +class B2D_VAD_Dataset(Custom3DDataset): + + + def __init__(self, queue_length=4, bev_size=(200, 200),overlap_test=False,with_velocity=True,sample_interval=5,name_mapping= None,eval_cfg = None, map_root =None,map_file=None,past_frames=2, future_frames=6,point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] ,polyline_points_num=20,*args, **kwargs): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.bev_size = bev_size + self.overlap_test = overlap_test + self.with_velocity = with_velocity + self.NameMapping = name_mapping + self.eval_cfg = eval_cfg + self.sample_interval = sample_interval + self.past_frames = past_frames + self.future_frames = future_frames + self.map_root = map_root + self.map_file = map_file + self.point_cloud_range = np.array(point_cloud_range) + self.polyline_points_num = polyline_points_num + self.map_element_class = {'Broken':0, 'Solid':1, 'SolidSolid':2,'Center':3,'TrafficLight':4,'StopSign':5} + self.MAPCLASSES = list(self.map_element_class.keys()) + self.NUM_MAPCLASSES 
= len(self.MAPCLASSES) + self.map_eval_use_same_gt_sample_num_flag = True + self.map_ann_file = 'data/infos' + self.eval_cfg = eval_cfg + with open(self.map_file,'rb') as f: + self.map_infos = pickle.load(f) + + def invert_pose(self, pose): + inv_pose = np.eye(4) + inv_pose[:3, :3] = np.transpose(pose[:3, :3]) + inv_pose[:3, -1] = - inv_pose[:3, :3] @ pose[:3, -1] + return inv_pose + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + queue = [] + index_list = list(range(index-self.queue_length*self.sample_interval, index,self.sample_interval)) + random.shuffle(index_list) + index_list = sorted(index_list[1:]) + index_list.append(index) + for i in index_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + gt_labels,gt_bboxes = self.get_map_info(index) + example['map_gt_labels_3d'] = DC(gt_labels, cpu_only=False) + example['map_gt_bboxes_3d'] = DC(gt_bboxes, cpu_only=True) + + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + queue.append(example) + return self.union2one(queue) + + + def union2one(self, queue): + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_scene_token = None + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if metas_map[i]['folder'] != prev_scene_token: + metas_map[i]['prev_bev_exists'] = False + prev_scene_token = metas_map[i]['folder'] + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev_exists'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
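+
+        Example (sketch only; ``dataset`` is assumed to be a constructed
+        ``B2D_VAD_Dataset``)::
+
+            info = dataset.get_data_info(0)
+            img_paths = info['img_filename']   # one path per CAM_* sensor
+            lidar2img = info['lidar2img'][0]   # 4x4 lidar -> image projection
+            cmd = info['ego_fut_cmd']          # one-hot navigation command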
+ """ + info = self.data_infos[index] + + for i in range(len(info['gt_names'])): + if info['gt_names'][i] in self.NameMapping.keys(): + info['gt_names'][i] = self.NameMapping[info['gt_names'][i]] + + input_dict = dict( + folder=info['folder'], + scene_token=info['folder'], + frame_idx=info['frame_idx'], + ego_yaw=np.nan_to_num(info['ego_yaw'],nan=np.pi/2), + ego_translation=info['ego_translation'], + sensors=info['sensors'], + world2lidar=info['sensors']['LIDAR_TOP']['world2lidar'], + gt_ids=info['gt_ids'], + gt_boxes=info['gt_boxes'], + gt_names=info['gt_names'], + ego_vel = info['ego_vel'], + ego_accel = info['ego_accel'], + ego_rotation_rate = info['ego_rotation_rate'], + npc2world = info['npc2world'], + timestamp=info['frame_idx']/10 + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + lidar2ego = info['sensors']['LIDAR_TOP']['lidar2ego'] + lidar2global = self.invert_pose(info['sensors']['LIDAR_TOP']['world2lidar']) + for sensor_type, cam_info in info['sensors'].items(): + if not 'CAM' in sensor_type: + continue + image_paths.append(osp.join(self.data_root,cam_info['data_path'])) + # obtain lidar to image transformation matrix + cam2ego = cam_info['cam2ego'] + intrinsic = cam_info['intrinsic'] + intrinsic_pad = np.eye(4) + intrinsic_pad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2cam = self.invert_pose(cam2ego) @ lidar2ego + lidar2img = intrinsic_pad @ lidar2cam + lidar2img_rts.append(lidar2img) + cam_intrinsics.append(intrinsic_pad) + lidar2cam_rts.append(lidar2cam) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + l2g_r_mat=lidar2global[0:3,0:3], + l2g_t=lidar2global[0:3,3] + + )) + + #if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + yaw = input_dict['ego_yaw'] + rotation = list(Quaternion(axis=[0, 0, 1], radians=yaw)) + + if yaw < 0: + yaw += 2*np.pi + yaw_in_degree = yaw / np.pi * 180 + + can_bus = np.zeros(18) + can_bus[:3] = input_dict['ego_translation'] + can_bus[3:7] = rotation + can_bus[7:10] = input_dict['ego_vel'] + can_bus[10:13] = input_dict['ego_accel'] + can_bus[13:16] = input_dict['ego_rotation_rate'] + can_bus[16] = yaw + can_bus[17] = yaw_in_degree + input_dict['can_bus'] = can_bus + ego_lcf_feat = np.zeros(9) + ego_lcf_feat[0:2] = input_dict['ego_translation'][0:2] + ego_lcf_feat[2:4] = input_dict['ego_accel'][2:4] + ego_lcf_feat[4] = input_dict['ego_rotation_rate'][-1] + ego_lcf_feat[5] = info['ego_size'][1] + ego_lcf_feat[6] = info['ego_size'][0] + ego_lcf_feat[7] = np.sqrt(input_dict['ego_translation'][0]**2+input_dict['ego_translation'][1]**2) + ego_lcf_feat[8] = info['steer'] + ego_his_trajs, ego_fut_trajs, ego_fut_masks, command = self.get_ego_trajs(index,self.sample_interval,self.past_frames,self.future_frames) + input_dict['ego_his_trajs'] = ego_his_trajs + input_dict['ego_fut_trajs'] = ego_fut_trajs + input_dict['ego_fut_masks'] = ego_fut_masks + input_dict['ego_fut_cmd'] = command + input_dict['ego_lcf_feat'] = ego_lcf_feat + input_dict['fut_valid_flag'] = (ego_fut_masks==1).all() + + return input_dict + + + def get_map_info(self, index): + + gt_masks = [] + gt_labels = [] + gt_bboxes = [] + + ann_info = self.data_infos[index] + town_name = ann_info['town_name'] + map_info = self.map_infos[town_name] + lane_points = map_info['lane_points'] + lane_sample_points = map_info['lane_sample_points'] + lane_types = map_info['lane_types'] 
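+        # Assumed layout of the per-town map pickle (inferred from the keys read
+        # here rather than from a documented schema): ``lane_points[i]`` holds the
+        # full polyline of lane ``i`` in world coordinates,
+        # ``lane_sample_points[i]`` a coarser subsample used only for the distance
+        # pre-filter below, and ``lane_types[i]`` a string such as
+        # 'Broken'/'Solid'/'Center' that keys into ``self.map_element_class``.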
+ trigger_volumes_points = map_info['trigger_volumes_points'] + trigger_volumes_sample_points = map_info['trigger_volumes_sample_points'] + trigger_volumes_types = map_info['trigger_volumes_types'] + world2lidar = np.array(ann_info['sensors']['LIDAR_TOP']['world2lidar']) + ego_xy = np.linalg.inv(world2lidar)[0:2,3] + max_distance = 50 + chosed_idx = [] + + for idx in range(len(lane_sample_points)): + single_sample_points = lane_sample_points[idx] + distance = np.linalg.norm((single_sample_points[:,0:2]-ego_xy),axis=-1) + if np.min(distance) < max_distance: + chosed_idx.append(idx) + + polylines = [] + for idx in chosed_idx: + if not lane_types[idx] in self.map_element_class.keys(): + continue + points = lane_points[idx] + points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1) + points_in_lidar = (world2lidar @ points.T).T + mask = (points_in_lidar[:,0]>self.point_cloud_range[0]) & (points_in_lidar[:,0]self.point_cloud_range[1]) & (points_in_lidar[:,1] 1: + polylines.append(LineString(points_in_lidar_range)) + gt_label = self.map_element_class[lane_types[idx]] + gt_labels.append(gt_label) + + for idx in range(len(trigger_volumes_points)): + if not trigger_volumes_types[idx] in self.map_element_class.keys(): + continue + points = trigger_volumes_points[idx] + points = np.concatenate([points,np.ones((points.shape[0],1))],axis=-1) + points_in_lidar = (world2lidar @ points.T).T + mask = (points_in_lidar[:,0]>self.point_cloud_range[0]) & (points_in_lidar[:,0]self.point_cloud_range[1]) & (points_in_lidar[:,1]=len(self.data_infos): + break + adj_frame = self.data_infos[adj_idx] + if adj_frame['folder'] != cur_frame ['folder']: + break + world2lidar_ego_adj = adj_frame['sensors']['LIDAR_TOP']['world2lidar'] + adj2cur_lidar = world2lidar_lidar_cur @ np.linalg.inv(world2lidar_ego_adj) + xy = adj2cur_lidar[0:2,3] + full_adj_track[j,0:2] = xy + full_adj_adj_mask[j] = 1 + offset_track = full_adj_track[1:] - full_adj_track[:-1] + for j in range(past_frames-1,-1,-1): + if full_adj_adj_mask[j] == 0: + offset_track[j] = offset_track[j+1] + for j in range(past_frames,past_frames+future_frames,1): + + if full_adj_adj_mask[j+1] == 0 : + offset_track[j] = 0 + command = self.command2hot(cur_frame['command_near']) + return offset_track[:past_frames].copy(), offset_track[past_frames:].copy(), full_adj_adj_mask[-future_frames:].copy(), command + + def command2hot(self,command,max_dim=6): + if command < 0: + command = 4 + command -= 1 + cmd_one_hot = np.zeros(max_dim) + cmd_one_hot[command] = 1 + return cmd_one_hot + + def get_box_attr_labels(self,idx,sample_rate,frames): + + + adj_idx_list = range(idx,idx+(frames+1)*sample_rate,sample_rate) + cur_frame = self.data_infos[idx] + cur_box_names = cur_frame['gt_names'] + for i in range(len(cur_box_names)): + if cur_box_names[i] in self.NameMapping.keys(): + cur_box_names[i] = self.NameMapping[cur_box_names[i]] + cur_boxes = cur_frame['gt_boxes'].copy() + box_ids = cur_frame['gt_ids'] + future_track = np.zeros((len(box_ids),frames+1,2)) + future_mask = np.zeros((len(box_ids),frames+1)) + future_yaw = np.zeros((len(box_ids),frames+1)) + gt_fut_goal = np.zeros((len(box_ids),1)) + agent_lcf_feat = np.zeros((len(box_ids),9)) + world2lidar_lidar_cur = cur_frame['sensors']['LIDAR_TOP']['world2lidar'] + for i in range(len(box_ids)): + agent_lcf_feat[i,0:2] = cur_boxes[i,0:2] + agent_lcf_feat[i,2] = cur_boxes[i,6] + agent_lcf_feat[i,3:5] = cur_boxes[i,7:] + agent_lcf_feat[i,5:8] = cur_boxes[i,3:6] + cur_box_name = cur_box_names[i] + if cur_box_name in 
self.CLASSES: + agent_lcf_feat[i, 8] = self.CLASSES.index(cur_box_name) + else: + agent_lcf_feat[i, 8] = -1 + + box_id = box_ids[i] + cur_box2lidar = world2lidar_lidar_cur @ cur_frame['npc2world'][i] + cur_xy = cur_box2lidar[0:2,3] + for j in range(len(adj_idx_list)): + adj_idx = adj_idx_list[j] + if adj_idx <0 or adj_idx>=len(self.data_infos): + break + adj_frame = self.data_infos[adj_idx] + if adj_frame['folder'] != cur_frame ['folder']: + break + if len(np.where(adj_frame['gt_ids']==box_id)[0])==0: + continue + assert len(np.where(adj_frame['gt_ids']==box_id)[0]) == 1 , np.where(adj_frame['gt_ids']==box_id)[0] + adj_idx = np.where(adj_frame['gt_ids']==box_id)[0][0] + adj_box2lidar = world2lidar_lidar_cur @ adj_frame['npc2world'][adj_idx] + adj_xy = adj_box2lidar[0:2,3] + future_track[i,j,:] = adj_xy + future_mask[i,j] = 1 + future_yaw[i,j] = np.arctan2(adj_box2lidar[1,0],adj_box2lidar[0,0]) + + coord_diff = future_track[i,-1] - future_track[i,0] + if coord_diff.max() < 1.0: # static + gt_fut_goal[i] = 9 + else: + box_mot_yaw = np.arctan2(coord_diff[1], coord_diff[0]) + np.pi + gt_fut_goal[i] = box_mot_yaw // (np.pi / 4) # 0-8: goal direction class + + future_track_offset = future_track[:,1:,:] - future_track[:,:-1,:] + future_mask_offset = future_mask[:,1:] + future_track_offset[future_mask_offset==0] = 0 + future_yaw_offset = future_yaw[:,1:] - future_yaw[:,:-1] + mask1 = np.where(future_yaw_offset>np.pi) + mask2 = np.where(future_yaw_offset<-np.pi) + future_yaw_offset[mask1] -=np.pi*2 + future_yaw_offset[mask2] +=np.pi*2 + attr_labels = np.concatenate([future_track_offset.reshape(-1,frames*2), future_mask_offset, gt_fut_goal, agent_lcf_feat, future_yaw_offset],axis=-1).astype(np.float32) + return attr_labels.copy() + + + + def load_gt(self): + all_annotations = EvalBoxes() + for i in range(len(self.data_infos)): + sample_boxes = [] + sample_data = self.data_infos[i] + gt_boxes = sample_data['gt_boxes'] + for j in range(gt_boxes.shape[0]): + class_name = self.NameMapping[sample_data['gt_names'][j]] + if not class_name in self.eval_cfg['class_range'].keys(): + continue + range_x, range_y = self.eval_cfg['class_range'][class_name] + if abs(gt_boxes[j,0]) > range_x or abs(gt_boxes[j,1]) > range_y: + continue + sample_boxes.append(DetectionBox( + sample_token=sample_data['folder']+'_'+str(sample_data['frame_idx']), + translation=gt_boxes[j,0:3], + size=gt_boxes[j,3:6], + rotation=list(Quaternion(axis=[0, 0, 1], radians=-gt_boxes[j,6]-np.pi/2)), + velocity=gt_boxes[j,7:9], + num_pts=int(sample_data['num_points'][j]), + detection_name=class_name, + detection_score=-1.0, + attribute_name=class_name + )) + all_annotations.add_boxes(sample_data['folder']+'_'+str(sample_data['frame_idx']), sample_boxes) + return all_annotations + + + + def _format_gt(self): + gt_annos = [] + print('Start to convert gt map format...') + # assert self.map_ann_file is not None + if (not os.path.exists(self.map_ann_file)) : + dataset_length = len(self) + prog_bar = mmcv.ProgressBar(dataset_length) + mapped_class_names = self.MAPCLASSES + for sample_id in range(dataset_length): + sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx']) + gt_anno = {} + gt_anno['sample_token'] = sample_token + # gt_sample_annos = [] + gt_sample_dict = {} + gt_labels , gt_bboxes = self.get_map_info(sample_id) + gt_vecs = gt_bboxes.instance_list + gt_vec_list = [] + for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)): + name = mapped_class_names[gt_label] + anno = dict( + 
pts=np.array(list(gt_vec.coords)), + pts_num=len(list(gt_vec.coords)), + cls_name=name, + type=gt_label, + ) + gt_vec_list.append(anno) + gt_anno['vectors']=gt_vec_list + gt_annos.append(gt_anno) + + prog_bar.update() + nusc_submissions = { + 'GTs': gt_annos + } + print('\n GT anns writes to', self.map_ann_file) + dump(nusc_submissions, self.map_ann_file) + else: + print(f'{self.map_ann_file} exist, not update') + + + def _format_bbox(self, results, jsonfile_prefix=None, score_thresh=0.2): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. + """ + + nusc_annos = {} + det_mapped_class_names = self.CLASSES + # assert self.map_ann_file is not None + map_pred_annos = {} + map_mapped_class_names = self.MAPCLASSES + plan_annos = {} + print('Start to convert detection format...') + for sample_id, det in enumerate(track_iter_progress(results)): + #pdb.set_trace() + annos = [] + box3d = det['boxes_3d'] + scores = det['scores_3d'] + labels = det['labels_3d'] + box_gravity_center = box3d.gravity_center + box_dims = box3d.dims + box_yaw = box3d.yaw.numpy() + box_yaw = -box_yaw - np.pi / 2 + sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx']) + for i in range(len(box3d)): + #import pdb;pdb.set_trace() + if scores[i] < score_thresh: + continue + quat = list(Quaternion(axis=[0, 0, 1], radians=box_yaw[i])) + velocity = [box3d.tensor[i, 7].item(),box3d.tensor[i, 8].item()] + name = det_mapped_class_names[labels[i]] + nusc_anno = dict( + sample_token=sample_token, + translation=box_gravity_center[i].tolist(), + size=box_dims[i].tolist(), + rotation=quat, + velocity=velocity, + detection_name=name, + detection_score=scores[i].item(), + attribute_name=name) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + map_pred_anno = {} + vecs = output_to_vecs(det) + sample_token = self.data_infos[sample_id]['folder'] + '_' + str(self.data_infos[sample_id]['frame_idx']) + map_pred_anno['sample_token'] = sample_token + pred_vec_list=[] + for i, vec in enumerate(vecs): + name = map_mapped_class_names[vec['label']] + anno = dict( + # sample_token=sample_token, + pts=vec['pts'], + pts_num=len(vec['pts']), + cls_name=name, + type=vec['label'], + confidence_level=vec['score']) + pred_vec_list.append(anno) + # annos.append(nusc_anno) + # nusc_annos[sample_token] = annos + map_pred_anno['vectors'] = pred_vec_list + map_pred_annos[sample_token] = map_pred_anno + + # NOTE: Eval on map is VERY SLOW for the first time(about 3 hours) because load map ground trurh is slow. + # So we do not eval map by default. 
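+            # To re-enable map evaluation, uncomment the block below (and the
+            # corresponding chamfer/IoU section in ``_evaluate_single``); the
+            # ground-truth vectors are cached at ``self.map_ann_file`` after the
+            # first call to ``_format_gt``.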
+ # if not os.path.exists(self.map_ann_file): + # self._format_gt() + # else: + # print(f'{self.map_ann_file} exist, not update') + # with open(self.map_ann_file,'r') as f: + # GT_anns = json.load(f) + # gt_annos = GT_anns['GTs'] + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + 'map_results': map_pred_annos, + 'plan_results': plan_annos + # 'GTs': gt_annos + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. + """ + if isinstance(results, dict): + # print(f'results must be a list, but get dict, keys={results.keys()}') + # assert isinstance(results, list) + results = results['bbox_results'] + assert isinstance(results, list) + # assert len(results) == len(self), ( + # 'The length of results is not equal to the dataset len: {} != {}'. + # format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + if name == 'metric_results': + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + map_metric='chamfer', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. 
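+
+        Example of the returned keys (illustrative; the exact class names and
+        distance thresholds are taken from ``self.eval_cfg``)::
+
+            {'bbox_NuScenes/car_AP_dist_1.0': ...,
+             'bbox_NuScenes/car_trans_err': ...,
+             'bbox_NuScenes/mAP': ...,
+             'bbox_NuScenes/NDS': ...}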
+ """ + detail = dict() + with open(result_path,'r') as f: + result_data = json.load(f) + pred_boxes = EvalBoxes.deserialize(result_data['results'], DetectionBox) + meta = result_data['meta'] + + + + gt_boxes = self.load_gt() + + metric_data_list = DetectionMetricDataList() + for class_name in self.eval_cfg['class_names']: + for dist_th in self.eval_cfg['dist_ths']: + md = accumulate(gt_boxes, pred_boxes, class_name, center_distance, dist_th) + metric_data_list.set(class_name, dist_th, md) + metrics = DetectionMetrics(self.eval_cfg) + + for class_name in self.eval_cfg['class_names']: + # Compute APs. + for dist_th in self.eval_cfg['dist_ths']: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.eval_cfg['min_recall'], self.eval_cfg['min_precision']) + metrics.add_label_ap(class_name, dist_th, ap) + + # Compute TP metrics. + for metric_name in self.eval_cfg['tp_metrics']: + metric_data = metric_data_list[(class_name, self.eval_cfg['dist_th_tp'])] + tp = calc_tp(metric_data, self.eval_cfg['min_recall'], metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + metrics_summary = metrics.serialize() + metrics_summary['meta'] = meta.copy() + print('mAP: %.4f' % (metrics_summary['mean_ap'])) + err_name_mapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + } + for tp_name, tp_val in metrics_summary['tp_errors'].items(): + print('%s: %.4f' % (err_name_mapping[tp_name], tp_val)) + print('NDS: %.4f' % (metrics_summary['nd_score'])) + #print('Eval time: %.1fs' % metrics_summary['eval_time']) + + # Print per-class metrics. + print() + print('Per-class results:') + print('Object Class\tAP\tATE\tASE\tAOE\tAVE') + class_aps = metrics_summary['mean_dist_aps'] + class_tps = metrics_summary['label_tp_errors'] + for class_name in class_aps.keys(): + print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' + % (class_name, class_aps[class_name], + class_tps[class_name]['trans_err'], + class_tps[class_name]['scale_err'], + class_tps[class_name]['orient_err'], + class_tps[class_name]['vel_err'])) + + detail = dict() + metric_prefix = 'bbox_NuScenes' + for name in self.eval_cfg['class_names']: + for k, v in metrics_summary['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics_summary['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics_summary['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix,self.eval_cfg['err_name_maping'][k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics_summary['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics_summary['mean_ap'] + + + # from mmcv.datasets.map_utils.mean_ap import eval_map + # from mmcv.datasets.map_utils.mean_ap import format_res_gt_by_classes + # result_path = osp.abspath(result_path) + + # print('Formating results & gts by classes') + # pred_results = load(result_path) + # map_results = pred_results['map_results'] + # gt_anns = load(self.map_ann_file) + # map_annotations = gt_anns['GTs'] + # cls_gens, cls_gts = format_res_gt_by_classes(result_path, + # map_results, + # map_annotations, + # cls_names=self.MAPCLASSES, + # num_pred_pts_per_instance=self.polyline_points_num, + # eval_use_same_gt_sample_num_flag=self.map_eval_use_same_gt_sample_num_flag, + # pc_range=self.point_cloud_range) + # map_metrics = map_metric if 
isinstance(map_metric, list) else [map_metric] + # allowed_metrics = ['chamfer', 'iou'] + # for metric in map_metrics: + # if metric not in allowed_metrics: + # raise KeyError(f'metric {metric} is not supported') + # for metric in map_metrics: + # print('-*'*10+f'use metric:{metric}'+'-*'*10) + # if metric == 'chamfer': + # thresholds = [0.5,1.0,1.5] + # elif metric == 'iou': + # thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + # cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES)) + # for i, thr in enumerate(thresholds): + # print('-*'*10+f'threshhold:{thr}'+'-*'*10) + # mAP, cls_ap = eval_map( + # map_results, + # map_annotations, + # cls_gens, + # cls_gts, + # threshold=thr, + # cls_names=self.MAPCLASSES, + # logger=logger, + # num_pred_pts_per_instance=self.polyline_points_num, + # pc_range=self.point_cloud_range, + # metric=metric) + # for j in range(self.NUM_MAPCLASSES): + # cls_aps[i, j] = cls_ap[j]['ap'] + # for i, name in enumerate(self.MAPCLASSES): + # print('{}: {}'.format(name, cls_aps.mean(0)[i])) + # detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] + # print('map: {}'.format(cls_aps.mean(0).mean())) + # detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() + # for i, name in enumerate(self.MAPCLASSES): + # for j, thr in enumerate(thresholds): + # if metric == 'chamfer': + # detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + # elif metric == 'iou': + # if thr == 0.5 or thr == 0.75: + # detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + + return detail + + def evaluate(self, + results, + metric='bbox', + map_metric='chamfer', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. 
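+
+        Example call (sketch; ``results`` is the list produced by the test
+        script, each entry carrying a ``'metric_results'`` dict, and the
+        ``jsonfile_prefix`` path is only a placeholder)::
+
+            metrics = dataset.evaluate(results, jsonfile_prefix='work_dirs/eval')
+            # prints the motion-prediction (EPA/ADE/FDE/MR) and planning tables,
+            # then returns the nuScenes-style detection metrics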
+ """ + result_metric_names = ['EPA', 'ADE', 'FDE', 'MR'] + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + all_metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + all_metric_dict[met+'_'+cls] = 0.0 + result_dict = {} + for met in result_metric_names: + for cls in motion_cls_names: + result_dict[met+'_'+cls] = 0.0 + + alpha = 0.5 + + for i in range(len(results)): + for key in all_metric_dict.keys(): + all_metric_dict[key] += results[i]['metric_results'][key] + + for cls in motion_cls_names: + result_dict['EPA_'+cls] = (all_metric_dict['hit_'+cls] - \ + alpha * all_metric_dict['fp_'+cls]) / all_metric_dict['gt_'+cls] + result_dict['ADE_'+cls] = all_metric_dict['ADE_'+cls] / all_metric_dict['cnt_ade_'+cls] + result_dict['FDE_'+cls] = all_metric_dict['FDE_'+cls] / all_metric_dict['cnt_fde_'+cls] + result_dict['MR_'+cls] = all_metric_dict['MR_'+cls] / all_metric_dict['cnt_fde_'+cls] + + print('\n') + print('-------------- Motion Prediction --------------') + for k, v in result_dict.items(): + print(f'{k}: {v}') + + # NOTE: print planning metric + print('\n') + print('-------------- Planning --------------') + metric_dict = None + num_valid = 0 + for res in results: + if res['metric_results']['fut_valid_flag']: + num_valid += 1 + else: + continue + if metric_dict is None: + metric_dict = copy.deepcopy(res['metric_results']) + else: + for k in res['metric_results'].keys(): + metric_dict[k] += res['metric_results'][k] + + for k in metric_dict: + metric_dict[k] = metric_dict[k] / num_valid + print("{}:{}".format(k, metric_dict[k])) + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name], metric=metric, map_metric=map_metric) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files, metric=metric, map_metric=map_metric) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
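+
+    Example (sketch; ``det`` is a single network output that also carries
+    ``'trajs_3d'``, and the attribute names are assumed from the
+    ``CustomNuscenesBox`` constructor arguments)::
+
+        boxes = output_to_nusc_box(det)
+        first = boxes[0]
+        first.center, first.score, first.fut_trajs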
+ """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + trajs = detection['trajs_3d'].numpy() + + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = CustomNuscenesBox( + center=box_gravity_center[i], + size=box_dims[i], + orientation=quat, + fut_trajs=trajs[i], + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. + cls_range_x_map = eval_configs.class_range_x + cls_range_y_map = eval_configs.class_range_y + x_distance, y_distance = box.center[0], box.center[1] + det_range_x = cls_range_x_map[classes[box.label]] + det_range_y = cls_range_y_map[classes[box.label]] + if abs(x_distance) > det_range_x or abs(y_distance) > det_range_y: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list + +def output_to_vecs(detection): + box3d = detection['map_boxes_3d'].numpy() + scores = detection['map_scores_3d'].numpy() + labels = detection['map_labels_3d'].numpy() + pts = detection['map_pts_3d'].numpy() + + vec_list = [] + # import pdb;pdb.set_trace() + for i in range(box3d.shape[0]): + vec = dict( + bbox = box3d[i], # xyxy + label=labels[i], + score=scores[i], + pts=pts[i], + ) + vec_list.append(vec) + return vec_list \ No newline at end of file diff --git a/mmcv/datasets/__init__.py b/mmcv/datasets/__init__.py new file mode 100644 index 0000000..a0093d3 --- /dev/null +++ b/mmcv/datasets/__init__.py @@ -0,0 +1,15 @@ +from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset +from .custom_3d import Custom3DDataset +from .custom import CustomDataset +from .nuscenes_dataset import NuScenesDataset +from .nuscenes_e2e_dataset import NuScenesE2EDataset +from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler +from .utils import replace_ImageToTensor +from .custom_nuscenes_dataset_v2 import CustomNuScenesDatasetV2 +from .custom_nuscenes_dataset import CustomNuScenesDataset +from .dd3d_nuscenes_dataset import DD3DNuscenesDataset +from .lyft_dataset import LyftDataset +from .B2D_dataset 
import B2D_Dataset +from .B2D_e2e_dataset import B2D_E2E_Dataset +from .nuscenes_vad_dataset import VADCustomNuScenesDataset +from .B2D_vad_dataset import B2D_VAD_Dataset \ No newline at end of file diff --git a/mmcv/datasets/api_wrappers/__init__.py b/mmcv/datasets/api_wrappers/__init__.py new file mode 100644 index 0000000..05f95c9 --- /dev/null +++ b/mmcv/datasets/api_wrappers/__init__.py @@ -0,0 +1,3 @@ +from .coco_api import COCO, COCOeval + +__all__ = ['COCO', 'COCOeval'] diff --git a/mmcv/datasets/api_wrappers/coco_api.py b/mmcv/datasets/api_wrappers/coco_api.py new file mode 100644 index 0000000..57077f9 --- /dev/null +++ b/mmcv/datasets/api_wrappers/coco_api.py @@ -0,0 +1,46 @@ +# This file add snake case alias for coco api + +import warnings + +import pycocotools +from pycocotools.coco import COCO as _COCO +from pycocotools.cocoeval import COCOeval as _COCOeval + + +class COCO(_COCO): + """This class is almost the same as official pycocotools package. + + It implements some snake case function aliases. So that the COCO class has + the same interface as LVIS class. + """ + + def __init__(self, annotation_file=None): + if getattr(pycocotools, '__version__', '0') >= '12.0.2': + warnings.warn( + 'mmpycocotools is deprecated. Please install official pycocotools by "pip install pycocotools"', # noqa: E501 + UserWarning) + super().__init__(annotation_file=annotation_file) + self.img_ann_map = self.imgToAnns + self.cat_img_map = self.catToImgs + + def get_ann_ids(self, img_ids=[], cat_ids=[], area_rng=[], iscrowd=None): + return self.getAnnIds(img_ids, cat_ids, area_rng, iscrowd) + + def get_cat_ids(self, cat_names=[], sup_names=[], cat_ids=[]): + return self.getCatIds(cat_names, sup_names, cat_ids) + + def get_img_ids(self, img_ids=[], cat_ids=[]): + return self.getImgIds(img_ids, cat_ids) + + def load_anns(self, ids): + return self.loadAnns(ids) + + def load_cats(self, ids): + return self.loadCats(ids) + + def load_imgs(self, ids): + return self.loadImgs(ids) + + +# just for the ease of import +COCOeval = _COCOeval diff --git a/mmcv/datasets/builder.py b/mmcv/datasets/builder.py new file mode 100644 index 0000000..7f527d6 --- /dev/null +++ b/mmcv/datasets/builder.py @@ -0,0 +1,204 @@ +import copy +import platform +import random +from functools import partial + +import numpy as np +from mmcv.parallel import collate +from mmcv.utils import Registry, build_from_cfg, get_dist_info +from torch.utils.data import DataLoader + +# DATASETS = Registry('dataset') +# PIPELINES = Registry('pipeline') +# OBJECTSAMPLERS = Registry('Object sampler') + +from .samplers import DistributedGroupSampler, DistributedSampler, GroupSampler +# from .dataset_wrappers import CBGSDataset, ClassBalancedDataset, ConcatDataset, RepeatDataset +from .samplers.sampler import build_sampler + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + hard_limit = rlimit[1] + soft_limit = min(4096, hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') +OBJECTSAMPLERS = Registry('Object sampler') + + + +def _concat_dataset(cfg, default_args=None): + from .dataset_wrappers import ConcatDataset + ann_files = cfg['ann_file'] + img_prefixes = cfg.get('img_prefix', None) + seg_prefixes = cfg.get('seg_prefix', None) + proposal_files = cfg.get('proposal_file', None) + separate_eval = cfg.get('separate_eval', True) + + datasets = [] + 
num_dset = len(ann_files) + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + # pop 'separate_eval' since it is not a valid key for common datasets. + if 'separate_eval' in data_cfg: + data_cfg.pop('separate_eval') + data_cfg['ann_file'] = ann_files[i] + if isinstance(img_prefixes, (list, tuple)): + data_cfg['img_prefix'] = img_prefixes[i] + if isinstance(seg_prefixes, (list, tuple)): + data_cfg['seg_prefix'] = seg_prefixes[i] + if isinstance(proposal_files, (list, tuple)): + data_cfg['proposal_file'] = proposal_files[i] + datasets.append(build_dataset(data_cfg, default_args)) + + return ConcatDataset(datasets, separate_eval) + + + + +def build_dataset(cfg, default_args=None): + from mmcv.datasets.dataset_wrappers import CBGSDataset + from mmcv.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, RepeatDataset) + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in cfg['datasets']], + cfg.get('separate_eval', True)) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'ClassBalancedDataset': + dataset = ClassBalancedDataset( + build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'CBGSDataset': + dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args)) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + shuffler_sampler=None, + nonshuffler_sampler=None, + **kwargs): + """Build PyTorch DataLoader. + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + kwargs: any keyword argument to be used to initialize DataLoader + Returns: + DataLoader: A PyTorch dataloader. 
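+
+    Example (illustrative; ``train_set`` is assumed to have been built
+    beforehand with ``build_dataset``)::
+
+        loader = build_dataloader(train_set, samples_per_gpu=1,
+                                  workers_per_gpu=4, dist=True, shuffle=True,
+                                  seed=0)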
+ """ + rank, world_size = get_dist_info() + if dist: + # DistributedGroupSampler will definitely shuffle the data to satisfy + # that images on each GPU are in the same group + if shuffle: + sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), + dict( + dataset=dataset, + samples_per_gpu=samples_per_gpu, + num_replicas=world_size, + rank=rank, + seed=seed) + ) + + else: + sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), + dict( + dataset=dataset, + num_replicas=world_size, + rank=rank, + shuffle=shuffle, + seed=seed) + ) + + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + # assert False, 'not support in bevformer' + print('WARNING!!!!, Only can be used for obtain inference speed!!!!') + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=False, + worker_init_fn=init_fn, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + + +if platform.system() != 'Windows': + # https://github.com/pytorch/pytorch/issues/973 + import resource + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +def custom_build_dataset(cfg, default_args=None): + from mmdet3d.datasets.dataset_wrappers import CBGSDataset + from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, + ConcatDataset, RepeatDataset) + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'ConcatDataset': + dataset = ConcatDataset( + [custom_build_dataset(c, default_args) for c in cfg['datasets']], + cfg.get('separate_eval', True)) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['times']) + elif cfg['type'] == 'ClassBalancedDataset': + dataset = ClassBalancedDataset( + custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) + elif cfg['type'] == 'CBGSDataset': + dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args)) + elif isinstance(cfg.get('ann_file'), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset \ No newline at end of file diff --git a/mmcv/datasets/coco.py b/mmcv/datasets/coco.py new file mode 100644 index 0000000..6d2f0c1 --- /dev/null +++ b/mmcv/datasets/coco.py @@ -0,0 +1,558 @@ +import itertools +import logging +import os.path as osp +import tempfile +import warnings +from collections import OrderedDict + +from mmcv.fileio.io import load, dump +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable + +from mmcv.core import 
eval_recalls +from .api_wrappers import COCO, COCOeval +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class CocoDataset(CustomDataset): + + CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', + 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', + 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', + 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', + 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', + 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', + 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', + 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', + 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', + 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', + 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', + 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', + 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') + + def load_annotations(self, ann_file): + """Load annotation from COCO style annotation file. + + Args: + ann_file (str): Path of annotation file. + + Returns: + list[dict]: Annotation info from COCO api. + """ + + self.coco = COCO(ann_file) + # The order of returned `cat_ids` will not + # change with the order of the CLASSES + self.cat_ids = self.coco.get_cat_ids(cat_names=self.CLASSES) + + self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)} + self.img_ids = self.coco.get_img_ids() + data_infos = [] + total_ann_ids = [] + for i in self.img_ids: + info = self.coco.load_imgs([i])[0] + info['filename'] = info['file_name'] + data_infos.append(info) + ann_ids = self.coco.get_ann_ids(img_ids=[i]) + total_ann_ids.extend(ann_ids) + assert len(set(total_ann_ids)) == len( + total_ann_ids), f"Annotation ids in '{ann_file}' are not unique!" + return data_infos + + def get_ann_info(self, idx): + """Get COCO annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + + img_id = self.data_infos[idx]['id'] + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + ann_info = self.coco.load_anns(ann_ids) + return self._parse_ann_info(self.data_infos[idx], ann_info) + + def get_cat_ids(self, idx): + """Get COCO category ids by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. 
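+
+        Example (illustrative; ``dataset`` is a constructed ``CocoDataset``):
+
+        .. code-block:: python
+
+            cat_ids = dataset.get_cat_ids(0)
+            # e.g. [1, 3, 3]: one person and two cars annotated in image 0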
+ """ + + img_id = self.data_infos[idx]['id'] + ann_ids = self.coco.get_ann_ids(img_ids=[img_id]) + ann_info = self.coco.load_anns(ann_ids) + return [ann['category_id'] for ann in ann_info] + + def _filter_imgs(self, min_size=32): + """Filter images too small or without ground truths.""" + valid_inds = [] + # obtain images that contain annotation + ids_with_ann = set(_['image_id'] for _ in self.coco.anns.values()) + # obtain images that contain annotations of the required categories + ids_in_cat = set() + for i, class_id in enumerate(self.cat_ids): + ids_in_cat |= set(self.coco.cat_img_map[class_id]) + # merge the image id sets of the two conditions and use the merged set + # to filter out images if self.filter_empty_gt=True + ids_in_cat &= ids_with_ann + + valid_img_ids = [] + for i, img_info in enumerate(self.data_infos): + img_id = self.img_ids[i] + if self.filter_empty_gt and img_id not in ids_in_cat: + continue + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + valid_img_ids.append(img_id) + self.img_ids = valid_img_ids + return valid_inds + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox and mask annotation. + + Args: + ann_info (list[dict]): Annotation info of an image. + with_mask (bool): Whether to parse mask annotations. + + Returns: + dict: A dict containing the following keys: bboxes, bboxes_ignore,\ + labels, masks, seg_map. "masks" are raw annotations and not \ + decoded into binary masks. + """ + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + gt_masks_ann.append(ann.get('segmentation', None)) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + seg_map = img_info['filename'].replace('jpg', 'png') + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + bboxes_ignore=gt_bboxes_ignore, + masks=gt_masks_ann, + seg_map=seg_map) + + return ann + + def xyxy2xywh(self, bbox): + """Convert ``xyxy`` style bounding boxes to ``xywh`` style for COCO + evaluation. + + Args: + bbox (numpy.ndarray): The bounding boxes, shape (4, ), in + ``xyxy`` order. + + Returns: + list[float]: The converted bounding boxes, in ``xywh`` order. 
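+
+        Example (illustrative):
+
+        .. code-block:: python
+
+            dataset.xyxy2xywh(np.array([10., 20., 30., 60.]))
+            # -> [10.0, 20.0, 20.0, 40.0]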
+ """ + + _bbox = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0], + _bbox[3] - _bbox[1], + ] + + def _proposal2json(self, results): + """Convert proposal results to COCO json style.""" + json_results = [] + for idx in range(len(self)): + img_id = self.img_ids[idx] + bboxes = results[idx] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = 1 + json_results.append(data) + return json_results + + def _det2json(self, results): + """Convert detection results to COCO json style.""" + json_results = [] + for idx in range(len(self)): + img_id = self.img_ids[idx] + result = results[idx] + for label in range(len(result)): + bboxes = result[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = self.cat_ids[label] + json_results.append(data) + return json_results + + def _segm2json(self, results): + """Convert instance segmentation results to COCO json style.""" + bbox_json_results = [] + segm_json_results = [] + for idx in range(len(self)): + img_id = self.img_ids[idx] + det, seg = results[idx] + for label in range(len(det)): + # bbox results + bboxes = det[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = self.cat_ids[label] + bbox_json_results.append(data) + + # segm results + # some detectors use different scores for bbox and mask + if isinstance(seg, tuple): + segms = seg[0][label] + mask_score = seg[1][label] + else: + segms = seg[label] + mask_score = [bbox[4] for bbox in bboxes] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self.xyxy2xywh(bboxes[i]) + data['score'] = float(mask_score[i]) + data['category_id'] = self.cat_ids[label] + if isinstance(segms[i]['counts'], bytes): + segms[i]['counts'] = segms[i]['counts'].decode() + data['segmentation'] = segms[i] + segm_json_results.append(data) + return bbox_json_results, segm_json_results + + def results2json(self, results, outfile_prefix): + """Dump the detection results to a COCO style json file. + + There are 3 types of results: proposals, bbox predictions, mask + predictions, and they have different data types. This method will + automatically recognize the type, and dump them to json files. + + Args: + results (list[list | tuple | ndarray]): Testing results of the + dataset. + outfile_prefix (str): The filename prefix of the json files. If the + prefix is "somepath/xxx", the json files will be named + "somepath/xxx.bbox.json", "somepath/xxx.segm.json", + "somepath/xxx.proposal.json". + + Returns: + dict[str: str]: Possible keys are "bbox", "segm", "proposal", and \ + values are corresponding filenames. 
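+
+        Example (illustrative; ``results`` are the per-image, per-class bbox
+        arrays from a detector's test loop, and the output prefix is
+        hypothetical):
+
+        .. code-block:: python
+
+            result_files = dataset.results2json(results, 'work_dirs/coco_eval')
+            # {'bbox': 'work_dirs/coco_eval.bbox.json',
+            #  'proposal': 'work_dirs/coco_eval.bbox.json'}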
+ """ + result_files = dict() + if isinstance(results[0], list): + json_results = self._det2json(results) + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + dump(json_results, result_files['bbox']) + elif isinstance(results[0], tuple): + json_results = self._segm2json(results) + result_files['bbox'] = f'{outfile_prefix}.bbox.json' + result_files['proposal'] = f'{outfile_prefix}.bbox.json' + result_files['segm'] = f'{outfile_prefix}.segm.json' + dump(json_results[0], result_files['bbox']) + dump(json_results[1], result_files['segm']) + elif isinstance(results[0], np.ndarray): + json_results = self._proposal2json(results) + result_files['proposal'] = f'{outfile_prefix}.proposal.json' + dump(json_results, result_files['proposal']) + else: + raise TypeError('invalid type of results') + return result_files + + def fast_eval_recall(self, results, proposal_nums, iou_thrs, logger=None): + gt_bboxes = [] + for i in range(len(self.img_ids)): + ann_ids = self.coco.get_ann_ids(img_ids=self.img_ids[i]) + ann_info = self.coco.load_anns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w, y1 + h]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, results, proposal_nums, iou_thrs, logger=logger) + ar = recalls.mean(axis=1) + return ar + + def format_results(self, results, jsonfile_prefix=None, **kwargs): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[tuple | numpy.ndarray]): Testing results of the + dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing \ + the json filepaths, tmp_dir is the temporal directory created \ + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + result_files = self.results2json(results, jsonfile_prefix) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + classwise=False, + proposal_nums=(100, 300, 1000), + iou_thrs=None, + metric_items=None): + """Evaluation in COCO protocol. + + Args: + results (list[list | tuple]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. Options are + 'bbox', 'segm', 'proposal', 'proposal_fast'. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + classwise (bool): Whether to evaluating the AP for each class. 
+ proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + iou_thrs (Sequence[float], optional): IoU threshold used for + evaluating recalls/mAPs. If set to a list, the average of all + IoUs will also be computed. If not specified, [0.50, 0.55, + 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] will be used. + Default: None. + metric_items (list[str] | str, optional): Metric items that will + be returned. If not specified, ``['AR@100', 'AR@300', + 'AR@1000', 'AR_s@1000', 'AR_m@1000', 'AR_l@1000' ]`` will be + used when ``metric=='proposal'``, ``['mAP', 'mAP_50', 'mAP_75', + 'mAP_s', 'mAP_m', 'mAP_l']`` will be used when + ``metric=='bbox' or metric=='segm'``. + + Returns: + dict[str, float]: COCO style evaluation metric. + """ + + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + if iou_thrs is None: + iou_thrs = np.linspace( + .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + if metric_items is not None: + if not isinstance(metric_items, list): + metric_items = [metric_items] + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + eval_results = OrderedDict() + cocoGt = self.coco + for metric in metrics: + msg = f'Evaluating {metric}...' + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + if metric == 'proposal_fast': + ar = self.fast_eval_recall( + results, proposal_nums, iou_thrs, logger='silent') + log_msg = [] + for i, num in enumerate(proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + log_msg.append(f'\nAR@{num}\t{ar[i]:.4f}') + log_msg = ''.join(log_msg) + print_log(log_msg, logger=logger) + continue + + iou_type = 'bbox' if metric == 'proposal' else metric + if metric not in result_files: + raise KeyError(f'{metric} is not in results') + try: + predictions = load(result_files[metric]) + if iou_type == 'segm': + # Refer to https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L331 # noqa + # When evaluating mask AP, if the results contain bbox, + # cocoapi will use the box area instead of the mask area + # for calculating the instance area. Though the overall AP + # is not affected, this leads to different + # small/medium/large mask AP results. + for x in predictions: + x.pop('bbox') + warnings.simplefilter('once') + warnings.warn( + 'The key "bbox" is deleted for more accurate mask AP ' + 'of small/medium/large instances since v2.12.0. 
This ' + 'does not change the overall mAP calculation.', + UserWarning) + cocoDt = cocoGt.loadRes(predictions) + except IndexError: + print_log( + 'The testing results of the whole dataset is empty.', + logger=logger, + level=logging.ERROR) + break + + cocoEval = COCOeval(cocoGt, cocoDt, iou_type) + cocoEval.params.catIds = self.cat_ids + cocoEval.params.imgIds = self.img_ids + cocoEval.params.maxDets = list(proposal_nums) + cocoEval.params.iouThrs = iou_thrs + # mapping of cocoEval.stats + coco_metric_names = { + 'mAP': 0, + 'mAP_50': 1, + 'mAP_75': 2, + 'mAP_s': 3, + 'mAP_m': 4, + 'mAP_l': 5, + 'AR@100': 6, + 'AR@300': 7, + 'AR@1000': 8, + 'AR_s@1000': 9, + 'AR_m@1000': 10, + 'AR_l@1000': 11 + } + if metric_items is not None: + for metric_item in metric_items: + if metric_item not in coco_metric_names: + raise KeyError( + f'metric item {metric_item} is not supported') + + if metric == 'proposal': + cocoEval.params.useCats = 0 + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + if metric_items is None: + metric_items = [ + 'AR@100', 'AR@300', 'AR@1000', 'AR_s@1000', + 'AR_m@1000', 'AR_l@1000' + ] + + for item in metric_items: + val = float( + f'{cocoEval.stats[coco_metric_names[item]]:.3f}') + eval_results[item] = val + else: + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + if classwise: # Compute per-category AP + # Compute per-category AP + # from https://github.com/facebookresearch/detectron2/ + precisions = cocoEval.eval['precision'] + # precision: (iou, recall, cls, area range, max dets) + assert len(self.cat_ids) == precisions.shape[2] + + results_per_category = [] + for idx, catId in enumerate(self.cat_ids): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + nm = self.coco.loadCats(catId)[0] + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + if precision.size: + ap = np.mean(precision) + else: + ap = float('nan') + results_per_category.append( + (f'{nm["name"]}', f'{float(ap):0.3f}')) + + num_columns = min(6, len(results_per_category) * 2) + results_flatten = list( + itertools.chain(*results_per_category)) + headers = ['category', 'AP'] * (num_columns // 2) + results_2d = itertools.zip_longest(*[ + results_flatten[i::num_columns] + for i in range(num_columns) + ]) + table_data = [headers] + table_data += [result for result in results_2d] + table = AsciiTable(table_data) + print_log('\n' + table.table, logger=logger) + + if metric_items is None: + metric_items = [ + 'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l' + ] + + for metric_item in metric_items: + key = f'{metric}_{metric_item}' + val = float( + f'{cocoEval.stats[coco_metric_names[metric_item]]:.3f}' + ) + eval_results[key] = val + ap = cocoEval.stats[:6] + eval_results[f'{metric}_mAP_copypaste'] = ( + f'{ap[0]:.3f} {ap[1]:.3f} {ap[2]:.3f} {ap[3]:.3f} ' + f'{ap[4]:.3f} {ap[5]:.3f}') + if tmp_dir is not None: + tmp_dir.cleanup() + return eval_results diff --git a/mmcv/datasets/custom.py b/mmcv/datasets/custom.py new file mode 100644 index 0000000..4cd8a1d --- /dev/null +++ b/mmcv/datasets/custom.py @@ -0,0 +1,362 @@ +import os.path as osp +import warnings +from collections import OrderedDict + +import numpy as np +from mmcv.utils import print_log +from mmcv.fileio.io import load +from mmcv.fileio.parse import list_from_file +from terminaltables import AsciiTable +from torch.utils.data import Dataset + +from mmcv.core import eval_map, eval_recalls +from .builder import DATASETS +from .pipelines import Compose + 
+ +@DATASETS.register_module() +class CustomDataset(Dataset): + """Custom dataset for detection. + + The annotation format is shown as follows. The `ann` field is optional for + testing. + + .. code-block:: none + + [ + { + 'filename': 'a.jpg', + 'width': 1280, + 'height': 720, + 'ann': { + 'bboxes': (n, 4) in (x1, y1, x2, y2) order. + 'labels': (n, ), + 'bboxes_ignore': (k, 4), (optional field) + 'labels_ignore': (k, 4) (optional field) + } + }, + ... + ] + + Args: + ann_file (str): Annotation file path. + pipeline (list[dict]): Processing pipeline. + classes (str | Sequence[str], optional): Specify classes to load. + If is None, ``cls.CLASSES`` will be used. Default: None. + data_root (str, optional): Data root for ``ann_file``, + ``img_prefix``, ``seg_prefix``, ``proposal_file`` if specified. + test_mode (bool, optional): If set True, annotation will not be loaded. + filter_empty_gt (bool, optional): If set true, images without bounding + boxes of the dataset's classes will be filtered out. This option + only works when `test_mode=False`, i.e., we never filter images + during tests. + """ + + CLASSES = None + + def __init__(self, + ann_file, + pipeline, + classes=None, + data_root=None, + img_prefix='', + seg_prefix=None, + proposal_file=None, + test_mode=False, + filter_empty_gt=True): + self.ann_file = ann_file + self.data_root = data_root + self.img_prefix = img_prefix + self.seg_prefix = seg_prefix + self.proposal_file = proposal_file + self.test_mode = test_mode + self.filter_empty_gt = filter_empty_gt + self.CLASSES = self.get_classes(classes) + + # join paths if data_root is specified + if self.data_root is not None: + if not osp.isabs(self.ann_file): + self.ann_file = osp.join(self.data_root, self.ann_file) + if not (self.img_prefix is None or osp.isabs(self.img_prefix)): + self.img_prefix = osp.join(self.data_root, self.img_prefix) + if not (self.seg_prefix is None or osp.isabs(self.seg_prefix)): + self.seg_prefix = osp.join(self.data_root, self.seg_prefix) + if not (self.proposal_file is None + or osp.isabs(self.proposal_file)): + self.proposal_file = osp.join(self.data_root, + self.proposal_file) + # load annotations (and proposals) + self.data_infos = self.load_annotations(self.ann_file) + + if self.proposal_file is not None: + self.proposals = self.load_proposals(self.proposal_file) + else: + self.proposals = None + + # filter images too small and containing no annotations + if not test_mode: + valid_inds = self._filter_imgs() + self.data_infos = [self.data_infos[i] for i in valid_inds] + if self.proposals is not None: + self.proposals = [self.proposals[i] for i in valid_inds] + # set group flag for the sampler + self._set_group_flag() + + # processing pipeline + self.pipeline = Compose(pipeline) + + def __len__(self): + """Total number of samples of data.""" + return len(self.data_infos) + + def load_annotations(self, ann_file): + """Load annotation from annotation file.""" + return load(ann_file) + + def load_proposals(self, proposal_file): + """Load proposal from proposal file.""" + return load(proposal_file) + + def get_ann_info(self, idx): + """Get annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. + """ + + return self.data_infos[idx]['ann'] + + def get_cat_ids(self, idx): + """Get category ids by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. 
+ """ + + return self.data_infos[idx]['ann']['labels'].astype(np.int).tolist() + + def pre_pipeline(self, results): + """Prepare results dict for pipeline.""" + results['img_prefix'] = self.img_prefix + results['seg_prefix'] = self.seg_prefix + results['proposal_file'] = self.proposal_file + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + + def _filter_imgs(self, min_size=32): + """Filter images too small.""" + if self.filter_empty_gt: + warnings.warn( + 'CustomDataset does not support filtering empty gt images.') + valid_inds = [] + for i, img_info in enumerate(self.data_infos): + if min(img_info['width'], img_info['height']) >= min_size: + valid_inds.append(i) + return valid_inds + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) + for i in range(len(self)): + img_info = self.data_infos[i] + if img_info['width'] / img_info['height'] > 1: + self.flag[i] = 1 + + def _rand_another(self, idx): + """Get another random index from the same group as the given index.""" + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + """Get training/test data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training/test data (with annotation if `test_mode` is set \ + True). + """ + + if self.test_mode: + return self.prepare_test_img(idx) + while True: + data = self.prepare_train_img(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def prepare_train_img(self, idx): + """Get training data and annotations after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training data and annotation after pipeline with new keys \ + introduced by pipeline. + """ + + img_info = self.data_infos[idx] + ann_info = self.get_ann_info(idx) + results = dict(img_info=img_info, ann_info=ann_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Get testing data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Testing data after pipeline with new keys introduced by \ + pipeline. + """ + + img_info = self.data_infos[idx] + results = dict(img_info=img_info) + if self.proposals is not None: + results['proposals'] = self.proposals[idx] + self.pre_pipeline(results) + return self.pipeline(results) + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Returns: + tuple[str] or list[str]: Names of categories of the dataset. 
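+
+        Example (illustrative; ``classes.txt`` is a hypothetical file with one
+        class name per line):
+
+        .. code-block:: python
+
+            CustomDataset.get_classes(['car', 'pedestrian'])  # -> ['car', 'pedestrian']
+            CustomDataset.get_classes('classes.txt')          # -> names read from the file
+            CustomDataset.get_classes(None)                   # -> cls.CLASSES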
+ """ + if classes is None: + return cls.CLASSES + + if isinstance(classes, str): + # take it as a file path + class_names = list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f'Unsupported type {type(classes)} of classes.') + + return class_names + + def format_results(self, results, **kwargs): + """Place holder to format result to dataset specific output.""" + + def evaluate(self, + results, + metric='mAP', + logger=None, + proposal_nums=(100, 300, 1000), + iou_thr=0.5, + scale_ranges=None): + """Evaluate the dataset. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + proposal_nums (Sequence[int]): Proposal number used for evaluating + recalls, such as recall@100, recall@1000. + Default: (100, 300, 1000). + iou_thr (float | list[float]): IoU threshold. Default: 0.5. + scale_ranges (list[tuple] | None): Scale ranges for evaluating mAP. + Default: None. + """ + + if not isinstance(metric, str): + assert len(metric) == 1 + metric = metric[0] + allowed_metrics = ['mAP', 'recall'] + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + annotations = [self.get_ann_info(i) for i in range(len(self))] + eval_results = OrderedDict() + iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr + if metric == 'mAP': + assert isinstance(iou_thrs, list) + mean_aps = [] + for iou_thr in iou_thrs: + print_log(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}') + mean_ap, _ = eval_map( + results, + annotations, + scale_ranges=scale_ranges, + iou_thr=iou_thr, + dataset=self.CLASSES, + logger=logger) + mean_aps.append(mean_ap) + eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3) + eval_results['mAP'] = sum(mean_aps) / len(mean_aps) + elif metric == 'recall': + gt_bboxes = [ann['bboxes'] for ann in annotations] + recalls = eval_recalls( + gt_bboxes, results, proposal_nums, iou_thr, logger=logger) + for i, num in enumerate(proposal_nums): + for j, iou in enumerate(iou_thrs): + eval_results[f'recall@{num}@{iou}'] = recalls[i, j] + if recalls.shape[1] > 1: + ar = recalls.mean(axis=1) + for i, num in enumerate(proposal_nums): + eval_results[f'AR@{num}'] = ar[i] + return eval_results + + def __repr__(self): + """Print the number of instance number.""" + dataset_type = 'Test' if self.test_mode else 'Train' + result = (f'\n{self.__class__.__name__} {dataset_type} dataset ' + f'with number of images {len(self)}, ' + f'and instance counts: \n') + if self.CLASSES is None: + result += 'Category names are not provided. 
\n' + return result + instance_count = np.zeros(len(self.CLASSES) + 1).astype(int) + # count the instance number in each image + for idx in range(len(self)): + label = self.get_ann_info(idx)['labels'] + unique, counts = np.unique(label, return_counts=True) + if len(unique) > 0: + # add the occurrence number to each class + instance_count[unique] += counts + else: + # background is the last index + instance_count[-1] += 1 + # create a table with category count + table_data = [['category', 'count'] * 5] + row_data = [] + for cls, count in enumerate(instance_count): + if cls < len(self.CLASSES): + row_data += [f'{cls} [{self.CLASSES[cls]}]', f'{count}'] + else: + # add the background number + row_data += ['-1 background', f'{count}'] + if len(row_data) == 10: + table_data.append(row_data) + row_data = [] + + table = AsciiTable(table_data) + result += table.table + return result diff --git a/mmcv/datasets/custom_3d.py b/mmcv/datasets/custom_3d.py new file mode 100644 index 0000000..88c8bfb --- /dev/null +++ b/mmcv/datasets/custom_3d.py @@ -0,0 +1,370 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import tempfile +import warnings +from os import path as osp +from torch.utils.data import Dataset + +from mmcv.datasets.builder import DATASETS +from ..core.bbox import get_box_type +from .pipelines import Compose +from .utils import extract_result_dict, get_loading_pipeline +from mmcv.fileio.io import load, dump +from mmcv.fileio.parse import list_from_file + +@DATASETS.register_module() +class Custom3DDataset(Dataset): + """Customized 3D dataset. + + This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI + dataset. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR'. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + + def __init__(self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False): + super().__init__() + self.data_root = data_root + self.ann_file = ann_file + self.test_mode = test_mode + self.modality = modality + self.filter_empty_gt = filter_empty_gt + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + + self.CLASSES = self.get_classes(classes) + self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} + self.data_infos = self.load_annotations(self.ann_file) + + if pipeline is not None: + self.pipeline = Compose(pipeline) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations. 
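+
+        Expected structure (illustrative, inferred from ``get_data_info`` and
+        ``evaluate`` below; real info files may carry additional keys):
+
+        .. code-block:: python
+
+            [
+                {
+                    'point_cloud': {'lidar_idx': '000000'},
+                    'pts_path': 'points/000000.bin',
+                    'annos': {...},  # per-sample ground-truth annotations
+                },
+                ...
+            ]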
+ """ + return load(ann_file) + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info['point_cloud']['lidar_idx'] + pts_filename = osp.join(self.data_root, info['pts_path']) + + input_dict = dict( + pts_filename=pts_filename, + sample_idx=sample_idx, + file_name=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): + return None + return input_dict + + def pre_pipeline(self, results): + """Initialization before data preparation. + + Args: + results (dict): Dict before data preprocessing. + + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results['img_fields'] = [] + results['bbox3d_fields'] = [] + results['pts_mask_fields'] = [] + results['pts_seg_fields'] = [] + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + results['box_type_3d'] = self.box_type_3d + results['box_mode_3d'] = self.box_mode_3d + + def prepare_train_data(self, index): + """Training data preparation. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Training data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or + ~(example['gt_labels_3d']._data != -1).any()): + return None + return example + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. + """ + if classes is None: + return cls.CLASSES + + if isinstance(classes, str): + # take it as a file path + class_names = list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f'Unsupported type {type(classes)} of classes.') + + return class_names + + def format_results(self, + outputs, + pklfile_prefix=None, + submission_prefix=None): + """Format the results to pkl file. 
+ + Args: + outputs (list[dict]): Testing results of the dataset. + pklfile_prefix (str | None): The prefix of pkl files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: (outputs, tmp_dir), outputs is the detection results, \ + tmp_dir is the temporal directory created for saving json \ + files when ``jsonfile_prefix`` is not specified. + """ + if pklfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + pklfile_prefix = osp.join(tmp_dir.name, 'results') + out = f'{pklfile_prefix}.pkl' + dump(outputs, out) + return outputs, tmp_dir + + def evaluate(self, + results, + metric=None, + iou_thr=(0.25, 0.5), + logger=None, + show=False, + out_dir=None, + pipeline=None): + """Evaluate. + + Evaluation in indoor protocol. + + Args: + results (list[dict]): List of results. + metric (str | list[str]): Metrics to be evaluated. + iou_thr (list[float]): AP IoU thresholds. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict: Evaluation results. + """ + from mmcv.core.evaluation import indoor_eval + assert isinstance( + results, list), f'Expect results to be list, got {type(results)}.' + assert len(results) > 0, 'Expect length of results > 0.' + assert len(results) == len(self.data_infos) + assert isinstance( + results[0], dict + ), f'Expect elements in results to be dict, got {type(results[0])}.' + gt_annos = [info['annos'] for info in self.data_infos] + label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)} + ret_dict = indoor_eval( + gt_annos, + results, + iou_thr, + label2cat, + logger=logger, + box_type_3d=self.box_type_3d, + box_mode_3d=self.box_mode_3d) + if show: + self.show(results, out_dir, pipeline=pipeline) + + return ret_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + raise NotImplementedError('_build_default_pipeline is not implemented ' + f'for dataset {self.__class__.__name__}') + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + + Args: + pipeline (list[dict] | None): Input pipeline. If None is given, \ + get from self.pipeline. + """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _extract_data(self, index, pipeline, key, load_annos=False): + """Load data using input pipeline and extract data according to key. + + Args: + index (int): Index for accessing the target data. + pipeline (:obj:`Compose`): Composed data loading pipeline. + key (str | list[str]): One single or a list of data key. + load_annos (bool): Whether to load data annotations. + If True, need to set self.test_mode as False before loading. + + Returns: + np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: + A single or a list of loaded data. + """ + assert pipeline is not None, 'data loading pipeline is not provided' + # when we want to load ground-truth via pipeline (e.g. 
bbox, seg mask) + # we need to set self.test_mode as False so that we have 'annos' + if load_annos: + original_test_mode = self.test_mode + self.test_mode = False + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + if load_annos: + self.test_mode = original_test_mode + + return data + + def __len__(self): + """Return the length of data infos. + + Returns: + int: Length of data infos. + """ + return len(self.data_infos) + + def _rand_another(self, idx): + """Randomly get another item with the same flag. + + Returns: + int: Another index of item with the same flag. + """ + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + """Get item from infos according to the given index. + + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. In 3D datasets, they are all the same, thus are all + zeros. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) diff --git a/mmcv/datasets/custom_nuscenes_dataset.py b/mmcv/datasets/custom_nuscenes_dataset.py new file mode 100644 index 0000000..17c9e5a --- /dev/null +++ b/mmcv/datasets/custom_nuscenes_dataset.py @@ -0,0 +1,246 @@ +import copy + +import numpy as np +from mmcv.datasets import DATASETS +from mmcv.datasets import NuScenesDataset +from os import path as osp +from mmcv.datasets import DATASETS +import torch +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .nuscnes_eval import NuScenesEval_custom +from mmcv.utils import save_tensor +from mmcv.parallel import DataContainer as DC +import random +from mmcv.fileio.io import load + + +@DATASETS.register_module() +class CustomNuScenesDataset(NuScenesDataset): + r"""NuScenes Dataset. + + This datset only add camera intrinsics and extrinsics to the results. + """ + + def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. 
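+
+        Sketch of the temporal sampling performed below (illustrative, with
+        ``queue_length=4`` and ``index=10``):
+
+        .. code-block:: python
+
+            index_list = list(range(10 - 4, 10))    # [6, 7, 8, 9]
+            random.shuffle(index_list)
+            index_list = sorted(index_list[1:])     # randomly keep 3 past frames
+            index_list.append(10)                   # e.g. [6, 8, 9, 10]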
+ """ + queue = [] + index_list = list(range(index-self.queue_length, index)) + random.shuffle(index_list) + index_list = sorted(index_list[1:]) + index_list.append(index) + for i in index_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + queue.append(example) + return self.union2one(queue) + + + def union2one(self, queue): + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_scene_token = None + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if metas_map[i]['scene_token'] != prev_scene_token: + metas_map[i]['prev_bev_exists'] = False + prev_scene_token = metas_map[i]['scene_token'] + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev_exists'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
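+
+        Note (sketch of the projection composed below): each ``lidar2img``
+        entry is ``viewpad @ lidar2cam_rt.T``, i.e. the 4x4-padded camera
+        intrinsic applied after the lidar-to-camera extrinsic, so a
+        homogeneous lidar point ``p = [x, y, z, 1]`` projects as:
+
+        .. code-block:: python
+
+            uvw = lidar2img @ p
+            u, v = uvw[0] / uvw[2], uvw[1] / uvw[2]   # pixel coordinates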
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + ) +#(['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT']) + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + for cam_type, cam_info in info['cams'].items(): + # if cam_type in ['CAM_FRONT','CAM_BACK_LEFT']: + # continue + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + return input_dict + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. 
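+
+        Example of the returned keys (values are illustrative; the key layout
+        matches the formatting code below):
+
+        .. code-block:: python
+
+            {
+                'pts_bbox_NuScenes/car_AP_dist_2.0': 0.5123,
+                'pts_bbox_NuScenes/car_trans_err': 0.3210,
+                'pts_bbox_NuScenes/mAP': 0.4000,
+                'pts_bbox_NuScenes/NDS': 0.4500,
+            }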
+ """ + from nuscenes import NuScenes + self.nusc = NuScenes(version=self.version, dataroot=self.data_root, + verbose=True) + + import pdb + pdb.set_trace() + + output_dir = osp.join(*osp.split(result_path)[:-1]) + + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + self.nusc_eval.main(plot_examples=0, render_curves=False) + # record metrics + metrics = load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail diff --git a/mmcv/datasets/custom_nuscenes_dataset_v2.py b/mmcv/datasets/custom_nuscenes_dataset_v2.py new file mode 100644 index 0000000..305d6b3 --- /dev/null +++ b/mmcv/datasets/custom_nuscenes_dataset_v2.py @@ -0,0 +1,302 @@ +import copy +from .nuscenes_dataset import NuScenesDataset +from .dd3d_nuscenes_dataset import DD3DNuscenesDataset +from os import path as osp +from mmcv.datasets import DATASETS +import torch +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .nuscnes_eval import NuScenesEval_custom +from mmcv.parallel import DataContainer as DC +from collections import defaultdict, OrderedDict + + +@DATASETS.register_module() +class CustomNuScenesDatasetV2(NuScenesDataset): + def __init__(self, frames=(),mono_cfg=None, overlap_test=False,*args, **kwargs): + super().__init__(*args, **kwargs) + self.frames = frames + self.queue_length = len(frames) + self.overlap_test = overlap_test + self.mono_cfg = mono_cfg + if not self.test_mode and mono_cfg is not None: + self.mono_dataset = DD3DNuscenesDataset(**mono_cfg) + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. 
+ """ + data_queue = OrderedDict() + input_dict = self.get_data_info(index) + cur_scene_token = input_dict['scene_token'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + data_queue[0] = example + + for frame_idx in self.frames: + chosen_idx = index + frame_idx + if frame_idx ==0 or chosen_idx <0 or chosen_idx >= len(self.data_infos): + continue + info = self.data_infos[chosen_idx] + input_dict = self.prepare_input_dict(info) + if input_dict['scene_token'] == cur_scene_token: + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + data_queue[frame_idx] = example + + data_queue = OrderedDict(sorted(data_queue.items())) + ret = defaultdict(list) + for i in range(len(data_queue[0]['img'])): + single_aug_data_queue = {} + for t in data_queue.keys(): + single_example = {} + for key ,value in data_queue[t].items(): + single_example[key] = value[i] + single_aug_data_queue[t] = single_example + single_aug_data_queue = OrderedDict(sorted(single_aug_data_queue.items())) + single_aug_sample = self.union2one(single_aug_data_queue) + + for key, value in single_aug_sample.items(): + ret[key].append(value) + return ret + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + data_queue = OrderedDict() + input_dict = self.get_data_info(index) + if input_dict is None: + return None + cur_scene_token = input_dict['scene_token'] + # cur_frame_idx = input_dict['frame_idx'] + ann_info = copy.deepcopy(input_dict['ann_info']) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + data_queue[0] = example + aug_param = copy.deepcopy(example['aug_param']) if 'aug_param' in example else {} + + # frame_idx_to_idx = self.scene_to_frame_idx_to_idx[cur_scene_token] + for frame_idx in self.frames: + chosen_idx = index + frame_idx + if frame_idx ==0 or chosen_idx <0 or chosen_idx >= len(self.data_infos): + continue + info = self.data_infos[chosen_idx] + input_dict = self.prepare_input_dict(info) + if input_dict['scene_token'] == cur_scene_token: + input_dict['ann_info'] = copy.deepcopy(ann_info) # only for pipeline, should never be used + self.pre_pipeline(input_dict) + input_dict['aug_param'] = copy.deepcopy(aug_param) + example = self.pipeline(input_dict) + data_queue[frame_idx] = example + + data_queue = OrderedDict(sorted(data_queue.items())) + return self.union2one(data_queue) + + def union2one(self, queue: dict): + """ + convert sample queue into one single sample. 
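+
+        Sketch of the temporal alignment applied below: each adjacent frame's
+        ``lidar2img`` is re-expressed in the current frame's lidar coordinates
+        through
+
+        .. code-block:: python
+
+            lidaradj2lidarcurr = (
+                np.linalg.inv(lidar2ego)
+                @ np.linalg.inv(egocurr2global)
+                @ egoadj2global
+                @ lidar2ego)
+            lidar2img_adj = lidar2img_adj @ np.linalg.inv(lidaradj2lidarcurr)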
+ """ + imgs_list = [each['img'].data for each in queue.values()] + lidar2ego = np.eye(4, dtype=np.float32) + lidar2ego[:3, :3] = Quaternion(queue[0]['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = queue[0]['lidar2ego_translation'] + + egocurr2global = np.eye(4, dtype=np.float32) + egocurr2global[:3,:3] = Quaternion(queue[0]['ego2global_rotation']).rotation_matrix + egocurr2global[:3,3] = queue[0]['ego2global_translation'] + metas_map = {} + for i, each in queue.items(): + metas_map[i] = each['img_metas'].data + metas_map[i]['timestamp'] = each['timestamp'] + if 'aug_param' in each: + metas_map[i]['aug_param'] = each['aug_param'] + if i == 0: + metas_map[i]['lidaradj2lidarcurr'] = None + else: + egoadj2global = np.eye(4, dtype=np.float32) + egoadj2global[:3,:3] = Quaternion(each['ego2global_rotation']).rotation_matrix + egoadj2global[:3,3] = each['ego2global_translation'] + + lidaradj2lidarcurr = np.linalg.inv(lidar2ego) @ np.linalg.inv(egocurr2global) @ egoadj2global @ lidar2ego + metas_map[i]['lidaradj2lidarcurr'] = lidaradj2lidarcurr + for i_cam in range(len(metas_map[i]['lidar2img'])): + metas_map[i]['lidar2img'][i_cam] = metas_map[i]['lidar2img'][i_cam] @ np.linalg.inv(lidaradj2lidarcurr) + queue[0]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[0]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[0] + return queue + + def prepare_input_dict(self, info): + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + lidar2ego_translation=info['lidar2ego_translation'], + lidar2ego_rotation=info['lidar2ego_rotation'], + prev=info['prev'], + next=info['next'], + scene_token=info['scene_token'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam2img=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + return input_dict + + def filter_crowd_annotations(self, data_dict): + for ann in data_dict["annotations"]: + if ann.get("iscrowd", 0) == 0: + return True + return False + + def get_data_info(self, index): + info = self.data_infos[index] + input_dict = self.prepare_input_dict(info) + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + if not self.test_mode and self.mono_cfg is not None: + if input_dict is None: + return None + info = self.data_infos[index] + img_ids = [] + for cam_type, cam_info in info['cams'].items(): + img_ids.append(cam_info['sample_data_token']) + + mono_input_dict = []; mono_ann_index = [] + for i, 
img_id in enumerate(img_ids): + tmp_dict = self.mono_dataset.getitem_by_datumtoken(img_id) + if tmp_dict is not None: + if self.filter_crowd_annotations(tmp_dict): + mono_input_dict.append(tmp_dict) + mono_ann_index.append(i) + + # filter empth annotation + if len(mono_ann_index) == 0: + return None + + mono_ann_index = DC(mono_ann_index, cpu_only=True) + input_dict['mono_input_dict'] = mono_input_dict + input_dict['mono_ann_idx'] = mono_ann_index + return input_dict + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. + """ + from nuscenes import NuScenes + self.nusc = NuScenes(version=self.version, dataroot=self.data_root, + verbose=True) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + self.nusc_eval.main(plot_examples=0, render_curves=False) + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail \ No newline at end of file diff --git a/mmcv/datasets/data_utils/data_utils.py b/mmcv/datasets/data_utils/data_utils.py new file mode 100644 index 0000000..331e02f --- /dev/null +++ b/mmcv/datasets/data_utils/data_utils.py @@ -0,0 +1,174 @@ +import math +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from nuscenes.utils.data_classes import Box as NuScenesBox +import pyquaternion + + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + Args: + detection (dict): Detection results. + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
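+
+    Example (illustrative; ``det`` is one result dict produced by the model's
+    test step):
+
+    .. code-block:: python
+
+        boxes = output_to_nusc_box(det)
+        b = boxes[0]
+        print(b.center, b.wlh, b.score, b.label, b.token)  # token holds the track id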
+ """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + if 'track_ids' in detection: + ids = detection['track_ids'].numpy() + else: + ids = np.ones_like(labels) + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = NuScenesBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box.token = ids[i] + box_list.append(box) + return box_list + + +def output_to_nusc_box_det(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. + """ + if 'boxes_3d_det' in detection: + box3d = detection['boxes_3d_det'] + scores = detection['scores_3d_det'].numpy() + labels = detection['labels_3d_det'].numpy() + else: + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + box = NuScenesBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str, optional): Evaluation version. + Default: 'detection_cvpr_2019' + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + keep_idx = [] + for i, box in enumerate(boxes): + # Move box to ego vehicle coord system + box.rotate(Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. 
+ cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + keep_idx.append(i) + return box_list, keep_idx + + +def obtain_map_info(nusc, + nusc_maps, + sample, + patch_size=(102.4, 102.4), + canvas_size=(256, 256), + layer_names=['lane_divider', 'road_divider'], + thickness=10): + """ + Export 2d annotation from the info file and raw data. + """ + l2e_r = sample['lidar2ego_rotation'] + l2e_t = sample['lidar2ego_translation'] + e2g_r = sample['ego2global_rotation'] + e2g_t = sample['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + scene = nusc.get('scene', sample['scene_token']) + log = nusc.get('log', scene['log_token']) + nusc_map = nusc_maps[log['location']] + if layer_names is None: + layer_names = nusc_map.non_geometric_layers + + l2g_r_mat = (l2e_r_mat.T @ e2g_r_mat.T).T + l2g_t = l2e_t @ e2g_r_mat.T + e2g_t + patch_box = (l2g_t[0], l2g_t[1], patch_size[0], patch_size[1]) + patch_angle = math.degrees(Quaternion(matrix=l2g_r_mat).yaw_pitch_roll[0]) + + map_mask = nusc_map.get_map_mask( + patch_box, patch_angle, layer_names, canvas_size=canvas_size) + map_mask = map_mask[-2] | map_mask[-1] + map_mask = map_mask[np.newaxis, :] + map_mask = map_mask.transpose((2, 1, 0)).squeeze(2) # (H, W, C) + + erode = nusc_map.get_map_mask(patch_box, patch_angle, [ + 'drivable_area'], canvas_size=canvas_size) + erode = erode.transpose((2, 1, 0)).squeeze(2) + + map_mask = np.concatenate([erode[None], map_mask[None]], axis=0) + return map_mask diff --git a/mmcv/datasets/data_utils/rasterize.py b/mmcv/datasets/data_utils/rasterize.py new file mode 100644 index 0000000..c30a870 --- /dev/null +++ b/mmcv/datasets/data_utils/rasterize.py @@ -0,0 +1,160 @@ +import cv2 +import numpy as np +from shapely import affinity +from shapely.geometry import LineString, box + + +def get_patch_coord(patch_box, patch_angle=0.0): + patch_x, patch_y, patch_h, patch_w = patch_box + + x_min = patch_x - patch_w / 2.0 + y_min = patch_y - patch_h / 2.0 + x_max = patch_x + patch_w / 2.0 + y_max = patch_y + patch_h / 2.0 + + patch = box(x_min, y_min, x_max, y_max) + patch = affinity.rotate(patch, patch_angle, origin=( + patch_x, patch_y), use_radians=False) + + return patch + + +def get_discrete_degree(vec, angle_class=36): + deg = np.mod(np.degrees(np.arctan2(vec[1], vec[0])), 360) + deg = (int(deg / (360 / angle_class) + 0.5) % angle_class) + 1 + return deg + + +def mask_for_lines(lines, mask, thickness, idx, type='index', angle_class=36): + coords = np.asarray(list(lines.coords), np.int32) + coords = coords.reshape((-1, 2)) + if len(coords) < 2: + return mask, idx + if type == 'backward': + coords = np.flip(coords, 0) + + if type == 'index': + cv2.polylines(mask, [coords], False, color=idx, thickness=thickness) + idx += 1 + else: + for i in range(len(coords) - 1): + cv2.polylines(mask, [coords[i:]], False, color=get_discrete_degree( + coords[i + 1] - coords[i], angle_class=angle_class), thickness=thickness) + return mask, idx + + +def line_geom_to_mask(layer_geom, confidence_levels, local_box, canvas_size, thickness, idx, type='index', angle_class=36): + patch_x, patch_y, patch_h, patch_w = local_box + + patch = get_patch_coord(local_box) + + canvas_h = canvas_size[0] + 
canvas_w = canvas_size[1] + scale_height = canvas_h / patch_h + scale_width = canvas_w / patch_w + + trans_x = -patch_x + patch_w / 2.0 + trans_y = -patch_y + patch_h / 2.0 + + map_mask = np.zeros(canvas_size, np.uint8) + + for line in layer_geom: + if isinstance(line, tuple): + line, confidence = line + else: + confidence = None + new_line = line.intersection(patch) + if not new_line.is_empty: + new_line = affinity.affine_transform( + new_line, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) + new_line = affinity.scale( + new_line, xfact=scale_width, yfact=scale_height, origin=(0, 0)) + confidence_levels.append(confidence) + if new_line.geom_type == 'MultiLineString': + for new_single_line in new_line: + map_mask, idx = mask_for_lines( + new_single_line, map_mask, thickness, idx, type, angle_class) + else: + map_mask, idx = mask_for_lines( + new_line, map_mask, thickness, idx, type, angle_class) + return map_mask, idx + + +def overlap_filter(mask, filter_mask): + C, _, _ = mask.shape + for c in range(C-1, -1, -1): + filter = np.repeat((filter_mask[c] != 0)[None, :], c, axis=0) + mask[:c][filter] = 0 + + return mask + + +def preprocess_map(vectors, patch_size, canvas_size, num_classes, thickness, angle_class): + confidence_levels = [-1] + vector_num_list = {} + for i in range(num_classes): + vector_num_list[i] = [] + + for vector in vectors: + if vector['pts_num'] >= 2: + vector_num_list[vector['type']].append( + LineString(vector['pts'][:vector['pts_num']])) + + local_box = (0.0, 0.0, patch_size[0], patch_size[1]) + + idx = 1 + filter_masks = [] + instance_masks = [] + forward_masks = [] + backward_masks = [] + for i in range(num_classes): + map_mask, idx = line_geom_to_mask( + vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, idx) + instance_masks.append(map_mask) + filter_mask, _ = line_geom_to_mask( + vector_num_list[i], confidence_levels, local_box, canvas_size, thickness + 4, 1) + filter_masks.append(filter_mask) + forward_mask, _ = line_geom_to_mask( + vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, 1, type='forward', angle_class=angle_class) + forward_masks.append(forward_mask) + backward_mask, _ = line_geom_to_mask( + vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, 1, type='backward', angle_class=angle_class) + backward_masks.append(backward_mask) + + filter_masks = np.stack(filter_masks) + instance_masks = np.stack(instance_masks) + forward_masks = np.stack(forward_masks) + backward_masks = np.stack(backward_masks) + + instance_masks = overlap_filter(instance_masks, filter_masks) + forward_masks = overlap_filter( + forward_masks, filter_masks).sum(0).astype('int32') + backward_masks = overlap_filter( + backward_masks, filter_masks).sum(0).astype('int32') + + semantic_masks = instance_masks != 0 + + return semantic_masks, instance_masks, forward_masks, backward_masks + + +def rasterize_map(vectors, patch_size, canvas_size, num_classes, thickness): + confidence_levels = [-1] + vector_num_list = {} + for i in range(num_classes): + vector_num_list[i] = [] + + for vector in vectors: + if vector['pts_num'] >= 2: + vector_num_list[vector['type']].append( + (LineString(vector['pts'][:vector['pts_num']]), vector.get('confidence_level', 1))) + + local_box = (0.0, 0.0, patch_size[0], patch_size[1]) + + idx = 1 + masks = [] + for i in range(num_classes): + map_mask, idx = line_geom_to_mask( + vector_num_list[i], confidence_levels, local_box, canvas_size, thickness, idx) + masks.append(map_mask) + + return np.stack(masks), 
confidence_levels diff --git a/mmcv/datasets/data_utils/trajectory_api.py b/mmcv/datasets/data_utils/trajectory_api.py new file mode 100644 index 0000000..83b2c3b --- /dev/null +++ b/mmcv/datasets/data_utils/trajectory_api.py @@ -0,0 +1,283 @@ +import numpy as np +from nuscenes.prediction import (PredictHelper, + convert_local_coords_to_global, + convert_global_coords_to_local) +from mmcv.core.bbox.structures.box_3d_mode import Box3DMode +from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmcv.parallel import DataContainer as DC +from mmcv.datasets.pipelines import to_tensor + +class NuScenesTraj(object): + def __init__(self, + nusc, + predict_steps, + planning_steps, + past_steps, + fut_steps, + with_velocity, + CLASSES, + box_mode_3d, + use_nonlinear_optimizer=False): + super().__init__() + self.nusc = nusc + self.prepare_sdc_vel_info() + self.predict_steps = predict_steps + self.planning_steps = planning_steps + self.past_steps = past_steps + self.fut_steps = fut_steps + self.with_velocity = with_velocity + self.CLASSES = CLASSES + self.box_mode_3d = box_mode_3d + self.predict_helper = PredictHelper(self.nusc) + self.use_nonlinear_optimizer = use_nonlinear_optimizer + + def get_traj_label(self, sample_token, ann_tokens): + sd_rec = self.nusc.get('sample', sample_token) + fut_traj_all = [] + fut_traj_valid_mask_all = [] + past_traj_all = [] + past_traj_valid_mask_all = [] + _, boxes, _ = self.nusc.get_sample_data(sd_rec['data']['LIDAR_TOP'], selected_anntokens=ann_tokens) + for i, ann_token in enumerate(ann_tokens): + box = boxes[i] + instance_token = self.nusc.get('sample_annotation', ann_token)['instance_token'] + fut_traj_local = self.predict_helper.get_future_for_agent(instance_token, sample_token, seconds=6, in_agent_frame=True) + past_traj_local = self.predict_helper.get_past_for_agent(instance_token, sample_token, seconds=2, in_agent_frame=True) + + fut_traj = np.zeros((self.predict_steps, 2)) + fut_traj_valid_mask = np.zeros((self.predict_steps, 2)) + past_traj = np.zeros((self.past_steps + self.fut_steps, 2)) + past_traj_valid_mask = np.zeros((self.past_steps + self.fut_steps, 2)) + if fut_traj_local.shape[0] > 0: + if self.use_nonlinear_optimizer: + trans = box.center + else: + trans = np.array([0, 0, 0]) + rot = Quaternion(matrix=box.rotation_matrix) + fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot) + fut_traj[:fut_traj_scence_centric.shape[0], :] = fut_traj_scence_centric + fut_traj_valid_mask[:fut_traj_scence_centric.shape[0], :] = 1 + if past_traj_local.shape[0] > 0: + trans = np.array([0, 0, 0]) + rot = Quaternion(matrix=box.rotation_matrix) + past_traj_scence_centric = convert_local_coords_to_global(past_traj_local, trans, rot) + past_traj[:past_traj_scence_centric.shape[0], :] = past_traj_scence_centric + past_traj_valid_mask[:past_traj_scence_centric.shape[0], :] = 1 + + if fut_traj_local.shape[0] > 0: + fut_steps = min(self.fut_steps, fut_traj_scence_centric.shape[0]) + past_traj[self.past_steps:self.past_steps+fut_steps, :] = fut_traj_scence_centric[:fut_steps] + past_traj_valid_mask[self.past_steps:self.past_steps+fut_steps, :] = 1 + + fut_traj_all.append(fut_traj) + fut_traj_valid_mask_all.append(fut_traj_valid_mask) + past_traj_all.append(past_traj) + past_traj_valid_mask_all.append(past_traj_valid_mask) + if len(ann_tokens) > 0: + fut_traj_all = np.stack(fut_traj_all, 
axis=0) + fut_traj_valid_mask_all = np.stack(fut_traj_valid_mask_all, axis=0) + past_traj_all = np.stack(past_traj_all, axis=0) + past_traj_valid_mask_all = np.stack(past_traj_valid_mask_all, axis=0) + else: + fut_traj_all = np.zeros((0, self.predict_steps, 2)) + fut_traj_valid_mask_all = np.zeros((0, self.predict_steps, 2)) + past_traj_all = np.zeros((0, self.predict_steps, 2)) + past_traj_valid_mask_all = np.zeros((0, self.predict_steps, 2)) + return fut_traj_all, fut_traj_valid_mask_all, past_traj_all, past_traj_valid_mask_all + + def get_vel_transform_mats(self, sample): + sd_rec = self.nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = self.nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = self.nusc.get('ego_pose', sd_rec['ego_pose_token']) + + l2e_r = cs_record['rotation'] + l2e_t = cs_record['translation'] + e2g_r = pose_record['rotation'] + e2g_t = pose_record['translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + return l2e_r_mat, e2g_r_mat + + def get_vel_and_time(self, sample): + lidar_token = sample['data']['LIDAR_TOP'] + lidar_top = self.nusc.get('sample_data', lidar_token) + pose = self.nusc.get('ego_pose', lidar_top['ego_pose_token']) + xyz = pose['translation'] + timestamp = sample['timestamp'] + return xyz, timestamp + + def prepare_sdc_vel_info(self): + # generate sdc velocity info for all samples + # Note that these velocity values are converted from + # global frame to lidar frame + # as aligned with bbox gts + + self.sdc_vel_info = {} + for scene in self.nusc.scene: + scene_token = scene['token'] + + # we cannot infer vel for the last sample, therefore we skip it + last_sample_token = scene['last_sample_token'] + sample_token = scene['first_sample_token'] + sample = self.nusc.get('sample', sample_token) + xyz, time = self.get_vel_and_time(sample) + while sample['token'] != last_sample_token: + next_sample_token = sample['next'] + next_sample = self.nusc.get('sample', next_sample_token) + next_xyz, next_time = self.get_vel_and_time(next_sample) + dc = np.array(next_xyz) - np.array(xyz) + dt = (next_time - time) / 1e6 + vel = dc/dt + + # global frame to lidar frame + l2e_r_mat, e2g_r_mat = self.get_vel_transform_mats(sample) + vel = vel @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( + l2e_r_mat).T + vel = vel[:2] + + self.sdc_vel_info[sample['token']] = vel + xyz, time = next_xyz, next_time + sample = next_sample + + # set first sample's vel equal to second sample's + last_sample = self.nusc.get('sample', last_sample_token) + second_last_sample_token = last_sample['prev'] + self.sdc_vel_info[last_sample_token] = self.sdc_vel_info[second_last_sample_token] + + def generate_sdc_info(self, sdc_vel, as_lidar_instance3d_box=False): + # sdc dim from https://forum.nuscenes.org/t/dimensions-of-the-ego-vehicle-used-to-gather-data/550 + psudo_sdc_bbox = np.array([0.0, 0.0, 0.0, 1.73, 4.08, 1.56, -np.pi]) + if self.with_velocity: + psudo_sdc_bbox = np.concatenate([psudo_sdc_bbox, sdc_vel], axis=-1) + gt_bboxes_3d = np.array([psudo_sdc_bbox]).astype(np.float32) + gt_names_3d = ['car'] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + 
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + if as_lidar_instance3d_box: + # if we do not want the batch the box in to DataContrainer + return gt_bboxes_3d + + gt_labels_3d = DC(to_tensor(gt_labels_3d)) + gt_bboxes_3d = DC(gt_bboxes_3d, cpu_only=True) + + return gt_bboxes_3d, gt_labels_3d + + def get_sdc_traj_label(self, sample_token): + sd_rec = self.nusc.get('sample', sample_token) + lidar_top_data_start = self.nusc.get('sample_data', sd_rec['data']['LIDAR_TOP']) + ego_pose_start = self.nusc.get('ego_pose', lidar_top_data_start['ego_pose_token']) + + sdc_fut_traj = [] + for _ in range(self.predict_steps): + next_annotation_token = sd_rec['next'] + if next_annotation_token=='': + break + sd_rec = self.nusc.get('sample', next_annotation_token) + lidar_top_data_next = self.nusc.get('sample_data', sd_rec['data']['LIDAR_TOP']) + ego_pose_next = self.nusc.get('ego_pose', lidar_top_data_next['ego_pose_token']) + sdc_fut_traj.append(ego_pose_next['translation'][:2]) # global xy pos of sdc at future step i + + sdc_fut_traj_all = np.zeros((1, self.predict_steps, 2)) + sdc_fut_traj_valid_mask_all = np.zeros((1, self.predict_steps, 2)) + n_valid_timestep = len(sdc_fut_traj) + if n_valid_timestep>0: + sdc_fut_traj = np.stack(sdc_fut_traj, axis=0) #(t,2) + sdc_fut_traj = convert_global_coords_to_local( + coordinates=sdc_fut_traj, + translation=ego_pose_start['translation'], + rotation=ego_pose_start['rotation'], + ) + sdc_fut_traj_all[:,:n_valid_timestep,:] = sdc_fut_traj + sdc_fut_traj_valid_mask_all[:,:n_valid_timestep,:] = 1 + + return sdc_fut_traj_all, sdc_fut_traj_valid_mask_all + + def get_l2g_transform(self, sample): + sd_rec = self.nusc.get('sample_data', sample['data']['LIDAR_TOP']) + cs_record = self.nusc.get('calibrated_sensor', + sd_rec['calibrated_sensor_token']) + pose_record = self.nusc.get('ego_pose', sd_rec['ego_pose_token']) + + l2e_r = cs_record['rotation'] + l2e_t = np.array(cs_record['translation']) + e2g_r = pose_record['rotation'] + e2g_t = np.array(pose_record['translation']) + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + return l2e_r_mat, l2e_t, e2g_r_mat, e2g_t + + def get_sdc_planning_label(self, sample_token): + sd_rec = self.nusc.get('sample', sample_token) + l2e_r_mat_init, l2e_t_init, e2g_r_mat_init, e2g_t_init = self.get_l2g_transform(sd_rec) + + + planning = [] + for _ in range(self.planning_steps): + next_annotation_token = sd_rec['next'] + if next_annotation_token=='': + break + sd_rec = self.nusc.get('sample', next_annotation_token) + l2e_r_mat_curr, l2e_t_curr, e2g_r_mat_curr, e2g_t_curr = self.get_l2g_transform(sd_rec) # (lidar to global at current frame) + + # bbox of sdc under current lidar frame + next_bbox3d = self.generate_sdc_info(self.sdc_vel_info[next_annotation_token], as_lidar_instance3d_box=True) + + # to bbox under curr ego frame + next_bbox3d.rotate(l2e_r_mat_curr.T) + next_bbox3d.translate(l2e_t_curr) + + # to bbox under world frame + next_bbox3d.rotate(e2g_r_mat_curr.T) + next_bbox3d.translate(e2g_t_curr) + + # to bbox under initial ego frame, first inverse translate, then inverse rotate + next_bbox3d.translate(- e2g_t_init) + m1 = np.linalg.inv(e2g_r_mat_init) + next_bbox3d.rotate(m1.T) + + # to bbox under curr ego frame, first inverse translate, then inverse rotate + next_bbox3d.translate(- l2e_t_init) + m2 = np.linalg.inv(l2e_r_mat_init) + next_bbox3d.rotate(m2.T) + + planning.append(next_bbox3d) + + planning_all = np.zeros((1, self.planning_steps, 3)) + planning_mask_all = 
np.zeros((1, self.planning_steps, 2)) + n_valid_timestep = len(planning) + if n_valid_timestep>0: + planning = [p.tensor.squeeze(0) for p in planning] + planning = np.stack(planning, axis=0) # (valid_t, 9) + planning = planning[:, [0,1,6]] # (x, y, yaw) + planning_all[:,:n_valid_timestep,:] = planning + planning_mask_all[:,:n_valid_timestep,:] = 1 + + mask = planning_mask_all[0].any(axis=1) + if mask.sum() == 0: + command = 2 #'FORWARD' + elif planning_all[0, mask][-1][0] >= 2: + command = 0 #'RIGHT' + elif planning_all[0, mask][-1][0] <= -2: + command = 1 #'LEFT' + else: + command = 2 #'FORWARD' + + return planning_all, planning_mask_all, command \ No newline at end of file diff --git a/mmcv/datasets/data_utils/vector_map.py b/mmcv/datasets/data_utils/vector_map.py new file mode 100644 index 0000000..1ea21a6 --- /dev/null +++ b/mmcv/datasets/data_utils/vector_map.py @@ -0,0 +1,246 @@ +import numpy as np +from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from shapely import affinity, ops +from shapely.geometry import LineString, box, MultiPolygon, MultiLineString + +CLASS2LABEL = { + 'road_divider': 0, + 'lane_divider': 0, + 'ped_crossing': 1, + 'contours': 2, + 'others': -1 +} + +class VectorizedLocalMap(object): + def __init__(self, + dataroot, + patch_size, + canvas_size, + line_classes=['road_divider', 'lane_divider'], + ped_crossing_classes=['ped_crossing'], + contour_classes=['road_segment', 'lane'], + sample_dist=1, + num_samples=250, + padding=False, + normalize=False, + fixed_num=-1): + ''' + Args: + fixed_num = -1 : no fixed num + ''' + super().__init__() + self.data_root = dataroot + self.MAPS = ['boston-seaport', 'singapore-hollandvillage', + 'singapore-onenorth', 'singapore-queenstown'] + self.line_classes = line_classes + self.ped_crossing_classes = ped_crossing_classes + self.polygon_classes = contour_classes + self.nusc_maps = {} + self.map_explorer = {} + for loc in self.MAPS: + self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc) + self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc]) + + self.patch_size = patch_size + self.canvas_size = canvas_size + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = padding + self.normalize = normalize + self.fixed_num = fixed_num + + def gen_vectorized_samples(self, location, ego2global_translation, ego2global_rotation): + map_pose = ego2global_translation[:2] + rotation = Quaternion(ego2global_rotation) + + patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + + line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location) + line_vector_dict = self.line_geoms_to_vectors(line_geom) + + ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location) + # ped_vector_list = self.ped_geoms_to_vectors(ped_geom) + ped_vector_list = self.line_geoms_to_vectors(ped_geom)['ped_crossing'] + + polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location) + poly_bound_list = self.poly_geoms_to_vectors(polygon_geom) + + vectors = [] + for line_type, vects in line_vector_dict.items(): + for line, length in vects: + vectors.append((line.astype(float), length, CLASS2LABEL.get(line_type, -1))) + + for ped_line, length in ped_vector_list: + vectors.append((ped_line.astype(float), length, CLASS2LABEL.get('ped_crossing', -1))) + + for contour, length 
in poly_bound_list: + vectors.append((contour.astype(float), length, CLASS2LABEL.get('contours', -1))) + + # filter out -1 + filtered_vectors = [] + for pts, pts_num, type in vectors: + if type != -1: + filtered_vectors.append({ + 'pts': pts, + 'pts_num': pts_num, + 'type': type + }) + + return filtered_vectors + + def get_map_geom(self, patch_box, patch_angle, layer_names, location): + map_geom = [] + for layer_name in layer_names: + if layer_name in self.line_classes: + geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.polygon_classes: + geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.ped_crossing_classes: + geoms = self.get_ped_crossing_line(patch_box, patch_angle, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + return map_geom + + def _one_type_line_geom_to_vectors(self, line_geom): + line_vectors = [] + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_vectors.append(self.sample_pts_from_line(single_line)) + elif line.geom_type == 'LineString': + line_vectors.append(self.sample_pts_from_line(line)) + else: + raise NotImplementedError + return line_vectors + + def poly_geoms_to_vectors(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def line_geoms_to_vectors(self, line_geom): + line_vectors_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines) + line_vectors_dict[line_type] = one_type_vectors + + return line_vectors_dict + + def ped_geoms_to_vectors(self, ped_geom): + ped_geom = ped_geom[0][1] + union_ped = ops.unary_union(ped_geom) + if union_ped.geom_type != 'MultiPolygon': + union_ped = MultiPolygon([union_ped]) + + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + results = [] + for ped_poly in union_ped: + # rect = ped_poly.minimum_rotated_rectangle + ext = ped_poly.exterior + if not ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def 
get_ped_crossing_line(self, patch_box, patch_angle, location): + def add_line(poly_xy, idx, patch, patch_angle, patch_x, patch_y, line_list): + points = [(p0, p1) for p0, p1 in zip(poly_xy[0, idx:idx + 2], poly_xy[1, idx:idx + 2])] + line = LineString(points) + line = line.intersection(patch) + if not line.is_empty: + line = affinity.rotate(line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) + line = affinity.affine_transform(line, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + line_list.append(line) + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = NuScenesMapExplorer.get_patch_coord(patch_box, patch_angle) + line_list = [] + records = getattr(self.nusc_maps[location], 'ped_crossing') + for record in records: + polygon = self.map_explorer[location].extract_polygon(record['polygon_token']) + poly_xy = np.array(polygon.exterior.xy) + dist = np.square(poly_xy[:, 1:] - poly_xy[:, :-1]).sum(0) + x1, x2 = np.argsort(dist)[-2:] + + add_line(poly_xy, x1, patch, patch_angle, patch_x, patch_y, line_list) + add_line(poly_xy, x2, patch, patch_angle, patch_x, patch_y, line_list) + + return line_list + + def sample_pts_from_line(self, line): + if self.fixed_num < 0: + distances = np.arange(0, line.length, self.sample_dist) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + else: + # fixed number of points, so distance is line.length / self.fixed_num + distances = np.linspace(0, line.length, self.fixed_num) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + + if self.normalize: + sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + + num_valid = len(sampled_points) + + if not self.padding or self.fixed_num > 0: + # fixed num sample can return now! + return sampled_points, num_valid + + # fixed distance sampling need padding! + num_valid = len(sampled_points) + + if self.fixed_num < 0: + if num_valid < self.num_samples: + padding = np.zeros((self.num_samples - len(sampled_points), 2)) + sampled_points = np.concatenate([sampled_points, padding], axis=0) + else: + sampled_points = sampled_points[:self.num_samples, :] + num_valid = self.num_samples + + if self.normalize: + sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + num_valid = len(sampled_points) + + return sampled_points, num_valid diff --git a/mmcv/datasets/dataset_wrappers.py b/mmcv/datasets/dataset_wrappers.py new file mode 100644 index 0000000..65921b7 --- /dev/null +++ b/mmcv/datasets/dataset_wrappers.py @@ -0,0 +1,353 @@ +import bisect +import math +from collections import defaultdict + +import numpy as np +from mmcv.utils import print_log +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + +from .builder import DATASETS +from .coco import CocoDataset + + +@DATASETS.register_module() +class ConcatDataset(_ConcatDataset): + """A wrapper of concatenated dataset. + + Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but + concat the group flag for image aspect ratio. + + Args: + datasets (list[:obj:`Dataset`]): A list of datasets. + separate_eval (bool): Whether to evaluate the results + separately if it is used as validation dataset. + Defaults to True. 
+ """ + + def __init__(self, datasets, separate_eval=True): + super(ConcatDataset, self).__init__(datasets) + self.CLASSES = datasets[0].CLASSES + self.separate_eval = separate_eval + if not separate_eval: + if any([isinstance(ds, CocoDataset) for ds in datasets]): + raise NotImplementedError( + 'Evaluating concatenated CocoDataset as a whole is not' + ' supported! Please set "separate_eval=True"') + elif len(set([type(ds) for ds in datasets])) != 1: + raise NotImplementedError( + 'All the datasets should have same types') + + if hasattr(datasets[0], 'flag'): + flags = [] + for i in range(0, len(datasets)): + flags.append(datasets[i].flag) + self.flag = np.concatenate(flags) + + def get_cat_ids(self, idx): + """Get category ids of concatenated dataset by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + if idx < 0: + if -idx > len(self): + raise ValueError( + 'absolute value of index should not exceed dataset length') + idx = len(self) + idx + dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) + if dataset_idx == 0: + sample_idx = idx + else: + sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] + return self.datasets[dataset_idx].get_cat_ids(sample_idx) + + def evaluate(self, results, logger=None, **kwargs): + """Evaluate the results. + + Args: + results (list[list | tuple]): Testing results of the dataset. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str: float]: AP results of the total dataset or each separate + dataset if `self.separate_eval=True`. + """ + assert len(results) == self.cumulative_sizes[-1], \ + ('Dataset and results have different sizes: ' + f'{self.cumulative_sizes[-1]} v.s. {len(results)}') + + # Check whether all the datasets support evaluation + for dataset in self.datasets: + assert hasattr(dataset, 'evaluate'), \ + f'{type(dataset)} does not implement evaluate function' + + if self.separate_eval: + dataset_idx = -1 + total_eval_results = dict() + for size, dataset in zip(self.cumulative_sizes, self.datasets): + start_idx = 0 if dataset_idx == -1 else \ + self.cumulative_sizes[dataset_idx] + end_idx = self.cumulative_sizes[dataset_idx + 1] + + results_per_dataset = results[start_idx:end_idx] + print_log( + f'\nEvaluateing {dataset.ann_file} with ' + f'{len(results_per_dataset)} images now', + logger=logger) + + eval_results_per_dataset = dataset.evaluate( + results_per_dataset, logger=logger, **kwargs) + dataset_idx += 1 + for k, v in eval_results_per_dataset.items(): + total_eval_results.update({f'{dataset_idx}_{k}': v}) + + return total_eval_results + elif any([isinstance(ds, CocoDataset) for ds in self.datasets]): + raise NotImplementedError( + 'Evaluating concatenated CocoDataset as a whole is not' + ' supported! Please set "separate_eval=True"') + elif len(set([type(ds) for ds in self.datasets])) != 1: + raise NotImplementedError( + 'All the datasets should have same types') + else: + original_data_infos = self.datasets[0].data_infos + self.datasets[0].data_infos = sum( + [dataset.data_infos for dataset in self.datasets], []) + eval_results = self.datasets[0].evaluate( + results, logger=logger, **kwargs) + self.datasets[0].data_infos = original_data_infos + return eval_results + + +@DATASETS.register_module() +class RepeatDataset: + """A wrapper of repeated dataset. + + The length of repeated dataset will be `times` larger than the original + dataset. 
This is useful when the data loading time is long but the dataset + is small. Using RepeatDataset can reduce the data loading time between + epochs. + + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + times (int): Repeat times. + """ + + def __init__(self, dataset, times): + self.dataset = dataset + self.times = times + self.CLASSES = dataset.CLASSES + if hasattr(self.dataset, 'flag'): + self.flag = np.tile(self.dataset.flag, times) + + self._ori_len = len(self.dataset) + + def __getitem__(self, idx): + return self.dataset[idx % self._ori_len] + + def get_cat_ids(self, idx): + """Get category ids of repeat dataset by index. + + Args: + idx (int): Index of data. + + Returns: + list[int]: All categories in the image of specified index. + """ + + return self.dataset.get_cat_ids(idx % self._ori_len) + + def __len__(self): + """Length after repetition.""" + return self.times * self._ori_len + + +# Modified from https://github.com/facebookresearch/detectron2/blob/41d475b75a230221e21d9cac5d69655e3415e3a4/detectron2/data/samplers/distributed_sampler.py#L57 # noqa +@DATASETS.register_module() +class ClassBalancedDataset: + """A wrapper of repeated dataset with repeat factor. + + Suitable for training on class imbalanced datasets like LVIS. Following + the sampling strategy in the `paper `_, + in each epoch, an image may appear multiple times based on its + "repeat factor". + The repeat factor for an image is a function of the frequency the rarest + category labeled in that image. The "frequency of category c" in [0, 1] + is defined by the fraction of images in the training set (without repeats) + in which category c appears. + The dataset needs to instantiate :func:`self.get_cat_ids` to support + ClassBalancedDataset. + + The repeat factor is computed as followed. + + 1. For each category c, compute the fraction # of images + that contain it: :math:`f(c)` + 2. For each category c, compute the category-level repeat factor: + :math:`r(c) = max(1, sqrt(t/f(c)))` + 3. For each image I, compute the image-level repeat factor: + :math:`r(I) = max_{c in I} r(c)` + + Args: + dataset (:obj:`CustomDataset`): The dataset to be repeated. + oversample_thr (float): frequency threshold below which data is + repeated. For categories with ``f_c >= oversample_thr``, there is + no oversampling. For categories with ``f_c < oversample_thr``, the + degree of oversampling following the square-root inverse frequency + heuristic above. + filter_empty_gt (bool, optional): If set true, images without bounding + boxes will not be oversampled. Otherwise, they will be categorized + as the pure background class and involved into the oversampling. + Default: True. + """ + + def __init__(self, dataset, oversample_thr, filter_empty_gt=True): + self.dataset = dataset + self.oversample_thr = oversample_thr + self.filter_empty_gt = filter_empty_gt + self.CLASSES = dataset.CLASSES + + repeat_factors = self._get_repeat_factors(dataset, oversample_thr) + repeat_indices = [] + for dataset_idx, repeat_factor in enumerate(repeat_factors): + repeat_indices.extend([dataset_idx] * math.ceil(repeat_factor)) + self.repeat_indices = repeat_indices + + flags = [] + if hasattr(self.dataset, 'flag'): + for flag, repeat_factor in zip(self.dataset.flag, repeat_factors): + flags.extend([flag] * int(math.ceil(repeat_factor))) + assert len(flags) == len(repeat_indices) + self.flag = np.asarray(flags, dtype=np.uint8) + + def _get_repeat_factors(self, dataset, repeat_thr): + """Get repeat factor for each images in the dataset. 
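+
+        As an illustrative example: with repeat_thr t = 0.01 and a category
+        found in 0.25% of the images (f(c) = 0.0025), the category-level factor
+        is r(c) = max(1, sqrt(0.01 / 0.0025)) = 2, so an image whose rarest
+        labeled category is c is drawn roughly twice per epoch (per-image
+        factors are rounded up with math.ceil when the index list is built).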
+ + Args: + dataset (:obj:`CustomDataset`): The dataset + repeat_thr (float): The threshold of frequency. If an image + contains the categories whose frequency below the threshold, + it would be repeated. + + Returns: + list[float]: The repeat factors for each images in the dataset. + """ + + # 1. For each category c, compute the fraction # of images + # that contain it: f(c) + category_freq = defaultdict(int) + num_images = len(dataset) + for idx in range(num_images): + cat_ids = set(self.dataset.get_cat_ids(idx)) + if len(cat_ids) == 0 and not self.filter_empty_gt: + cat_ids = set([len(self.CLASSES)]) + for cat_id in cat_ids: + category_freq[cat_id] += 1 + for k, v in category_freq.items(): + category_freq[k] = v / num_images + + # 2. For each category c, compute the category-level repeat factor: + # r(c) = max(1, sqrt(t/f(c))) + category_repeat = { + cat_id: max(1.0, math.sqrt(repeat_thr / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } + + # 3. For each image I, compute the image-level repeat factor: + # r(I) = max_{c in I} r(c) + repeat_factors = [] + for idx in range(num_images): + cat_ids = set(self.dataset.get_cat_ids(idx)) + if len(cat_ids) == 0 and not self.filter_empty_gt: + cat_ids = set([len(self.CLASSES)]) + repeat_factor = 1 + if len(cat_ids) > 0: + repeat_factor = max( + {category_repeat[cat_id] + for cat_id in cat_ids}) + repeat_factors.append(repeat_factor) + + return repeat_factors + + def __getitem__(self, idx): + ori_index = self.repeat_indices[idx] + return self.dataset[ori_index] + + def __len__(self): + """Length after repetition.""" + return len(self.repeat_indices) + +@DATASETS.register_module() +class CBGSDataset(object): + """A wrapper of class sampled dataset with ann_file path. Implementation of + paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object + Detection `_. + + Balance the number of scenes under different classes. + + Args: + dataset (:obj:`CustomDataset`): The dataset to be class sampled. + """ + + def __init__(self, dataset): + self.dataset = dataset + self.CLASSES = dataset.CLASSES + self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} + self.sample_indices = self._get_sample_indices() + # self.dataset.data_infos = self.data_infos + if hasattr(self.dataset, 'flag'): + self.flag = np.array( + [self.dataset.flag[ind] for ind in self.sample_indices], + dtype=np.uint8) + + def _get_sample_indices(self): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations after class sampling. + """ + class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()} + for idx in range(len(self.dataset)): + sample_cat_ids = self.dataset.get_cat_ids(idx) + for cat_id in sample_cat_ids: + class_sample_idxs[cat_id].append(idx) + duplicated_samples = sum( + [len(v) for _, v in class_sample_idxs.items()]) + class_distribution = { + k: len(v) / duplicated_samples + for k, v in class_sample_idxs.items() + } + + sample_indices = [] + + frac = 1.0 / len(self.CLASSES) + ratios = [frac / v for v in class_distribution.values()] + for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios): + sample_indices += np.random.choice(cls_inds, + int(len(cls_inds) * + ratio)).tolist() + return sample_indices + + def __getitem__(self, idx): + """Get item from infos according to the given index. + + Returns: + dict: Data dictionary of the corresponding index. 
+ """ + ori_idx = self.sample_indices[idx] + return self.dataset[ori_idx] + + def __len__(self): + """Return the length of data infos. + + Returns: + int: Length of data infos. + """ + return len(self.sample_indices) diff --git a/mmcv/datasets/dd3d_nuscenes_dataset.py b/mmcv/datasets/dd3d_nuscenes_dataset.py new file mode 100644 index 0000000..6c77617 --- /dev/null +++ b/mmcv/datasets/dd3d_nuscenes_dataset.py @@ -0,0 +1,359 @@ +# Copyright 2021 Toyota Research Institute. All rights reserved. +#import functools +from collections import OrderedDict + +import numpy as np +import seaborn as sns +from torch.utils.data import Dataset +from tqdm import tqdm + +from mmcv.structures import BoxMode +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.splits import create_splits_scenes + +#from tridet.data import collect_dataset_dicts +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D +from adzoo.bevformer.mmdet3d_plugin.dd3d.structures.pose import Pose +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.geometry import project_points3d +from adzoo.bevformer.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color + +# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189 +# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes). +# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes). +# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and +# tracker if required +DATASET_NAME_TO_VERSION = { + "nusc_train": "v1.0-trainval", + "nusc_val": "v1.0-trainval", + "nusc_val-subsample-8": "v1.0-trainval", + "nusc_trainval": "v1.0-trainval", + "nusc_test": "v1.0-test", + "nusc_mini_train": "v1.0-mini", + "nusc_mini_val": "v1.0-mini", +} + +CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT') + +ATTRIBUTE_IDS = { + 'vehicle.moving': 0, + 'vehicle.parked': 1, + 'vehicle.stopped': 2, + 'pedestrian.moving': 0, + 'pedestrian.standing': 1, + 'pedestrian.sitting_lying_down': 2, + 'cycle.with_rider': 0, + 'cycle.without_rider': 1, +} + +CATEGORY_IDS = OrderedDict({ + 'barrier': 0, + 'bicycle': 1, + 'bus': 2, + 'car': 3, + 'construction_vehicle': 4, + 'motorcycle': 5, + 'pedestrian': 6, + 'traffic_cone': 7, + 'trailer': 8, + 'truck': 9, +}) + +COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)] +COLORMAP = OrderedDict({ + 'barrier': COLORS[8], # yellow + 'bicycle': COLORS[0], # blue + 'bus': COLORS[6], # pink + 'car': COLORS[2], # green + 'construction_vehicle': COLORS[7], # gray + 'motorcycle': COLORS[4], # purple + 'pedestrian': COLORS[1], # orange + 'traffic_cone': COLORS[3], # red + 'trailer': COLORS[9], # skyblue + 'truck': COLORS[5], # brown +}) + +MAX_NUM_ATTRIBUTES = 3 + + +def _compute_iou(box1, box2): + """ + Parameters + ---------- + box1, box2: + (x1, y1, x2, y2) + """ + xx1 = max(box1[0], box2[0]) + yy1 = max(box1[1], box2[1]) + xx2 = min(box1[2], box2[2]) + yy2 = min(box1[3], box2[3]) + if xx1 >= xx2 or yy1 >= yy2: + return 0. 
+ inter = (xx2 - xx1) * (yy2 - yy1) + a1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + a2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) + return inter / (a1 + a2 - inter) + + +class DD3DNuscenesDataset(Dataset): + def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused): + self.data_root = data_root + assert name in DATASET_NAME_TO_VERSION + version = DATASET_NAME_TO_VERSION[name] + self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True) + + self.datum_names = datum_names + self.min_num_lidar_points = min_num_lidar_points + self.min_box_visibility = min_box_visibility + + self.dataset_item_info = self._build_dataset_item_info(name) + + # Index instance tokens to their IDs + self._instance_token_to_id = self._index_instance_tokens() + + # Construct the mapping from datum_token (image id) to index + print("Generating the mapping from image id to idx...") + self.datumtoken2idx = {} + for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info): + self.datumtoken2idx[datum_token] = idx + print("Done.") + + def _build_dataset_item_info(self, name): + scenes_in_split = self._get_split_scenes(name) + + dataset_items = [] + for _, scene_token in tqdm(scenes_in_split): + scene = self.nusc.get('scene', scene_token) + sample_token = scene['first_sample_token'] + for sample_idx in range(scene['nbr_samples']): + if name.endswith('subsample-8') and sample_idx % 8 > 0: + # Sample-level subsampling. + continue + + sample = self.nusc.get('sample', sample_token) + for datum_name, datum_token in sample['data'].items(): + if datum_name not in self.datum_names: + continue + dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name)) + sample_token = sample['next'] + return dataset_items + + def _get_split_scenes(self, name): + scenes_in_splits = create_splits_scenes() + if name == "nusc_trainval": + scenes = scenes_in_splits["train"] + scenes_in_splits["val"] + elif name == "nusc_val-subsample-8": + scenes = scenes_in_splits["val"] + else: + assert name.startswith('nusc_'), f"Invalid dataset name: {name}" + split = name[5:] + assert split in scenes_in_splits, f"Invalid dataset: {split}" + scenes = scenes_in_splits[split] + + # Mapping from scene name to token. 
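+        # The (name, token) pairs returned below let _build_dataset_item_info
+        # look up each scene record by token and walk it sample-by-sample via
+        # the 'first_sample_token' / 'next' links.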
+ name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene} + return [(name, name_to_token[name]) for name in scenes] + + def __len__(self): + return len(self.dataset_item_info) + + def _build_id(self, scene_name, sample_idx, datum_name): + sample_id = f"{scene_name}_{sample_idx:03d}" + image_id = f"{sample_id}_{datum_name}" + return image_id, sample_id + + def _index_instance_tokens(self): + """Index instance tokens for uniquely identifying instances across samples""" + instance_token_to_id = {} + for record in self.nusc.sample_annotation: + instance_token = record['instance_token'] + if instance_token not in instance_token_to_id: + next_instance_id = len(instance_token_to_id) + instance_token_to_id[instance_token] = next_instance_id + return instance_token_to_id + + def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS): + annotations = [] + for _ann in annotation_list: + ann = self.nusc.get('sample_annotation', _ann.token) + if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points: + continue + annotation = OrderedDict() + + # -------- + # Category + # -------- + category = category_to_detection_name(ann['category_name']) + if category is None: + continue + annotation['category_id'] = CATEGORY_IDS[category] + + # ------ + # 3D box + # ------ + # NOTE: ann['rotation'], ann['translation'] is in global frame. + pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center) # pose in sensor frame + # DEBUG: + # pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation'])) + # pose_WO_2 = pose_WS * pose_SO + # assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix) + bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh) + annotation['bbox3d'] = bbox3d.vectorize().tolist()[0] + + # -------------------------------------- + # 2D box -- project 8 corners of 3D bbox + # -------------------------------------- + corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K) + l, t = corners[:, 0].min(), corners[:, 1].min() + r, b = corners[:, 0].max(), corners[:, 1].max() + + x1 = max(0, l) + y1 = max(0, t) + x2 = min(image_shape[1], r) + y2 = min(image_shape[0], b) + + iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2]) + if iou < self.min_box_visibility: + continue + + annotation['bbox'] = [x1, y1, x2, y2] + annotation['bbox_mode'] = BoxMode.XYXY_ABS + + # -------- + # Track ID + # -------- + annotation['track_id'] = self._instance_token_to_id[ann['instance_token']] + + # --------- + # Attribute + # --------- + attr_tokens = ann['attribute_tokens'] + assert len(attr_tokens) < 2 # NOTE: Allow only single attrubute. + attribute_id = MAX_NUM_ATTRIBUTES # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute. + if attr_tokens: + attribute = self.nusc.get('attribute', attr_tokens[0])['name'] + attribute_id = ATTRIBUTE_IDS[attribute] + annotation['attribute_id'] = attribute_id + + # ----- + # Speed + # ----- + vel_global = self.nusc.box_velocity(ann['token']) + speed = np.linalg.norm(vel_global) # NOTE: This can be NaN. + # DEBUG: + # speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global + annotation['speed'] = speed + + annotations.append(annotation) + + return annotations + + def _get_ego_velocity(self, current, max_time_diff=1.5): + """Velocity of ego-vehicle in m/s. + """ + has_prev = current['prev'] != '' + has_next = current['next'] != '' + + # Cannot estimate velocity for a single annotation. 
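+        # The estimate below is a finite difference of ego poses: it uses the
+        # previous and/or next sample_data record, preferring a central
+        # difference when both neighbours exist (which is why max_time_diff is
+        # doubled in that case), and returns NaNs when no neighbour is available
+        # or the time gap is too large to give a meaningful velocity.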
+ if not has_prev and not has_next: + return np.array([np.nan, np.nan, np.nan]) + + if has_prev: + first = self.nusc.get('sample_data', current['prev']) + else: + first = current + + if has_next: + last = self.nusc.get('sample_data', current['next']) + else: + last = current + + pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation'] + pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation'] + pos_diff = np.float32(pos_last) - np.float32(pos_first) + + time_last = 1e-6 * last['timestamp'] + time_first = 1e-6 * first['timestamp'] + time_diff = time_last - time_first + + if has_next and has_prev: + # If doing centered difference, allow for up to double the max_time_diff. + max_time_diff *= 2 + + if time_diff > max_time_diff: + # If time_diff is too big, don't return an estimate. + return np.array([np.nan, np.nan, np.nan]) + else: + return pos_diff / time_diff + + def __getitem__(self, idx): + datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx] + datum = self.nusc.get('sample_data', datum_token) + assert datum['is_key_frame'] + + filename, _annotations, K = self.nusc.get_sample_data(datum_token) + image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name) + height, width = datum['height'], datum['width'] + d2_dict = OrderedDict( + file_name=filename, + height=height, + width=width, + image_id=image_id, + sample_id=sample_id, + sample_token=sample_token + ) + + # Intrinsics + d2_dict['intrinsics'] = list(K.flatten()) + + # Get pose of the sensor (S) from vehicle (V) frame + _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token']) + pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation'])) + + # Get ego-pose of the vehicle (V) from global/world (W) frame + _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token']) + pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation'])) + pose_WS = pose_WV * pose_VS + + d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)} + d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)} + + d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum)) + + d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS) + + return d2_dict + + def getitem_by_datumtoken(self, datum_token): + # idx = self.datumtoken2idx[datum_token] + # ret = self.__getitem__(idx) + + datum = self.nusc.get('sample_data', datum_token) + sample_token = datum['sample_token'] + filename, _annotations, K = self.nusc.get_sample_data(datum_token) + height, width = datum['height'], datum['width'] + d2_dict = OrderedDict( + file_name=filename, + height=height, + width=width, + image_id=0, + sample_id=0, + sample_token=sample_token + ) + # Intrinsics + d2_dict['intrinsics'] = list(K.flatten()) + # Get pose of the sensor (S) from vehicle (V) frame + _pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token']) + pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation'])) + # Get ego-pose of the vehicle (V) from global/world (W) frame + _pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token']) + pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation'])) + pose_WS = pose_WV * pose_VS + + d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)} + d2_dict['extrinsics'] = {'wxyz': 
list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)} + + d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum)) + + d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS) + return d2_dict \ No newline at end of file diff --git a/mmcv/datasets/eval_utils/eval_utils.py b/mmcv/datasets/eval_utils/eval_utils.py new file mode 100644 index 0000000..9d56923 --- /dev/null +++ b/mmcv/datasets/eval_utils/eval_utils.py @@ -0,0 +1,911 @@ +import json +import torch +import tqdm +from typing import List, Dict, Tuple, Callable, Union +from nuscenes import NuScenes +from pyquaternion import Quaternion +import numpy as np +from .metric_utils import min_ade, min_fde, miss_rate + +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.prediction import PredictHelper, convert_local_coords_to_global +from nuscenes.eval.common.data_classes import EvalBox, EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.data_classes import DetectionMetricData, DetectionMetricDataList, DetectionMetrics +from nuscenes.eval.common.utils import center_distance, scale_iou, yaw_diff, velocity_l2, attr_acc, cummean + +def category_to_motion_name(category_name: str): + """ + Default label mapping from nuScenes to nuScenes detection classes. + Note that pedestrian does not include personal_mobility, stroller and wheelchair. + :param category_name: Generic nuScenes class. + :return: nuScenes detection class. + """ + detection_mapping = { + 'movable_object.barrier': 'barrier', + 'vehicle.bicycle': 'car', + 'vehicle.bus.bendy': 'car', + 'vehicle.bus.rigid': 'car', + 'vehicle.car': 'car', + 'vehicle.construction': 'car', + 'vehicle.motorcycle': 'car', + 'human.pedestrian.adult': 'pedestrian', + 'human.pedestrian.child': 'pedestrian', + 'human.pedestrian.construction_worker': 'pedestrian', + 'human.pedestrian.police_officer': 'pedestrian', + 'movable_object.trafficcone': 'barrier', + 'vehicle.trailer': 'car', + 'vehicle.truck': 'car' + } + + if category_name in detection_mapping: + return detection_mapping[category_name] + else: + return None + +def detection_prediction_category_to_motion_name(category_name: str): + """ + Default label mapping from nuScenes to nuScenes detection classes. + Note that pedestrian does not include personal_mobility, stroller and wheelchair. + :param category_name: Generic nuScenes class. + :return: nuScenes detection class. + """ + detection_mapping = { + 'car': 'car', + 'truck': 'car', + 'construction_vehicle': 'car', + 'bus': 'car', + 'trailer': 'car', + 'motorcycle': 'car', + 'bicycle': 'car', + 'pedestrian': 'pedestrian', + 'traffic_cone': 'barrier', + 'barrier': 'barrier', + } + + if category_name in detection_mapping: + return detection_mapping[category_name] + else: + return None + +class DetectionMotionMetrics(DetectionMetrics): + """ Stores average precision and true positive metric results. Provides properties to summarize. """ + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. 
""" + + cfg = DetectionConfig.deserialize(content['cfg']) + metrics = cls(cfg=cfg) + metrics.add_runtime(content['eval_time']) + + for detection_name, label_aps in content['label_aps'].items(): + for dist_th, ap in label_aps.items(): + metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap)) + + for detection_name, label_tps in content['label_tp_errors'].items(): + for metric_name, tp in label_tps.items(): + metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp)) + + return metrics + +class DetectionMotionMetricDataList(DetectionMetricDataList): + """ This stores a set of MetricData in a dict indexed by (name, match-distance). """ + @classmethod + def deserialize(cls, content: dict): + mdl = cls() + for key, md in content.items(): + name, distance = key.split(':') + mdl.set(name, float(distance), DetectionMotionMetricData.deserialize(md)) + return mdl + +class DetectionMotionMetricData(DetectionMetricData): + """ This class holds accumulated and interpolated data required to calculate the detection metrics. """ + + nelem = 101 + + def __init__(self, + recall: np.array, + precision: np.array, + confidence: np.array, + trans_err: np.array, + vel_err: np.array, + scale_err: np.array, + orient_err: np.array, + attr_err: np.array, + min_ade_err: np.array, + min_fde_err: np.array, + miss_rate_err: np.array): + + # Assert lengths. + assert len(recall) == self.nelem + assert len(precision) == self.nelem + assert len(confidence) == self.nelem + assert len(trans_err) == self.nelem + assert len(vel_err) == self.nelem + assert len(scale_err) == self.nelem + assert len(orient_err) == self.nelem + assert len(attr_err) == self.nelem + assert len(min_ade_err) == self.nelem + assert len(min_fde_err) == self.nelem + assert len(miss_rate_err) == self.nelem + + # Assert ordering. + assert all(confidence == sorted(confidence, reverse=True)) # Confidences should be descending. + assert all(recall == sorted(recall)) # Recalls should be ascending. + + # Set attributes explicitly to help IDEs figure out what is going on. + self.recall = recall + self.precision = precision + self.confidence = confidence + self.trans_err = trans_err + self.vel_err = vel_err + self.scale_err = scale_err + self.orient_err = orient_err + self.attr_err = attr_err + self.min_ade_err = min_ade_err + self.min_fde_err = min_fde_err + self.miss_rate_err = miss_rate_err + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + @property + def max_recall_ind(self): + """ Returns index of max recall achieved. """ + + # Last instance of confidence > 0 is index of max achieved recall. + non_zero = np.nonzero(self.confidence)[0] + if len(non_zero) == 0: # If there are no matches, all the confidence values will be zero. + max_recall_ind = 0 + else: + max_recall_ind = non_zero[-1] + + return max_recall_ind + + @property + def max_recall(self): + """ Returns max recall achieved. """ + + return self.recall[self.max_recall_ind] + + def serialize(self): + """ Serialize instance into json-friendly format. 
""" + return { + 'recall': self.recall.tolist(), + 'precision': self.precision.tolist(), + 'confidence': self.confidence.tolist(), + 'trans_err': self.trans_err.tolist(), + 'vel_err': self.vel_err.tolist(), + 'scale_err': self.scale_err.tolist(), + 'orient_err': self.orient_err.tolist(), + 'attr_err': self.attr_err.tolist(), + 'min_ade_err': self.min_ade_err.tolist(), + 'min_fde_err': self.min_fde_err.tolist(), + 'miss_rate_err': self.miss_rate_err.tolist(), + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(recall=np.array(content['recall']), + precision=np.array(content['precision']), + confidence=np.array(content['confidence']), + trans_err=np.array(content['trans_err']), + vel_err=np.array(content['vel_err']), + scale_err=np.array(content['scale_err']), + orient_err=np.array(content['orient_err']), + attr_err=np.array(content['attr_err']), + min_ade_err=np.array(content['min_ade_err']), + min_fde_err=np.array(content['min_fde_err']), + miss_rate_err=np.array(content['miss_rate_err'])) + + @classmethod + def no_predictions(cls): + """ Returns a md instance corresponding to having no predictions. """ + return cls(recall=np.linspace(0, 1, cls.nelem), + precision=np.zeros(cls.nelem), + confidence=np.zeros(cls.nelem), + trans_err=np.ones(cls.nelem), + vel_err=np.ones(cls.nelem), + scale_err=np.ones(cls.nelem), + orient_err=np.ones(cls.nelem), + attr_err=np.ones(cls.nelem), + min_ade_err=np.ones(cls.nelem), + min_fde_err=np.ones(cls.nelem), + miss_rate_err=np.ones(cls.nelem)) + + @classmethod + def random_md(cls): + """ Returns an md instance corresponding to a random results. """ + return cls(recall=np.linspace(0, 1, cls.nelem), + precision=np.random.random(cls.nelem), + confidence=np.linspace(0, 1, cls.nelem)[::-1], + trans_err=np.random.random(cls.nelem), + vel_err=np.random.random(cls.nelem), + scale_err=np.random.random(cls.nelem), + orient_err=np.random.random(cls.nelem), + attr_err=np.random.random(cls.nelem), + min_ade_err=np.random.random(cls.nelem), + min_fde_err=np.random.random(cls.nelem), + miss_rate_err=np.random.random(cls.nelem)) + + +class DetectionMotionBox(DetectionBox): + def __init__(self, + sample_token: str = "", + translation: Tuple[float, float, float] = (0, 0, 0), + size: Tuple[float, float, float] = (0, 0, 0), + rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), + velocity: Tuple[float, float] = (0, 0), + ego_translation: [float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. + num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. + detection_name: str = 'car', # The class name used in the detection challenge. + detection_score: float = -1.0, # GT samples do not have a score. + attribute_name: str = '', + traj=None, + traj_scores=None): # Box attribute. Each box can have at most 1 attribute. + super(DetectionBox, self).__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts) + assert detection_name is not None, 'Error: detection_name cannot be empty!' + # assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name + + # assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ + # 'Error: Unknown attribute_name %s' % attribute_name + + assert type(detection_score) == float, 'Error: detection_score must be a float!' + assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' + + # Assign. 
+ self.detection_name = detection_name + self.attribute_name = attribute_name + self.detection_score = detection_score + self.traj = traj + self.traj_scores = traj_scores + self.traj_index = None + + def __eq__(self, other): + return (self.sample_token == other.sample_token and + self.translation == other.translation and + self.size == other.size and + self.rotation == other.rotation and + self.velocity == other.velocity and + self.ego_translation == other.ego_translation and + self.num_pts == other.num_pts and + self.detection_name == other.detection_name and + self.detection_score == other.detection_score and + self.attribute_name == other.attribute_name and + np.all(self.traj == other.traj) and + np.all(self.traj_scores == other.traj_scores)) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'traj': self.traj, + 'traj_scores': self.traj_scores + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + traj=content['predict_traj'], + traj_scores=content['predict_traj_score']) + +class DetectionMotionBox_modified(DetectionMotionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index, + 'traj': self.traj, + 'traj_scores': self.traj_scores + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. 
""" + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + traj=content['traj'], + ) + + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False, category_convert_type='detection_category') \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox, DetectionMotionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. + with open(result_path) as f: + data = json.load(f) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + if category_convert_type == 'motion_category': + for key in data['results'].keys(): + for i in range(len(data['results'][key])): + data['results'][key][i]['detection_name'] = detection_prediction_category_to_motion_name(data['results'][key][i]['detection_name']) + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False, category_convert_type='detection_category'): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + predict_helper = PredictHelper(nusc) + # Init. + if box_cls == DetectionMotionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. 
+ version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionMotionBox_modified: + # Get label name in detection task and filter unused labels. + if category_convert_type == 'detection_category': + detection_name = category_to_detection_name(sample_annotation['category_name']) + elif category_convert_type == 'motion_category': + detection_name = category_to_motion_name(sample_annotation['category_name']) + else: + raise NotImplementedError + if detection_name is None: + continue + # Get attribute_name. 
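+                # Besides the attribute handled below, the 6 s ground-truth future trajectory of this agent
+                # is queried via PredictHelper in the agent frame and mapped out of it using the annotation's
+                # box pose taken from the LIDAR_TOP sample data; it is stored as the box's `traj`.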
+ attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + instance_token = nusc.get('sample_annotation', sample_annotation['token'])['instance_token'] + fut_traj_local = predict_helper.get_future_for_agent(instance_token, sample_token, seconds=6, in_agent_frame=True) + fut_traj_scence_centric = np.zeros((0,)) + if fut_traj_local.shape[0] > 0: + _, boxes, _ = nusc.get_sample_data(sample['data']['LIDAR_TOP'], selected_anntokens=[sample_annotation['token']]) + box = boxes[0] + trans = box.center + rot = Quaternion(matrix=box.rotation_matrix) + fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot) + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token], + traj=fut_traj_scence_centric, + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + +def prediction_metrics(gt_box_match, pred_box): + pred_traj = np.array(pred_box.traj) + gt_traj_steps = gt_box_match.traj.reshape((-1, 2)) + valid_steps = gt_traj_steps.shape[0] + if valid_steps <= 0: + return np.array([0]), np.array([0]), 0 + nmodes = pred_traj.shape[0] + pred_steps = pred_traj.shape[1] + valid_mask = np.zeros((pred_steps, )) + gt_traj = np.zeros((pred_steps, 2)) + gt_traj[:valid_steps, :] = gt_traj_steps + valid_mask[: valid_steps] = 1 + pred_traj = torch.tensor(pred_traj[None]) + gt_traj = torch.tensor(gt_traj[None]) + valid_mask = torch.tensor(valid_mask[None]) + ade_err, inds = min_ade(pred_traj, gt_traj, 1 - valid_mask) + fde_err, inds = min_fde(pred_traj, gt_traj, 1 - valid_mask) + mr_err = miss_rate(pred_traj, gt_traj, 1 - valid_mask, dist_thresh=2) + return ade_err.numpy(), fde_err.numpy(), mr_err.numpy() + + +def accumulate(gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + class_name: str, + dist_fcn: Callable, + dist_th: float, + verbose: bool = False) -> DetectionMotionMetricData: + """ + Average Precision over predefined different recall thresholds for a single distance threshold. + The recall/conf thresholds and other raw metrics will be used in secondary metrics. + :param gt_boxes: Maps every sample_token to a list of its sample_annotations. + :param pred_boxes: Maps every sample_token to a list of its sample_results. + :param class_name: Class to compute AP on. + :param dist_fcn: Distance function used to match detections and ground truths. + :param dist_th: Distance threshold for a match. + :param verbose: If true, print debug messages. + :return: (average_prec, metrics). The average precision value and raw data for a number of metrics. 
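+    Note: this variant returns a tuple (metric_data, N_tp, N_fp, npos) rather than the metric data alone.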
+ """ + # --------------------------------------------- + # Organize input and initialize accumulators. + # --------------------------------------------- + + # Count the positives. + npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name]) + if verbose: + print("Found {} GT of class {} out of {} total across {} samples.". + format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens))) + + # For missing classes in the GT, return a data structure corresponding to no predictions. + if npos == 0: + return DetectionMotionMetricData.no_predictions(), 0, 0, 0 + + # Organize the predictions in a single list. + pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name] + pred_confs = [box.detection_score for box in pred_boxes_list] + + if verbose: + print("Found {} PRED of class {} out of {} total across {} samples.". + format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens))) + + # Sort by confidence. + sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1] + + # Do the actual matching. + tp = [] # Accumulator of true positives. + fp = [] # Accumulator of false positives. + conf = [] # Accumulator of confidences. + + # match_data holds the extra metrics we calculate for each match. + match_data = {'trans_err': [], + 'vel_err': [], + 'scale_err': [], + 'orient_err': [], + 'attr_err': [], + 'conf': [], + 'min_ade_err': [], + 'min_fde_err': [], + 'miss_rate_err': []} + + # --------------------------------------------- + # Match and accumulate match data. + # --------------------------------------------- + + taken = set() # Initially no gt bounding box is matched. + for ind in sortind: + pred_box = pred_boxes_list[ind] + min_dist = np.inf + match_gt_idx = None + + for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]): + + # Find closest match among ground truth boxes + if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken: + this_distance = dist_fcn(gt_box, pred_box) + if this_distance < min_dist: + min_dist = this_distance + match_gt_idx = gt_idx + + # If the closest match is close enough according to threshold we have a match! + is_match = min_dist < dist_th + + if is_match: + taken.add((pred_box.sample_token, match_gt_idx)) + + # Update tp, fp and confs. + tp.append(1) + fp.append(0) + conf.append(pred_box.detection_score) + + # Since it is a match, update match data also. + gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx] + + match_data['trans_err'].append(center_distance(gt_box_match, pred_box)) + match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box)) + match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box)) + + # Barrier orientation is only determined up to 180 degree. (For cones orientation is discarded later) + period = np.pi if class_name == 'barrier' else 2 * np.pi + match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period)) + + match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box)) + minade, minfde, m_r = prediction_metrics(gt_box_match, pred_box) + + match_data['min_ade_err'].append(minade) + match_data['min_fde_err'].append(minfde) + match_data['miss_rate_err'].append(m_r) + match_data['conf'].append(pred_box.detection_score) + + else: + # No match. Mark this as a false positive. + tp.append(0) + fp.append(1) + conf.append(pred_box.detection_score) + + # Check if we have any matches. If not, just return a "no predictions" array. 
+ if len(match_data['trans_err']) == 0: + return DetectionMotionMetricData.no_predictions(), 0, 0, 0 + + # --------------------------------------------- + # Calculate and interpolate precision and recall + # --------------------------------------------- + + # Accumulate. + N_tp = np.sum(tp) + N_fp = np.sum(fp) + tp = np.cumsum(tp).astype(float) + fp = np.cumsum(fp).astype(float) + conf = np.array(conf) + + + # Calculate precision and recall. + prec = tp / (fp + tp) + rec = tp / float(npos) + + rec_interp = np.linspace(0, 1, DetectionMotionMetricData.nelem) # 101 steps, from 0% to 100% recall. + prec = np.interp(rec_interp, rec, prec, right=0) + conf = np.interp(rec_interp, rec, conf, right=0) + rec = rec_interp + + # --------------------------------------------- + # Re-sample the match-data to match, prec, recall and conf. + # --------------------------------------------- + + for key in match_data.keys(): + if key == "conf": + continue # Confidence is used as reference to align with fp and tp. So skip in this step. + + else: + # For each match_data, we first calculate the accumulated mean. + tmp = cummean(np.array(match_data[key])) + + # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays) + match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1] + + # --------------------------------------------- + # Done. Instantiate MetricData and return + # --------------------------------------------- + return DetectionMotionMetricData(recall=rec, + precision=prec, + confidence=conf, + trans_err=match_data['trans_err'], + vel_err=match_data['vel_err'], + scale_err=match_data['scale_err'], + orient_err=match_data['orient_err'], + attr_err=match_data['attr_err'], + min_ade_err=match_data['min_ade_err'], + min_fde_err=match_data['min_fde_err'], + miss_rate_err=match_data['miss_rate_err'] + ), N_tp, N_fp, npos + + + +def accumulate_motion(gt_boxes: EvalBoxes, + pred_boxes: EvalBoxes, + class_name: str, + dist_fcn: Callable, + traj_fcn: Callable, + dist_th: float, + traj_dist_th: float, + verbose: bool = False, + final_step: float = 12) -> DetectionMotionMetricData: + """ + Average Precision over predefined different recall thresholds for a single distance threshold. + The recall/conf thresholds and other raw metrics will be used in secondary metrics. + :param gt_boxes: Maps every sample_token to a list of its sample_annotations. + :param pred_boxes: Maps every sample_token to a list of its sample_results. + :param class_name: Class to compute AP on. + :param dist_fcn: Distance function used to match detections and ground truths. + :param dist_th: Distance threshold for a match. + :param verbose: If true, print debug messages. + :return: (average_prec, metrics). The average precision value and raw data for a number of metrics. + """ + # --------------------------------------------- + # Organize input and initialize accumulators. + # --------------------------------------------- + + # Count the positives. + npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name]) + if verbose: + print("Found {} GT of class {} out of {} total across {} samples.". + format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens))) + + # For missing classes in the GT, return a data structure corresponding to no predictions. + if npos == 0: + return DetectionMotionMetricData.no_predictions(), 0, 0, 0 + + # + # Organize the predictions in a single list. 
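+    # Predictions are ranked by detection_score alone; the commented-out block below would instead
+    # expand each box per trajectory mode and weight its confidence by the corresponding traj_score.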
+ pred_boxes_list = [] + pred_confs = [] + + pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name] + pred_confs = [box.detection_score for box in pred_boxes_list] + # for box in pred_boxes.all: + # if box.detection_name == class_name: + # box.traj_scores = np.exp(box.traj_scores) + # for i in range(len(box.traj_scores)): + # box.traj_index = i + # pred_boxes_list.append(box) + # pred_confs = [box.detection_score * box.traj_scores[box.traj_index] for box in pred_boxes_list] + + if verbose: + print("Found {} PRED of class {} out of {} total across {} samples.". + format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens))) + + # Sort by confidence. + sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1] + + # Do the actual matching. + tp = [] # Accumulator of true positives. + fp = [] # Accumulator of false positives. + conf = [] # Accumulator of confidences. + + # match_data holds the extra metrics we calculate for each match. + match_data = {'trans_err': [], + 'vel_err': [], + 'scale_err': [], + 'orient_err': [], + 'attr_err': [], + 'conf': [], + 'min_ade_err': [], + 'min_fde_err': [], + 'miss_rate_err': []} + + # --------------------------------------------- + # Match and accumulate match data. + # --------------------------------------------- + + taken = set() # Initially no gt bounding box is matched. + for ind in sortind: + pred_box = pred_boxes_list[ind] + min_dist = np.inf + match_gt_idx = None + + for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]): + + # Find closest match among ground truth boxes + if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken: + this_distance = dist_fcn(gt_box, pred_box) + if this_distance < min_dist: + min_dist = this_distance + match_gt_idx = gt_idx + fde_distance = traj_fcn(gt_box, pred_box, final_step) + # If the closest match is close enough according to threshold we have a match! + is_match = min_dist < dist_th and fde_distance < traj_dist_th + + if is_match: + taken.add((pred_box.sample_token, match_gt_idx)) + + # Update tp, fp and confs. + tp.append(1) + fp.append(0) + conf.append(pred_box.detection_score) + + # Since it is a match, update match data also. + gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx] + + match_data['trans_err'].append(center_distance(gt_box_match, pred_box)) + match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box)) + match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box)) + + # Barrier orientation is only determined up to 180 degree. (For cones orientation is discarded later) + period = np.pi if class_name == 'barrier' else 2 * np.pi + match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period)) + + match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box)) + minade, minfde, m_r = prediction_metrics(gt_box_match, pred_box) + + match_data['min_ade_err'].append(minade) + match_data['min_fde_err'].append(minfde) + match_data['miss_rate_err'].append(m_r) + match_data['conf'].append(pred_box.detection_score) + + else: + # No match. Mark this as a false positive. + tp.append(0) + fp.append(1) + conf.append(pred_box.detection_score) + # conf.append(pred_box.detection_score * pred_box.traj_scores[pred_box.traj_index]) + # + # Check if we have any matches. If not, just return a "no predictions" array. 
+ if len(match_data['trans_err']) == 0: + return DetectionMotionMetricData.no_predictions(), 0, 0, 0 + + # --------------------------------------------- + # Calculate and interpolate precision and recall + # --------------------------------------------- + + # Accumulate. + N_tp = np.sum(tp) + N_fp = np.sum(fp) + tp = np.cumsum(tp).astype(float) + fp = np.cumsum(fp).astype(float) + conf = np.array(conf) + + # Calculate precision and recall. + prec = tp / (fp + tp) + rec = tp / float(npos) + + + + rec_interp = np.linspace(0, 1, DetectionMotionMetricData.nelem) # 101 steps, from 0% to 100% recall. + prec = np.interp(rec_interp, rec, prec, right=0) + conf = np.interp(rec_interp, rec, conf, right=0) + rec = rec_interp + + # --------------------------------------------- + # Re-sample the match-data to match, prec, recall and conf. + # --------------------------------------------- + + for key in match_data.keys(): + if key == "conf": + continue # Confidence is used as reference to align with fp and tp. So skip in this step. + + else: + # For each match_data, we first calculate the accumulated mean. + tmp = cummean(np.array(match_data[key])) + + # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays) + match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1] + + # --------------------------------------------- + # Done. Instantiate MetricData and return + # --------------------------------------------- + return DetectionMotionMetricData(recall=rec, + precision=prec, + confidence=conf, + trans_err=match_data['trans_err'], + vel_err=match_data['vel_err'], + scale_err=match_data['scale_err'], + orient_err=match_data['orient_err'], + attr_err=match_data['attr_err'], + min_ade_err=match_data['min_ade_err'], + min_fde_err=match_data['min_fde_err'], + miss_rate_err=match_data['miss_rate_err'] + ), N_tp, N_fp, npos \ No newline at end of file diff --git a/mmcv/datasets/eval_utils/map_api.py b/mmcv/datasets/eval_utils/map_api.py new file mode 100644 index 0000000..5f26e58 --- /dev/null +++ b/mmcv/datasets/eval_utils/map_api.py @@ -0,0 +1,2355 @@ +# nuScenes dev-kit. +# Code written by Sergi Adipraja Widjaja, 2019. +# + Map mask by Kiwoo Shin, 2019. +# + Methods operating on NuScenesMap and NuScenes by Holger Caesar, 2019. + +import json +import os +import random +from typing import Dict, List, Tuple, Optional, Union + +import cv2 +import math +import descartes +import matplotlib.gridspec as gridspec +import matplotlib.pyplot as plt +import numpy as np +from PIL import Image +from matplotlib.axes import Axes +from matplotlib.figure import Figure +from matplotlib.patches import Rectangle, Arrow +from mpl_toolkits.axes_grid1.inset_locator import mark_inset +from pyquaternion import Quaternion +from shapely import affinity +from shapely.geometry import Polygon, MultiPolygon, LineString, Point, box +from tqdm import tqdm + +from nuscenes.map_expansion.arcline_path_utils import discretize_lane, ArcLinePath +from nuscenes.map_expansion.bitmap import BitMap +from nuscenes.nuscenes import NuScenes +from nuscenes.utils.geometry_utils import view_points +from functools import partial + +# Recommended style to use as the plots will show grids. +plt.style.use('seaborn-whitegrid') + +# Define a map geometry type for polygons and lines. 
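+# Illustrative usage sketch: a minimal example of how the NuScenesMap class defined below might be
+# queried. It is never executed on import; the dataroot, map name and query coordinates are
+# placeholders, and it assumes the nuScenes map expansion files are present under dataroot.
+def _example_map_queries():
+    nusc_map = NuScenesMap(dataroot='/data/sets/nuscenes', map_name='singapore-onenorth')
+
+    # All lane / road_segment records within 50 m of a query point (global map coordinates).
+    records = nusc_map.get_records_in_radius(300.0, 1000.0, 50.0, ['lane', 'road_segment'])
+
+    # Closest lane to the point and its centerline discretized at 0.5 m resolution.
+    lane_token = nusc_map.get_closest_lane(300.0, 1000.0, radius=5)
+    centerlines = nusc_map.discretize_lanes([lane_token], resolution_meters=0.5)
+
+    # Rasterized binary masks of selected layers for a 100 m x 100 m patch rotated by 20 degrees.
+    masks = nusc_map.get_map_mask((300.0, 1000.0, 100.0, 100.0), 20.0,
+                                  layer_names=['drivable_area', 'lane_divider'],
+                                  canvas_size=(200, 200))
+    return records, lane_token, centerlines, masks
+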
+Geometry = Union[Polygon, LineString] + +locations = ['singapore-onenorth', 'singapore-hollandvillage', 'singapore-queenstown', 'boston-seaport'] + + +class NuScenesMap: + """ + NuScenesMap database class for querying and retrieving information from the semantic maps. + Before using this class please use the provided tutorial `map_expansion_tutorial.ipynb`. + + Below you can find the map origins (south western corner, in [lat, lon]) for each of the 4 maps in nuScenes: + boston-seaport: [42.336849169438615, -71.05785369873047] + singapore-onenorth: [1.2882100868743724, 103.78475189208984] + singapore-hollandvillage: [1.2993652317780957, 103.78217697143555] + singapore-queenstown: [1.2782562240223188, 103.76741409301758] + + The dimensions of the maps are as follows ([width, height] in meters): + singapore-onenorth: [1585.6, 2025.0] + singapore-hollandvillage: [2808.3, 2922.9] + singapore-queenstown: [3228.6, 3687.1] + boston-seaport: [2979.5, 2118.1] + The rasterized semantic maps (e.g. singapore-onenorth.png) published with nuScenes v1.0 have a scale of 10px/m, + hence the above numbers are the image dimensions divided by 10. + + We use the same WGS 84 Web Mercator (EPSG:3857) projection as Google Maps/Earth. + """ + def __init__(self, + dataroot: str = '/data/sets/nuscenes', + map_name: str = 'singapore-onenorth'): + """ + Loads the layers, create reverse indices and shortcuts, initializes the explorer class. + :param dataroot: Path to the layers in the form of a .json file. + :param map_name: Which map out of `singapore-onenorth`, `singepore-hollandvillage`, `singapore-queenstown`, + `boston-seaport` that we want to load. + """ + assert map_name in locations, 'Error: Unknown map name %s!' % map_name + + self.dataroot = dataroot + self.map_name = map_name + + self.geometric_layers = ['polygon', 'line', 'node'] + + # These are the non-geometric layers which have polygons as the geometric descriptors. + self.non_geometric_polygon_layers = ['drivable_area', 'road_segment', 'road_block', 'lane', 'ped_crossing', + 'walkway', 'stop_line', 'carpark_area'] + + # We want to be able to search for lane connectors, but not render them. + self.lookup_polygon_layers = self.non_geometric_polygon_layers + ['lane_connector'] + + # These are the non-geometric layers which have line strings as the geometric descriptors. + self.non_geometric_line_layers = ['road_divider', 'lane_divider', 'traffic_light'] + self.non_geometric_layers = self.non_geometric_polygon_layers + self.non_geometric_line_layers + self.layer_names = self.geometric_layers + self.lookup_polygon_layers + self.non_geometric_line_layers + + # Load the selected map. + self.json_fname = os.path.join(self.dataroot, 'maps', 'expansion', '{}.json'.format(self.map_name)) + with open(self.json_fname, 'r') as fh: + self.json_obj = json.load(fh) + + # Parse the map version and print an error for deprecated maps. + if 'version' in self.json_obj: + self.version = self.json_obj['version'] + else: + self.version = '1.0' + if self.version < '1.3': + raise Exception('Error: You are using an outdated map version (%s)! ' + 'Please go to https://www.nuscenes.org/download to download the latest map!') + + self.canvas_edge = self.json_obj['canvas_edge'] + self._load_layers() + self._make_token2ind() + self._make_shortcuts() + + self.explorer = NuScenesMapExplorer(self) + + def _load_layer(self, layer_name: str) -> List[dict]: + """ + Returns a list of records corresponding to the layer name. + :param layer_name: Name of the layer that will be loaded. 
+ :return: A list of records corresponding to a layer. + """ + return self.json_obj[layer_name] + + def _load_layer_dict(self, layer_name: str) -> Dict[str, Union[dict, list]]: + """ + Returns a dict of records corresponding to the layer name. + :param layer_name: Name of the layer that will be loaded. + :return: A dict of records corresponding to a layer. + """ + return self.json_obj[layer_name] + + def _load_layers(self) -> None: + """ Loads each available layer. """ + + # Explicit assignment of layers are necessary to help the IDE determine valid class members. + self.polygon = self._load_layer('polygon') + self.line = self._load_layer('line') + self.node = self._load_layer('node') + self.drivable_area = self._load_layer('drivable_area') + self.road_segment = self._load_layer('road_segment') + self.road_block = self._load_layer('road_block') + self.lane = self._load_layer('lane') + self.ped_crossing = self._load_layer('ped_crossing') + self.walkway = self._load_layer('walkway') + self.stop_line = self._load_layer('stop_line') + self.carpark_area = self._load_layer('carpark_area') + self.road_divider = self._load_layer('road_divider') + self.lane_divider = self._load_layer('lane_divider') + self.traffic_light = self._load_layer('traffic_light') + + self.arcline_path_3: Dict[str, List[dict]] = self._load_layer_dict('arcline_path_3') + self.connectivity: Dict[str, dict] = self._load_layer_dict('connectivity') + self.lane_connector = self._load_layer('lane_connector') + + def _make_token2ind(self) -> None: + """ Store the mapping from token to layer index for each layer. """ + self._token2ind = dict() + for layer_name in self.layer_names: + self._token2ind[layer_name] = dict() + + for ind, member in enumerate(getattr(self, layer_name)): + self._token2ind[layer_name][member['token']] = ind + + def _make_shortcuts(self) -> None: + """ Makes the record shortcuts. """ + + # Makes a shortcut between non geometric records to their nodes. + for layer_name in self.non_geometric_polygon_layers: + if layer_name == 'drivable_area': # Drivable area has more than one geometric representation. + pass + else: + for record in self.__dict__[layer_name]: + polygon_obj = self.get('polygon', record['polygon_token']) + record['exterior_node_tokens'] = polygon_obj['exterior_node_tokens'] + record['holes'] = polygon_obj['holes'] + + for layer_name in self.non_geometric_line_layers: + for record in self.__dict__[layer_name]: + record['node_tokens'] = self.get('line', record['line_token'])['node_tokens'] + + # Makes a shortcut between stop lines to their cues, there's different cues for different types of stop line. + # Refer to `_get_stop_line_cue()` for details. + for record in self.stop_line: + cue = self._get_stop_line_cue(record) + record['cue'] = cue + + # Makes a shortcut between lanes to their lane divider segment nodes. + for record in self.lane: + record['left_lane_divider_segment_nodes'] = [self.get('node', segment['node_token']) for segment in + record['left_lane_divider_segments']] + record['right_lane_divider_segment_nodes'] = [self.get('node', segment['node_token']) for segment in + record['right_lane_divider_segments']] + + def _get_stop_line_cue(self, stop_line_record: dict) -> List[dict]: + """ + Get the different cues for different types of stop lines. + :param stop_line_record: A single stop line record. + :return: The cue for that stop line. 
+ """ + if stop_line_record['stop_line_type'] in ['PED_CROSSING', 'TURN_STOP']: + return [self.get('ped_crossing', token) for token in stop_line_record['ped_crossing_tokens']] + elif stop_line_record['stop_line_type'] in ['STOP_SIGN', 'YIELD']: + return [] + elif stop_line_record['stop_line_type'] == 'TRAFFIC_LIGHT': + return [self.get('traffic_light', token) for token in stop_line_record['traffic_light_tokens']] + + def get(self, layer_name: str, token: str) -> dict: + """ + Returns a record from the layer in constant runtime. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record. + :return: A single layer record. + """ + assert layer_name in self.layer_names, "Layer {} not found".format(layer_name) + + return getattr(self, layer_name)[self.getind(layer_name, token)] + + def getind(self, layer_name: str, token: str) -> int: + """ + This returns the index of the record in a layer in constant runtime. + :param layer_name: Name of the layer we are interested in. + :param token: Token of the record. + :return: The index of the record in the layer, layer is an array. + """ + return self._token2ind[layer_name][token] + + def render_record(self, + layer_name: str, + token: str, + alpha: float = 0.5, + figsize: Tuple[float, float] = None, + other_layers: List[str] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Tuple[Axes, Axes]]: + """ + Render a single map record. By default will also render 3 layers which are `drivable_area`, `lane`, + and `walkway` unless specified by `other_layers`. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record that you want to render. + :param alpha: The opacity of each layer that gets rendered. + :param figsize: Size of the whole figure. + :param other_layers: What other layers to render aside from the one specified in `layer_name`. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: The matplotlib figure and axes of the rendered layers. + """ + return self.explorer.render_record(layer_name, token, alpha, + figsize=figsize, other_layers=other_layers, bitmap=bitmap) + + def render_layers(self, + layer_names: List[str], + alpha: float = 0.5, + figsize: Union[None, float, Tuple[float, float]] = None, + tokens: List[str] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Render a list of layer names. + :param layer_names: A list of layer names. + :param alpha: The opacity of each layer that gets rendered. + :param figsize: Size of the whole figure. + :param tokens: Optional list of tokens to render. None means all tokens are rendered. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: The matplotlib figure and axes of the rendered layers. + """ + return self.explorer.render_layers(layer_names, alpha, + figsize=figsize, tokens=tokens, bitmap=bitmap) + + def render_map_patch(self, + box_coords: Tuple[float, float, float, float], + layer_names: List[str] = None, + alpha: float = 0.5, + figsize: Tuple[int, int] = (15, 15), + render_egoposes_range: bool = True, + render_legend: bool = True, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Renders a rectangular patch specified by `box_coords`. By default renders all layers. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param layer_names: All the non geometric layers that we want to render. + :param alpha: The opacity of each layer. 
+ :param figsize: Size of the whole figure. + :param render_egoposes_range: Whether to render a rectangle around all ego poses. + :param render_legend: Whether to render the legend of map layers. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: The matplotlib figure and axes of the rendered layers. + """ + return self.explorer.render_map_patch(box_coords, layer_names=layer_names, alpha=alpha, figsize=figsize, + render_egoposes_range=render_egoposes_range, + render_legend=render_legend, bitmap=bitmap) + + def render_map_in_image(self, + nusc: NuScenes, + sample_token: str, + camera_channel: str = 'CAM_FRONT', + alpha: float = 0.3, + patch_radius: float = 10000, + min_polygon_area: float = 1000, + render_behind_cam: bool = True, + render_outside_im: bool = True, + layer_names: List[str] = None, + verbose: bool = True, + out_path: str = None) -> Tuple[Figure, Axes]: + """ + Render a nuScenes camera image and overlay the polygons for the specified map layers. + Note that the projections are not always accurate as the localization is in 2d. + :param nusc: The NuScenes instance to load the image from. + :param sample_token: The image's corresponding sample_token. + :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. + :param alpha: The transparency value of the layers to render in [0, 1]. + :param patch_radius: The radius in meters around the ego car in which to select map records. + :param min_polygon_area: Minimum area a polygon needs to have to be rendered. + :param render_behind_cam: Whether to render polygons where any point is behind the camera. + :param render_outside_im: Whether to render polygons where any point is outside the image. + :param layer_names: The names of the layers to render, e.g. ['lane']. + If set to None, the recommended setting will be used. + :param verbose: Whether to print to stdout. + :param out_path: Optional path to save the rendered figure to disk. + """ + return self.explorer.render_map_in_image( + nusc, sample_token, camera_channel=camera_channel, alpha=alpha, + patch_radius=patch_radius, min_polygon_area=min_polygon_area, + render_behind_cam=render_behind_cam, render_outside_im=render_outside_im, + layer_names=layer_names, verbose=verbose, out_path=out_path) + + def get_map_mask_in_image(self, + nusc: NuScenes, + sample_token: str, + camera_channel: str = 'CAM_FRONT', + alpha: float = 0.3, + patch_radius: float = 10000, + min_polygon_area: float = 1000, + render_behind_cam: bool = True, + render_outside_im: bool = True, + layer_names: List[str] = None, + verbose: bool = False, + out_path: str = None): + """ + Render a nuScenes camera image and overlay the polygons for the specified map layers. + Note that the projections are not always accurate as the localization is in 2d. + :param nusc: The NuScenes instance to load the image from. + :param sample_token: The image's corresponding sample_token. + :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. + :param alpha: The transparency value of the layers to render in [0, 1]. + :param patch_radius: The radius in meters around the ego car in which to select map records. + :param min_polygon_area: Minimum area a polygon needs to have to be rendered. + :param render_behind_cam: Whether to render polygons where any point is behind the camera. + :param render_outside_im: Whether to render polygons where any point is outside the image. + :param layer_names: The names of the layers to render, e.g. ['lane']. + If set to None, the recommended setting will be used. 
+ :param verbose: Whether to print to stdout. + :param out_path: Optional path to save the rendered figure to disk. + """ + return self.explorer.get_map_mask_in_image( + nusc, sample_token, camera_channel=camera_channel, alpha=alpha, + patch_radius=patch_radius, min_polygon_area=min_polygon_area, + render_behind_cam=render_behind_cam, render_outside_im=render_outside_im, + layer_names=layer_names, verbose=verbose, out_path=out_path) + + def render_egoposes_on_fancy_map(self, + nusc: NuScenes, + scene_tokens: List = None, + verbose: bool = True, + out_path: str = None, + render_egoposes: bool = True, + render_egoposes_range: bool = True, + render_legend: bool = True, + bitmap: Optional[BitMap] = None) -> Tuple[np.ndarray, Figure, Axes]: + """ + Renders each ego pose of a list of scenes on the map (around 40 poses per scene). + This method is heavily inspired by NuScenes.render_egoposes_on_map(), but uses the map expansion pack maps. + :param nusc: The NuScenes instance to load the ego poses from. + :param scene_tokens: Optional list of scene tokens corresponding to the current map location. + :param verbose: Whether to show status messages and progress bar. + :param out_path: Optional path to save the rendered figure to disk. + :param render_egoposes: Whether to render ego poses. + :param render_egoposes_range: Whether to render a rectangle around all ego poses. + :param render_legend: Whether to render the legend of map layers. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: . Returns a matrix with n ego poses in global map coordinates. + """ + return self.explorer.render_egoposes_on_fancy_map(nusc, scene_tokens=scene_tokens, + verbose=verbose, out_path=out_path, + render_egoposes=render_egoposes, + render_egoposes_range=render_egoposes_range, + render_legend=render_legend, bitmap=bitmap) + + def render_centerlines(self, + resolution_meters: float = 0.5, + figsize: Union[None, float, Tuple[float, float]] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Render the centerlines of all lanes and lane connectors. + :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved + lanes are properly represented. + :param figsize: Size of the figure. + :param bitmap: Optional BitMap object to render below the other map layers. + """ + return self.explorer.render_centerlines(resolution_meters=resolution_meters, figsize=figsize, bitmap=bitmap) + + def render_map_mask(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_names: List[str] = None, + canvas_size: Tuple[int, int] = (100, 100), + figsize: Tuple[int, int] = (15, 15), + n_row: int = 2) -> Tuple[Figure, List[Axes]]: + """ + Render map mask of the patch specified by patch_box and patch_angle. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + :param layer_names: A list of layer names to be returned. + :param canvas_size: Size of the output mask (h, w). + :param figsize: Size of the figure. + :param n_row: Number of rows with plots. + :return: The matplotlib figure and a list of axes of the rendered layers. 
+ """ + return self.explorer.render_map_mask(patch_box, patch_angle, + layer_names=layer_names, canvas_size=canvas_size, + figsize=figsize, n_row=n_row) + + def get_map_mask(self, + patch_box: Optional[Tuple[float, float, float, float]], + patch_angle: float, + layer_names: List[str] = None, + canvas_size: Optional[Tuple[int, int]] = (100, 100)) -> np.ndarray: + """ + Return list of map mask layers of the specified patch. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. If None, this plots the entire map. + :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0. + :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. + :param canvas_size: Size of the output mask (h, w). If None, we use the default resolution of 10px/m. + :return: Stacked numpy array of size [c x h x w] with c channels and the same width/height as the canvas. + """ + return self.explorer.get_map_mask(patch_box, patch_angle, layer_names=layer_names, canvas_size=canvas_size) + + def get_map_geom(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_names: List[str]) -> List[Tuple[str, List[Geometry]]]: + """ + Returns a list of geometries in the specified patch_box. + These are unscaled, but aligned with the patch angle. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + North-facing corresponds to 0. + :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. + :return: List of layer names and their corresponding geometries. + """ + return self.explorer.get_map_geom(patch_box, patch_angle, layer_names) + + def get_records_in_patch(self, + box_coords: Tuple[float, float, float, float], + layer_names: List[str] = None, + mode: str = 'intersect') -> Dict[str, List[str]]: + """ + Get all the record token that intersects or is within a particular rectangular patch. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param layer_names: Names of the layers that we want to retrieve in a particular patch. By default will always + look at the all non geometric layers. + :param mode: "intersect" will return all non geometric records that intersects the patch, "within" will return + all non geometric records that are within the patch. + :return: Dictionary of layer_name - tokens pairs. + """ + return self.explorer.get_records_in_patch(box_coords, layer_names=layer_names, mode=mode) + + def is_record_in_patch(self, + layer_name: str, + token: str, + box_coords: Tuple[float, float, float, float], + mode: str = 'intersect') -> bool: + """ + Query whether a particular record is in a rectangular patch + :param layer_name: The layer name of the record. + :param token: The record token. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param mode: "intersect" means it will return True if the geometric object intersects the patch, "within" will + return True if the geometric object is within the patch. + :return: Boolean value on whether a particular record intersects or within a particular patch. + """ + return self.explorer.is_record_in_patch(layer_name, token, box_coords, mode=mode) + + def layers_on_point(self, x: float, y: float, layer_names: List[str] = None) -> Dict[str, str]: + """ + Returns all the polygonal layers that a particular point is on. + :param x: x coordinate of the point of interest. 
+ :param y: y coordinate of the point of interest. + :param layer_names: The names of the layers to search for. + :return: All the polygonal layers that a particular point is on. {: } + """ + return self.explorer.layers_on_point(x, y, layer_names=layer_names) + + def record_on_point(self, x: float, y: float, layer_name: str) -> str: + """ + Query what record of a layer a particular point is on. + :param x: x coordinate of the point of interest. + :param y: y coordinate of the point of interest. + :param layer_name: The non geometric polygonal layer name that we are interested in. + :return: The first token of a layer a particular point is on or '' if no layer is found. + """ + return self.explorer.record_on_point(x, y, layer_name) + + def extract_polygon(self, polygon_token: str) -> Polygon: + """ + Construct a shapely Polygon object out of a polygon token. + :param polygon_token: The token of the polygon record. + :return: The polygon wrapped in a shapely Polygon object. + """ + return self.explorer.extract_polygon(polygon_token) + + def extract_line(self, line_token: str) -> LineString: + """ + Construct a shapely LineString object out of a line token. + :param line_token: The token of the line record. + :return: The line wrapped in a LineString object. + """ + return self.explorer.extract_line(line_token) + + def get_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: + """ + Get the bounds of the geometric object that corresponds to a non geometric record. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record. + :return: min_x, min_y, max_x, max_y of of the line representation. + """ + return self.explorer.get_bounds(layer_name, token) + + def get_records_in_radius(self, x: float, y: float, radius: float, + layer_names: List[str], mode: str = 'intersect') -> Dict[str, List[str]]: + """ + Get all the record tokens that intersect a square patch of side length 2*radius centered on (x,y). + :param x: X-coordinate in global frame. + :param y: y-coordinate in global frame. + :param radius: All records within radius meters of point (x, y) will be returned. + :param layer_names: Names of the layers that we want to retrieve. By default will always + look at the all non geometric layers. + :param mode: "intersect" will return all non geometric records that intersects the patch, "within" will return + all non geometric records that are within the patch. + :return: Dictionary of layer_name - tokens pairs. + """ + + patch = (x - radius, y - radius, x + radius, y + radius) + return self.explorer.get_records_in_patch(patch, layer_names, mode=mode) + + def discretize_centerlines(self, resolution_meters: float) -> List[np.array]: + """ + Discretize the centerlines of lanes and lane connectors. + :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved + lanes are properly represented. + :return: A list of np.arrays with x, y and z values for each point. + """ + pose_lists = [] + for lane in self.lane + self.lane_connector: + my_lane = self.arcline_path_3.get(lane['token'], []) + discretized = np.array(discretize_lane(my_lane, resolution_meters)) + pose_lists.append(discretized) + + return pose_lists + + def discretize_lanes(self, tokens: List[str], + resolution_meters: float) -> Dict[str, List[Tuple[float, float, float]]]: + """ + Discretizes a list of lane/lane connector tokens. + :param tokens: List of lane and/or lane connector record tokens. 
Can be retrieved with + get_records_in_radius or get_records_in_patch. + :param resolution_meters: How finely to discretize the splines. + :return: Mapping from lane/lane connector token to sequence of poses along the lane. + """ + + return {ID: discretize_lane(self.arcline_path_3.get(ID, []), resolution_meters) for ID in tokens} + + def _get_connected_lanes(self, lane_token: str, incoming_outgoing: str) -> List[str]: + """ + Helper for getting the lanes connected to a given lane + :param lane_token: Token for the lane. + :param incoming_outgoing: Whether to get incoming or outgoing lanes + :return: List of lane tokens this lane is connected to. + """ + + if lane_token not in self.connectivity: + raise ValueError(f"{lane_token} is not a valid lane.") + + return self.connectivity[lane_token][incoming_outgoing] + + def get_outgoing_lane_ids(self, lane_token: str) -> List[str]: + """ + Get the out-going lanes. + :param lane_token: Token for the lane. + :return: List of lane tokens that start at the end of this lane. + """ + + return self._get_connected_lanes(lane_token, 'outgoing') + + def get_incoming_lane_ids(self, lane_token: str) -> List[str]: + """ + Get the incoming lanes. + :param lane_token: Token for the lane. + :return: List of lane tokens that end at the start of this lane. + """ + + return self._get_connected_lanes(lane_token, 'incoming') + + def get_arcline_path(self, lane_token: str) -> List[ArcLinePath]: + """ + Get the arcline path representation for a lane. + Note: This function was previously called `get_lane()`, but renamed to avoid confusion between lanes and + arcline paths. + :param lane_token: Token for the lane. + :return: Arc line path representation of the lane. + """ + + arcline_path = self.arcline_path_3.get(lane_token) + if not arcline_path: + raise ValueError(f'Error: Lane with token {lane_token} does not have a valid arcline path!') + + return arcline_path + + def get_closest_lane(self, x: float, y: float, radius: float = 5) -> str: + """ + Get closest lane id within a radius of query point. The distance from a point (x, y) to a lane is + the minimum l2 distance from (x, y) to a point on the lane. + :param x: X coordinate in global coordinate frame. + :param y: Y Coordinate in global coordinate frame. + :param radius: Radius around point to consider. + :return: Lane id of closest lane within radius. + """ + + lanes = self.get_records_in_radius(x, y, radius, ['lane', 'lane_connector']) + lanes = lanes['lane'] + lanes['lane_connector'] + + discrete_points = self.discretize_lanes(lanes, 0.5) + + current_min = np.inf + + min_id = "" + for lane_id, points in discrete_points.items(): + + distance = np.linalg.norm(np.array(points)[:, :2] - [x, y], axis=1).min() + if distance <= current_min: + current_min = distance + min_id = lane_id + + return min_id + + def render_next_roads(self, + x: float, + y: float, + alpha: float = 0.5, + figsize: Union[None, float, Tuple[float, float]] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Renders the possible next roads from a point of interest. + :param x: x coordinate of the point of interest. + :param y: y coordinate of the point of interest. + :param alpha: The opacity of each layer that gets rendered. + :param figsize: Size of the whole figure. + :param bitmap: Optional BitMap object to render below the other map layers. 
+ """ + return self.explorer.render_next_roads(x, y, alpha, figsize=figsize, bitmap=bitmap) + + def get_next_roads(self, x: float, y: float) -> Dict[str, List[str]]: + """ + Get the possible next roads from a point of interest. + Returns road_segment, road_block and lane. + :param x: x coordinate of the point of interest. + :param y: y coordinate of the point of interest. + :return: Dictionary of layer_name - tokens pairs. + """ + # Filter out irrelevant layers. + road_layers = ['road_segment', 'road_block', 'lane'] + layers = self.explorer.layers_on_point(x, y) + rel_layers = {layer: layers[layer] for layer in road_layers} + + # Pick most fine-grained road layer (lane, road_block, road_segment) object that contains the point. + rel_layer = None + rel_token = None + for layer in road_layers[::-1]: + if rel_layers[layer] != '': + rel_layer = layer + rel_token = rel_layers[layer] + break + assert rel_layer is not None, 'Error: No suitable layer in the specified point location!' + + # Get all records that overlap with the bounding box of the selected road. + box_coords = self.explorer.get_bounds(rel_layer, rel_token) + intersect = self.explorer.get_records_in_patch(box_coords, road_layers, mode='intersect') + + # Go through all objects within the bounding box. + result = {layer: [] for layer in road_layers} + if rel_layer == 'road_segment': + # For road segments, we do not have a direction. + # Return objects that have ANY exterior points in common with the relevant layer. + rel_exterior_nodes = self.get(rel_layer, rel_token)['exterior_node_tokens'] + for layer in road_layers: + for token in intersect[layer]: + exterior_nodes = self.get(layer, token)['exterior_node_tokens'] + if any(n in exterior_nodes for n in rel_exterior_nodes) \ + and token != rel_layers[layer]: + result[layer].append(token) + else: + # For lanes and road blocks, the next road is indicated by the edge line. + # Return objects where ALL edge line nodes are included in the exterior nodes. + to_edge_line = self.get(rel_layer, rel_token)['to_edge_line_token'] + to_edge_nodes = self.get('line', to_edge_line)['node_tokens'] + for layer in road_layers: + for token in intersect[layer]: + exterior_nodes = self.get(layer, token)['exterior_node_tokens'] + if all(n in exterior_nodes for n in to_edge_nodes) \ + and token != rel_layers[layer]: + result[layer].append(token) + return result + + +class NuScenesMapExplorer: + """ Helper class to explore the nuScenes map data. """ + def __init__(self, + map_api: NuScenesMap, + representative_layers: Tuple[str] = ('drivable_area', 'lane', 'walkway'), + color_map: dict = None): + """ + :param map_api: NuScenesMap database class. + :param representative_layers: These are the layers that we feel are representative of the whole mapping data. + :param color_map: Color map. + """ + # Mutable default argument. 
+ if color_map is None: + color_map = dict(drivable_area='#a6cee3', + road_segment='#1f78b4', + road_block='#b2df8a', + lane='#33a02c', + ped_crossing='#fb9a99', + walkway='#e31a1c', + stop_line='#fdbf6f', + carpark_area='#ff7f00', + road_divider='#cab2d6', + lane_divider='#6a3d9a', + traffic_light='#7e772e') + + self.map_api = map_api + self.representative_layers = representative_layers + self.color_map = color_map + + self.canvas_max_x = self.map_api.canvas_edge[0] + self.canvas_min_x = 0 + self.canvas_max_y = self.map_api.canvas_edge[1] + self.canvas_min_y = 0 + self.canvas_aspect_ratio = (self.canvas_max_x - self.canvas_min_x) / (self.canvas_max_y - self.canvas_min_y) + + def render_centerlines(self, + resolution_meters: float, + figsize: Union[None, float, Tuple[float, float]] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Render the centerlines of all lanes and lane connectors. + :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved + lanes are properly represented. + :param figsize: Size of the figure. + :param bitmap: Optional BitMap object to render below the other map layers. + """ + # Discretize all lanes and lane connectors. + pose_lists = self.map_api.discretize_centerlines(resolution_meters) + + # Render connectivity lines. + fig = plt.figure(figsize=self._get_figsize(figsize)) + ax = fig.add_axes([0, 0, 1, 1 / self.canvas_aspect_ratio]) + + if bitmap is not None: + bitmap.render(self.map_api.canvas_edge, ax) + + for pose_list in pose_lists: + if len(pose_list) > 0: + plt.plot(pose_list[:, 0], pose_list[:, 1]) + + return fig, ax + + def render_map_mask(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_names: List[str], + canvas_size: Tuple[int, int], + figsize: Tuple[int, int], + n_row: int = 2) -> Tuple[Figure, List[Axes]]: + """ + Render map mask of the patch specified by patch_box and patch_angle. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + :param layer_names: A list of layer names to be extracted. + :param canvas_size: Size of the output mask (h, w). + :param figsize: Size of the figure. + :param n_row: Number of rows with plots. + :return: The matplotlib figure and a list of axes of the rendered layers. + """ + if layer_names is None: + layer_names = self.map_api.non_geometric_layers + + map_mask = self.get_map_mask(patch_box, patch_angle, layer_names, canvas_size) + + # If no canvas_size is specified, retrieve the default from the output of get_map_mask. + if canvas_size is None: + canvas_size = map_mask.shape[1:] + + fig = plt.figure(figsize=figsize) + ax = fig.add_axes([0, 0, 1, 1]) + ax.set_xlim(0, canvas_size[1]) + ax.set_ylim(0, canvas_size[0]) + + n_col = len(map_mask) // n_row + gs = gridspec.GridSpec(n_row, n_col) + gs.update(wspace=0.025, hspace=0.05) + for i in range(len(map_mask)): + r = i // n_col + c = i - r * n_col + subax = plt.subplot(gs[r, c]) + subax.imshow(map_mask[i], origin='lower') + subax.text(canvas_size[0] * 0.5, canvas_size[1] * 1.1, layer_names[i]) + subax.grid(False) + + return fig, fig.axes + + def get_map_geom(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_names: List[str]) -> List[Tuple[str, List[Geometry]]]: + """ + Returns a list of geometries in the specified patch_box. + These are unscaled, but aligned with the patch angle. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. 
+ :param patch_angle: Patch orientation in degrees. + North-facing corresponds to 0. + :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. + :return: List of layer names and their corresponding geometries. + """ + # If None, return all geometric layers. + if layer_names is None: + layer_names = self.map_api.non_geometric_layers + + # Get each layer name and geometry and store them in a list. + map_geom = [] + for layer_name in layer_names: + layer_geom = self._get_layer_geom(patch_box, patch_angle, layer_name) + if layer_geom is None: + continue + map_geom.append((layer_name, layer_geom)) + + return map_geom + + def map_geom_to_mask(self, + map_geom: List[Tuple[str, List[Geometry]]], + local_box: Tuple[float, float, float, float], + canvas_size: Tuple[int, int]) -> np.ndarray: + """ + Return list of map mask layers of the specified patch. + :param map_geom: List of layer names and their corresponding geometries. + :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically + x_center = y_center = 0. + :param canvas_size: Size of the output mask (h, w). + :return: Stacked numpy array of size [c x h x w] with c channels and the same height/width as the canvas. + """ + # Get each layer mask and stack them into a numpy tensor. + map_mask = [] + for layer_name, layer_geom in map_geom: + layer_mask = self._layer_geom_to_mask(layer_name, layer_geom, local_box, canvas_size) + if layer_mask is not None: + map_mask.append(layer_mask) + + return np.array(map_mask) + + def get_map_mask(self, + patch_box: Optional[Tuple[float, float, float, float]], + patch_angle: float, + layer_names: List[str] = None, + canvas_size: Tuple[int, int] = (100, 100)) -> np.ndarray: + """ + Return list of map mask layers of the specified patch. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. If None, this plots the entire map. + :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0. + :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. + :param canvas_size: Size of the output mask (h, w). If None, we use the default resolution of 10px/m. + :return: Stacked numpy array of size [c x h x w] with c channels and the same width/height as the canvas. + """ + # For some combination of parameters, we need to know the size of the current map. + if self.map_api.map_name == 'singapore-onenorth': + map_dims = [1585.6, 2025.0] + elif self.map_api.map_name == 'singapore-hollandvillage': + map_dims = [2808.3, 2922.9] + elif self.map_api.map_name == 'singapore-queenstown': + map_dims = [3228.6, 3687.1] + elif self.map_api.map_name == 'boston-seaport': + map_dims = [2979.5, 2118.1] + else: + raise Exception('Error: Invalid map!') + + # If None, return the entire map. + if patch_box is None: + patch_box = [map_dims[0] / 2, map_dims[1] / 2, map_dims[1], map_dims[0]] + + # If None, return all geometric layers. + if layer_names is None: + layer_names = self.map_api.non_geometric_layers + + # If None, return the specified patch in the original scale of 10px/m. + if canvas_size is None: + map_scale = 10 + canvas_size = np.array((patch_box[2], patch_box[3])) * map_scale + canvas_size = tuple(np.round(canvas_size).astype(np.int32)) + + # Get geometry of each layer. + map_geom = self.get_map_geom(patch_box, patch_angle, layer_names) + + # Convert geometry of each layer into mask and stack them into a numpy tensor. 
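+ # A worked example of the conversion below, with illustrative values only: for
+ # patch_box = (300.0, 1700.0, 100.0, 50.0) and canvas_size = (500, 250), the
+ # local box becomes (0.0, 0.0, 100.0, 50.0) and every metre of the patch maps to
+ # 500 / 100 = 250 / 50 = 5 pixels, yielding a mask of shape (n_layers, 500, 250).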
+ # Convert the patch box from global coordinates to local coordinates by setting the center to (0, 0). + local_box = (0.0, 0.0, patch_box[2], patch_box[3]) + map_mask = self.map_geom_to_mask(map_geom, local_box, canvas_size) + assert np.all(map_mask.shape[1:] == canvas_size) + + return map_mask + + def render_record(self, + layer_name: str, + token: str, + alpha: float = 0.5, + figsize: Union[None, float, Tuple[float, float]] = None, + other_layers: List[str] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Tuple[Axes, Axes]]: + """ + Render a single map record. + By default will also render 3 layers which are `drivable_area`, `lane`, and `walkway` unless specified by + `other_layers`. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record that you want to render. + :param alpha: The opacity of each layer that gets rendered. + :param figsize: Size of the whole figure. + :param other_layers: What other layers to render aside from the one specified in `layer_name`. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: The matplotlib figure and axes of the rendered layers. + """ + if other_layers is None: + other_layers = list(self.representative_layers) + + for other_layer in other_layers: + if other_layer not in self.map_api.non_geometric_layers: + raise ValueError("{} is not a non geometric layer".format(layer_name)) + + x1, y1, x2, y2 = self.map_api.get_bounds(layer_name, token) + + local_width = x2 - x1 + local_height = y2 - y1 + assert local_height > 0, 'Error: Map has 0 height!' + local_aspect_ratio = local_width / local_height + + # We obtained the values 0.65 and 0.66 by trials. + fig = plt.figure(figsize=self._get_figsize(figsize)) + global_ax = fig.add_axes([0, 0, 0.65, 0.65 / self.canvas_aspect_ratio]) + local_ax = fig.add_axes([0.66, 0.66 / self.canvas_aspect_ratio, 0.34, 0.34 / local_aspect_ratio]) + + # To make sure the sequence of the layer overlays is always consistent after typesetting set(). + random.seed('nutonomy') + + if bitmap is not None: + bitmap.render(self.map_api.canvas_edge, global_ax) + bitmap.render(self.map_api.canvas_edge, local_ax) + + layer_names = other_layers + [layer_name] + layer_names = list(set(layer_names)) + + for layer in layer_names: + self._render_layer(global_ax, layer, alpha) + + for layer in layer_names: + self._render_layer(local_ax, layer, alpha) + + if layer_name == 'drivable_area': + # Bad output aesthetically if we add spacing between the objects and the axes for drivable area. + local_ax_xlim = (x1, x2) + local_ax_ylim = (y1, y2) + else: + # Add some spacing between the object and the axes. + local_ax_xlim = (x1 - local_width / 3, x2 + local_width / 3) + local_ax_ylim = (y1 - local_height / 3, y2 + local_height / 3) + + # Draws the rectangular patch on the local_ax. + local_ax.add_patch(Rectangle((x1, y1), local_width, local_height, linestyle='-.', color='red', fill=False, + lw=2)) + + local_ax.set_xlim(*local_ax_xlim) + local_ax.set_ylim(*local_ax_ylim) + local_ax.set_title('Local View') + + global_ax.set_xlim(self.canvas_min_x, self.canvas_max_x) + global_ax.set_ylim(self.canvas_min_y, self.canvas_max_y) + global_ax.set_title('Global View') + global_ax.legend() + + # Adds the zoomed in effect to the plot. 
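+ # mark_inset below outlines the extent of the local view on the global axes and
+ # draws connector lines between the two views, producing the zoom-in effect.
+ # A hedged usage sketch for this method (`explorer` and the token are assumed,
+ # hypothetical names):
+ #   fig, (global_ax, local_ax) = explorer.render_record('stop_line', stop_line_token)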
+ mark_inset(global_ax, local_ax, loc1=2, loc2=4) + + return fig, (global_ax, local_ax) + + def render_layers(self, + layer_names: List[str], + alpha: float, + figsize: Union[None, float, Tuple[float, float]], + tokens: List[str] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Render a list of layers. + :param layer_names: A list of layer names. + :param alpha: The opacity of each layer. + :param figsize: Size of the whole figure. + :param tokens: Optional list of tokens to render. None means all tokens are rendered. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: The matplotlib figure and axes of the rendered layers. + """ + fig = plt.figure(figsize=self._get_figsize(figsize)) + ax = fig.add_axes([0, 0, 1, 1 / self.canvas_aspect_ratio]) + + ax.set_xlim(self.canvas_min_x, self.canvas_max_x) + ax.set_ylim(self.canvas_min_y, self.canvas_max_y) + + if bitmap is not None: + bitmap.render(self.map_api.canvas_edge, ax) + + layer_names = list(set(layer_names)) + for layer_name in layer_names: + self._render_layer(ax, layer_name, alpha, tokens) + + ax.legend() + + return fig, ax + + def render_map_patch(self, + box_coords: Tuple[float, float, float, float], + layer_names: List[str] = None, + alpha: float = 0.5, + figsize: Tuple[float, float] = (15, 15), + render_egoposes_range: bool = True, + render_legend: bool = True, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Renders a rectangular patch specified by `box_coords`. By default renders all layers. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param layer_names: All the non geometric layers that we want to render. + :param alpha: The opacity of each layer. + :param figsize: Size of the whole figure. + :param render_egoposes_range: Whether to render a rectangle around all ego poses. + :param render_legend: Whether to render the legend of map layers. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: The matplotlib figure and axes of the rendered layers. + """ + x_min, y_min, x_max, y_max = box_coords + + if layer_names is None: + layer_names = self.map_api.non_geometric_layers + + fig = plt.figure(figsize=figsize) + + local_width = x_max - x_min + local_height = y_max - y_min + assert local_height > 0, 'Error: Map patch has 0 height!' 
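+ # A hedged usage sketch, with illustrative coordinates that are assumed to lie
+ # inside the loaded map (`explorer` is a hypothetical NuScenesMapExplorer):
+ #   fig, ax = explorer.render_map_patch((500.0, 1600.0, 700.0, 1700.0),
+ #                                       ['drivable_area', 'lane'], alpha=0.4)
+ # renders a 200 m x 100 m patch with the two requested layers overlaid.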
+ local_aspect_ratio = local_width / local_height + + ax = fig.add_axes([0, 0, 1, 1 / local_aspect_ratio]) + + if bitmap is not None: + bitmap.render(self.map_api.canvas_edge, ax) + + for layer_name in layer_names: + self._render_layer(ax, layer_name, alpha) + + x_margin = np.minimum(local_width / 4, 50) + y_margin = np.minimum(local_height / 4, 10) + ax.set_xlim(x_min - x_margin, x_max + x_margin) + ax.set_ylim(y_min - y_margin, y_max + y_margin) + + if render_egoposes_range: + ax.add_patch(Rectangle((x_min, y_min), local_width, local_height, fill=False, linestyle='-.', color='red', + lw=2)) + ax.text(x_min + local_width / 100, y_min + local_height / 2, "%g m" % local_height, + fontsize=14, weight='bold') + ax.text(x_min + local_width / 2, y_min + local_height / 100, "%g m" % local_width, + fontsize=14, weight='bold') + + if render_legend: + ax.legend(frameon=True, loc='upper right') + + return fig, ax + + def render_map_in_image(self, + nusc: NuScenes, + sample_token: str, + camera_channel: str = 'CAM_FRONT', + alpha: float = 0.3, + patch_radius: float = 10000, + min_polygon_area: float = 1000, + render_behind_cam: bool = True, + render_outside_im: bool = True, + layer_names: List[str] = None, + verbose: bool = True, + out_path: str = None) -> Tuple[Figure, Axes]: + """ + Render a nuScenes camera image and overlay the polygons for the specified map layers. + Note that the projections are not always accurate as the localization is in 2d. + :param nusc: The NuScenes instance to load the image from. + :param sample_token: The image's corresponding sample_token. + :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. + :param alpha: The transparency value of the layers to render in [0, 1]. + :param patch_radius: The radius in meters around the ego car in which to select map records. + :param min_polygon_area: Minimum area a polygon needs to have to be rendered. + :param render_behind_cam: Whether to render polygons where any point is behind the camera. + :param render_outside_im: Whether to render polygons where any point is outside the image. + :param layer_names: The names of the layers to render, e.g. ['lane']. + If set to None, the recommended setting will be used. + :param verbose: Whether to print to stdout. + :param out_path: Optional path to save the rendered figure to disk. + """ + near_plane = 1e-8 + + if verbose: + print('Warning: Note that the projections are not always accurate as the localization is in 2d.') + + # Default layers. + if layer_names is None: + layer_names = ['road_segment', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area'] + + # Check layers whether we can render them. + for layer_name in layer_names: + assert layer_name in self.map_api.non_geometric_polygon_layers, \ + 'Error: Can only render non-geometry polygons: %s' % layer_names + + # Check that NuScenesMap was loaded for the correct location. + sample_record = nusc.get('sample', sample_token) + scene_record = nusc.get('scene', sample_record['scene_token']) + log_record = nusc.get('log', scene_record['log_token']) + log_location = log_record['location'] + assert self.map_api.map_name == log_location, \ + 'Error: NuScenesMap loaded for location %s, should be %s!' % (self.map_api.map_name, log_location) + + # Grab the front camera image and intrinsics. 
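+ # The projection carried out below follows the usual nuScenes chain: map points
+ # in the global frame are moved into the ego frame at the image timestamp, then
+ # into the camera frame, and finally onto the image plane via the pinhole
+ # intrinsics, with points behind the 1e-8 m near plane clipped or discarded.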
+ cam_token = sample_record['data'][camera_channel] + cam_record = nusc.get('sample_data', cam_token) + cam_path = nusc.get_sample_data_path(cam_token) + im = Image.open(cam_path) + im_size = im.size + cs_record = nusc.get('calibrated_sensor', cam_record['calibrated_sensor_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + + # Retrieve the current map. + poserecord = nusc.get('ego_pose', cam_record['ego_pose_token']) + ego_pose = poserecord['translation'] + box_coords = ( + ego_pose[0] - patch_radius, + ego_pose[1] - patch_radius, + ego_pose[0] + patch_radius, + ego_pose[1] + patch_radius, + ) + records_in_patch = self.get_records_in_patch(box_coords, layer_names, 'intersect') + + # Init axes. + fig = plt.figure(figsize=(9, 16)) + ax = fig.add_axes([0, 0, 1, 1]) + ax.set_xlim(0, im_size[0]) + ax.set_ylim(0, im_size[1]) + ax.imshow(im) + + # Retrieve and render each record. + for layer_name in layer_names: + for token in records_in_patch[layer_name]: + record = self.map_api.get(layer_name, token) + if layer_name == 'drivable_area': + polygon_tokens = record['polygon_tokens'] + else: + polygon_tokens = [record['polygon_token']] + + for polygon_token in polygon_tokens: + polygon = self.map_api.extract_polygon(polygon_token) + + # Convert polygon nodes to pointcloud with 0 height. + points = np.array(polygon.exterior.xy) + points = np.vstack((points, np.zeros((1, points.shape[1])))) + + # Transform into the ego vehicle frame for the timestamp of the image. + points = points - np.array(poserecord['translation']).reshape((-1, 1)) + points = np.dot(Quaternion(poserecord['rotation']).rotation_matrix.T, points) + + # Transform into the camera. + points = points - np.array(cs_record['translation']).reshape((-1, 1)) + points = np.dot(Quaternion(cs_record['rotation']).rotation_matrix.T, points) + + # Remove points that are partially behind the camera. + depths = points[2, :] + behind = depths < near_plane + if np.all(behind): + continue + + if render_behind_cam: + # Perform clipping on polygons that are partially behind the camera. + points = NuScenesMapExplorer._clip_points_behind_camera(points, near_plane) + elif np.any(behind): + # Otherwise ignore any polygon that is partially behind the camera. + continue + + # Ignore polygons with less than 3 points after clipping. + if len(points) == 0 or points.shape[1] < 3: + continue + + # Take the actual picture (matrix multiplication with camera-matrix + renormalization). + points = view_points(points, cam_intrinsic, normalize=True) + + # Skip polygons where all points are outside the image. + # Leave a margin of 1 pixel for aesthetic reasons. + inside = np.ones(points.shape[1], dtype=bool) + inside = np.logical_and(inside, points[0, :] > 1) + inside = np.logical_and(inside, points[0, :] < im.size[0] - 1) + inside = np.logical_and(inside, points[1, :] > 1) + inside = np.logical_and(inside, points[1, :] < im.size[1] - 1) + if render_outside_im: + if np.all(np.logical_not(inside)): + continue + else: + if np.any(np.logical_not(inside)): + continue + + points = points[:2, :] + points = [(p0, p1) for (p0, p1) in zip(points[0], points[1])] + polygon_proj = Polygon(points) + + # Filter small polygons + if polygon_proj.area < min_polygon_area: + continue + + label = layer_name + ax.add_patch(descartes.PolygonPatch(polygon_proj, fc=self.color_map[layer_name], alpha=alpha, + label=label)) + + # Display the image. 
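+ # Only polygons whose projected area, measured in squared image pixels, reached
+ # `min_polygon_area` were added above, so sliver-sized overlays are not drawn.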
+ plt.axis('off') + ax.invert_yaxis() + + if out_path is not None: + plt.tight_layout() + plt.savefig(out_path, bbox_inches='tight', pad_inches=0) + + return fig, ax + + @staticmethod + def points_transform(points, poserecord, cs_record, cam_intrinsic, im_size, near_plane=1e-8, + render_behind_cam=True, render_outside_im=True): + points = np.vstack((points, np.zeros((1, points.shape[1])))) + + # Transform into the ego vehicle frame for the timestamp of the image. + points = points - np.array(poserecord['translation']).reshape((-1, 1)) + points = np.dot(Quaternion(poserecord['rotation']).rotation_matrix.T, points) + + # Transform into the camera. + points = points - np.array(cs_record['translation']).reshape((-1, 1)) + points = np.dot(Quaternion(cs_record['rotation']).rotation_matrix.T, points) + + # Remove points that are partially behind the camera. + depths = points[2, :] + behind = depths < near_plane + if np.all(behind): + return None + + if render_behind_cam: + # Perform clipping on polygons that are partially behind the camera. + points = NuScenesMapExplorer._clip_points_behind_camera(points, near_plane) + + elif np.any(behind): + # Otherwise ignore any polygon that is partially behind the camera. + return None + + # Take the actual picture (matrix multiplication with camera-matrix + renormalization). + points = view_points(points, cam_intrinsic, normalize=True) + + # Skip polygons where all points are outside the image. + # Leave a margin of 1 pixel for aesthetic reasons. + inside = np.ones(points.shape[1], dtype=bool) + inside = np.logical_and(inside, points[0, :] > 1) + inside = np.logical_and(inside, points[0, :] < im_size[0] - 1) + inside = np.logical_and(inside, points[1, :] > 1) + inside = np.logical_and(inside, points[1, :] < im_size[1] - 1) + + if render_outside_im: + if np.all(np.logical_not(inside)): + return None + else: + if np.any(np.logical_not(inside)): + return None + + # points = points[:, inside] + + # Ignore polygons with less than 3 points after clipping. + if len(points) == 0 or points.shape[1] < 3: + return None + + points = points[:2, :] + points = [(p0, p1) for (p0, p1) in zip(points[0], points[1])] + return points + + def get_map_mask_in_image(self, + nusc: NuScenes, + sample_token: str, + camera_channel: str = 'CAM_FRONT', + alpha: float = 0.3, + patch_radius: float = 10000, + min_polygon_area: float = 1000, + render_behind_cam: bool = True, + render_outside_im: bool = True, + layer_names: List[str] = None, + verbose: bool = False, + out_path: str = None) -> np.ndarray: + """ + Render a nuScenes camera image and overlay the polygons for the specified map layers. + Note that the projections are not always accurate as the localization is in 2d. + :param nusc: The NuScenes instance to load the image from. + :param sample_token: The image's corresponding sample_token. + :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. + :param alpha: The transparency value of the layers to render in [0, 1]. + :param patch_radius: The radius in meters around the ego car in which to select map records. + :param min_polygon_area: Minimum area a polygon needs to have to be rendered. + :param render_behind_cam: Whether to render polygons where any point is behind the camera. + :param render_outside_im: Whether to render polygons where any point is outside the image. + :param layer_names: The names of the layers to render, e.g. ['lane']. + If set to None, the recommended setting will be used. + :param verbose: Whether to print to stdout. 
+ :param out_path: Optional path to save the rendered figure to disk. + """ + near_plane = 1e-8 + if verbose: + print('Warning: Note that the projections are not always accurate as the localization is in 2d.') + + # Default layers. + if layer_names is None: + layer_names = ['road_segment', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area'] + + # # Check layers whether we can render them. + # for layer_name in layer_names: + # assert layer_name in self.map_api.non_geometric_polygon_layers, \ + # 'Error: Can only render non-geometry polygons: %s' % layer_names + + # Check that NuScenesMap was loaded for the correct location. + sample_record = nusc.get('sample', sample_token) + scene_record = nusc.get('scene', sample_record['scene_token']) + log_record = nusc.get('log', scene_record['log_token']) + log_location = log_record['location'] + assert self.map_api.map_name == log_location, \ + 'Error: NuScenesMap loaded for location %s, should be %s!' % (self.map_api.map_name, log_location) + + # Grab the front camera image and intrinsics. + cam_token = sample_record['data'][camera_channel] + cam_record = nusc.get('sample_data', cam_token) + cam_path = nusc.get_sample_data_path(cam_token) + im = Image.open(cam_path) + im_size = im.size + cs_record = nusc.get('calibrated_sensor', cam_record['calibrated_sensor_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + + # Retrieve the current map. + poserecord = nusc.get('ego_pose', cam_record['ego_pose_token']) + ego_pose = poserecord['translation'] + box_coords = ( + ego_pose[0] - patch_radius, + ego_pose[1] - patch_radius, + ego_pose[0] + patch_radius, + ego_pose[1] + patch_radius, + ) + records_in_patch = self.get_records_in_patch(box_coords, layer_names, 'intersect') + + if out_path is not None: + # Init axes. + fig = plt.figure(figsize=(9, 16)) + ax = fig.add_axes([0, 0, 1, 1]) + ax.set_xlim(0, im_size[0]) + ax.set_ylim(0, im_size[1]) + ax.imshow(im) + + points_transform = partial(self.points_transform, poserecord=poserecord, cs_record=cs_record, + cam_intrinsic=cam_intrinsic, near_plane=near_plane, im_size=im_size, + render_behind_cam=render_behind_cam, render_outside_im=render_outside_im) + + # Retrieve and render each record. + map_geom = [] + for layer_name in layer_names: + if layer_name in self.map_api.non_geometric_line_layers: + line_list = [] + for token in records_in_patch[layer_name]: + record = self.map_api.get(layer_name, token) + line = self.map_api.extract_line(record['line_token']) + if line.is_empty: # Skip lines without nodes. 
+ continue + points = np.array(line.xy) + points = points_transform(points) + if points is None: + continue + line = LineString(points) + line_list.append(line) + # For visualize + if out_path is not None: + polygon = Polygon(points) + ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], + alpha=alpha, label=layer_name)) + map_geom.append((layer_name, line_list)) + elif layer_name == 'drivable_area': + polygon_list = [] + for token in records_in_patch[layer_name]: + record = self.map_api.get(layer_name, token) + polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in + record['polygon_tokens']] + for polygon in polygons: + ex_points = np.array(polygon.exterior.xy) + ex_points = points_transform(ex_points) + if ex_points is None: + continue + interiors = [] + for interior in polygon.interiors: + in_points = np.array(interior.xy) + in_points = points_transform(in_points) + if in_points is None: + continue + interiors.append(in_points) + polygon = Polygon(ex_points, interiors) + polygon = polygon.buffer(0.01) + if polygon.geom_type == 'Polygon': + polygon = MultiPolygon([polygon]) + # Filter small polygons + if polygon.area < min_polygon_area: + continue + polygon_list.append(polygon) + # For visualize + if out_path is not None: + ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], + alpha=alpha, label=layer_name)) + map_geom.append((layer_name, polygon_list)) + else: + polygon_list = [] + for token in records_in_patch[layer_name]: + record = self.map_api.get(layer_name, token) + polygon = self.map_api.extract_polygon(record['polygon_token']) + if polygon.is_valid: + if not polygon.is_empty: + ex_points = np.array(polygon.exterior.xy) + ex_points = points_transform(ex_points) + if ex_points is None: + continue + interiors = [] + for interior in polygon.interiors: + in_points = np.array(interior.xy) + in_points = points_transform(in_points) + if in_points is None: + continue + interiors.append(in_points) + polygon = Polygon(ex_points, interiors) + polygon = polygon.buffer(0.01) + if polygon.geom_type == 'Polygon': + polygon = MultiPolygon([polygon]) + # Filter small polygons + if polygon.area < min_polygon_area: + continue + polygon_list.append(polygon) + # For visualize + if out_path is not None: + ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], + alpha=alpha, label=layer_name)) + map_geom.append((layer_name, polygon_list)) + + # For visualize + if out_path is not None: + # Display the image. + plt.axis('off') + ax.invert_yaxis() + plt.tight_layout() + plt.savefig(out_path, bbox_inches='tight', pad_inches=0) + plt.close() + + # Convert geometry of each layer into mask and stack them into a numpy tensor. + # Convert the patch box from global coordinates to local coordinates by setting the center to (0, 0). + local_box = (im_size[0] // 2, im_size[1] // 2, im_size[1], im_size[0]) + canvas_size = (im_size[1], im_size[0]) + img_mask = self.map_geom_to_mask(map_geom, local_box, canvas_size) + assert np.all(img_mask.shape[1:] == canvas_size) + return img_mask + + def render_egoposes_on_fancy_map(self, + nusc: NuScenes, + scene_tokens: List = None, + verbose: bool = True, + out_path: str = None, + render_egoposes: bool = True, + render_egoposes_range: bool = True, + render_legend: bool = True, + bitmap: Optional[BitMap] = None) -> Tuple[np.ndarray, Figure, Axes]: + """ + Renders each ego pose of a list of scenes on the map (around 40 poses per scene). 
+ This method is heavily inspired by NuScenes.render_egoposes_on_map(), but uses the map expansion pack maps. + Note that the maps are constantly evolving, whereas we only released a single snapshot of the data. + Therefore for some scenes there is a bad fit between ego poses and maps. + :param nusc: The NuScenes instance to load the ego poses from. + :param scene_tokens: Optional list of scene tokens corresponding to the current map location. + :param verbose: Whether to show status messages and progress bar. + :param out_path: Optional path to save the rendered figure to disk. + :param render_egoposes: Whether to render ego poses. + :param render_egoposes_range: Whether to render a rectangle around all ego poses. + :param render_legend: Whether to render the legend of map layers. + :param bitmap: Optional BitMap object to render below the other map layers. + :return: . Returns a matrix with n ego poses in global map coordinates. + """ + # Settings + patch_margin = 2 + min_diff_patch = 30 + + # Ids of scenes with a bad match between localization and map. + scene_blacklist = [499, 515, 517] + + # Get logs by location. + log_location = self.map_api.map_name + log_tokens = [log['token'] for log in nusc.log if log['location'] == log_location] + assert len(log_tokens) > 0, 'Error: This split has 0 scenes for location %s!' % log_location + + # Filter scenes. + scene_tokens_location = [e['token'] for e in nusc.scene if e['log_token'] in log_tokens] + if scene_tokens is not None: + scene_tokens_location = [t for t in scene_tokens_location if t in scene_tokens] + assert len(scene_tokens_location) > 0, 'Error: Found 0 valid scenes for location %s!' % log_location + + map_poses = [] + if verbose: + print('Adding ego poses to map...') + for scene_token in tqdm(scene_tokens_location, disable=not verbose): + # Check that the scene is from the correct location. + scene_record = nusc.get('scene', scene_token) + scene_name = scene_record['name'] + scene_id = int(scene_name.replace('scene-', '')) + log_record = nusc.get('log', scene_record['log_token']) + assert log_record['location'] == log_location, \ + 'Error: The provided scene_tokens do not correspond to the provided map location!' + + # Print a warning if the localization is known to be bad. + if verbose and scene_id in scene_blacklist: + print('Warning: %s is known to have a bad fit between ego pose and map.' % scene_name) + + # For each sample in the scene, store the ego pose. + sample_tokens = nusc.field2token('sample', 'scene_token', scene_token) + for sample_token in sample_tokens: + sample_record = nusc.get('sample', sample_token) + + # Poses are associated with the sample_data. Here we use the lidar sample_data. + sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) + pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) + + # Calculate the pose on the map and append. + map_poses.append(pose_record['translation']) + + # Check that ego poses aren't empty. + assert len(map_poses) > 0, 'Error: Found 0 ego poses. Please check the inputs.' + + # Compute number of close ego poses. + if verbose: + print('Creating plot...') + map_poses = np.vstack(map_poses)[:, :2] + + # Render the map patch with the current ego poses. 
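+ # The patch computed below is padded by `patch_margin` metres and grown to at
+ # least `min_diff_patch` metres per side so that short scenes still render with
+ # some map context. A hedged usage sketch (the scene token is hypothetical):
+ #   poses, fig, ax = explorer.render_egoposes_on_fancy_map(
+ #       nusc, scene_tokens=[my_scene_token], out_path='egoposes.png')
+ # where `poses` is an (n, 2) array of ego x/y positions in global map coordinates.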
+ min_patch = np.floor(map_poses.min(axis=0) - patch_margin) + max_patch = np.ceil(map_poses.max(axis=0) + patch_margin) + diff_patch = max_patch - min_patch + if any(diff_patch < min_diff_patch): + center_patch = (min_patch + max_patch) / 2 + diff_patch = np.maximum(diff_patch, min_diff_patch) + min_patch = center_patch - diff_patch / 2 + max_patch = center_patch + diff_patch / 2 + my_patch = (min_patch[0], min_patch[1], max_patch[0], max_patch[1]) + fig, ax = self.render_map_patch(my_patch, self.map_api.non_geometric_layers, figsize=(10, 10), + render_egoposes_range=render_egoposes_range, + render_legend=render_legend, bitmap=bitmap) + + # Plot in the same axis as the map. + # Make sure these are plotted "on top". + if render_egoposes: + ax.scatter(map_poses[:, 0], map_poses[:, 1], s=20, c='k', alpha=1.0, zorder=2) + plt.axis('off') + + if out_path is not None: + plt.savefig(out_path, bbox_inches='tight', pad_inches=0) + + return map_poses, fig, ax + + def render_next_roads(self, + x: float, + y: float, + alpha: float = 0.5, + figsize: Union[None, float, Tuple[float, float]] = None, + bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: + """ + Renders the possible next roads from a point of interest. + :param x: x coordinate of the point of interest. + :param y: y coordinate of the point of interest. + :param alpha: The opacity of each layer that gets rendered. + :param figsize: Size of the whole figure. + :param bitmap: Optional BitMap object to render below the other map layers. + """ + # Get next roads. + next_roads = self.map_api.get_next_roads(x, y) + layer_names = [] + tokens = [] + for layer_name, layer_tokens in next_roads.items(): + if len(layer_tokens) > 0: + layer_names.append(layer_name) + tokens.extend(layer_tokens) + + # Render them. + fig, ax = self.render_layers(layer_names, alpha, figsize, tokens=tokens, bitmap=bitmap) + + # Render current location with an x. + ax.plot(x, y, 'x', markersize=12, color='red') + + return fig, ax + + @staticmethod + def _clip_points_behind_camera(points, near_plane: float): + """ + Perform clipping on polygons that are partially behind the camera. + This method is necessary as the projection does not work for points behind the camera. + Hence we compute the line between the point and the camera and follow that line until we hit the near plane of + the camera. Then we use that point. + :param points: Matrix of points, where each point (x, y, z) is along each column. + :param near_plane: If we set the near_plane distance of the camera to 0 then some points will project to + infinity. Therefore we need to clip these points at the near plane. + :return: The clipped version of the polygon. This may have fewer points than the original polygon if some lines + were entirely behind the polygon. + """ + points_clipped = [] + # Loop through each line on the polygon. + # For each line where exactly 1 endpoints is behind the camera, move the point along the line until + # it hits the near plane of the camera (clipping). + assert points.shape[0] == 3 + point_count = points.shape[1] + for line_1 in range(point_count): + line_2 = (line_1 + 1) % point_count + point_1 = points[:, line_1] + point_2 = points[:, line_2] + z_1 = point_1[2] + z_2 = point_2[2] + + if z_1 >= near_plane and z_2 >= near_plane: + # Both points are in front. + # Add both points unless the first is already added. 
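+ # The element-wise comparison against points_clipped[-1] below is what prevents
+ # the shared endpoint of two consecutive fully-in-front segments from being
+ # appended twice.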
+ if len(points_clipped) == 0 or all(points_clipped[-1] != point_1): + points_clipped.append(point_1) + points_clipped.append(point_2) + elif z_1 < near_plane and z_2 < near_plane: + # Both points are in behind. + # Don't add anything. + continue + else: + # One point is in front, one behind. + # By convention pointA is behind the camera and pointB in front. + if z_1 <= z_2: + point_a = points[:, line_1] + point_b = points[:, line_2] + else: + point_a = points[:, line_2] + point_b = points[:, line_1] + z_a = point_a[2] + z_b = point_b[2] + + # Clip line along near plane. + pointdiff = point_b - point_a + alpha = (near_plane - z_b) / (z_a - z_b) + clipped = point_a + (1 - alpha) * pointdiff + assert np.abs(clipped[2] - near_plane) < 1e-6 + + # Add the first point (if valid and not duplicate), the clipped point and the second point (if valid). + if z_1 >= near_plane and (len(points_clipped) == 0 or all(points_clipped[-1] != point_1)): + points_clipped.append(point_1) + points_clipped.append(clipped) + if z_2 >= near_plane: + points_clipped.append(point_2) + + points_clipped = np.array(points_clipped).transpose() + return points_clipped + + def get_records_in_patch(self, + box_coords: Tuple[float, float, float, float], + layer_names: List[str] = None, + mode: str = 'intersect') -> Dict[str, List[str]]: + """ + Get all the record token that intersects or within a particular rectangular patch. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param layer_names: Names of the layers that we want to retrieve in a particular patch. + By default will always look for all non geometric layers. + :param mode: "intersect" will return all non geometric records that intersects the patch, + "within" will return all non geometric records that are within the patch. + :return: Dictionary of layer_name - tokens pairs. + """ + if mode not in ['intersect', 'within']: + raise ValueError("Mode {} is not valid, choice=('intersect', 'within')".format(mode)) + + if layer_names is None: + layer_names = self.map_api.non_geometric_layers + + records_in_patch = dict() + for layer_name in layer_names: + layer_records = [] + for record in getattr(self.map_api, layer_name): + token = record['token'] + if self.is_record_in_patch(layer_name, token, box_coords, mode): + layer_records.append(token) + + records_in_patch.update({layer_name: layer_records}) + + return records_in_patch + + def is_record_in_patch(self, + layer_name: str, + token: str, + box_coords: Tuple[float, float, float, float], + mode: str = 'intersect') -> bool: + """ + Query whether a particular record is in a rectangular patch. + :param layer_name: The layer name of the record. + :param token: The record token. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param mode: "intersect" means it will return True if the geometric object intersects the patch and False + otherwise, "within" will return True if the geometric object is within the patch and False otherwise. + :return: Boolean value on whether a particular record intersects or is within a particular patch. 
+ """ + if mode not in ['intersect', 'within']: + raise ValueError("Mode {} is not valid, choice=('intersect', 'within')".format(mode)) + + if layer_name in self.map_api.lookup_polygon_layers: + return self._is_polygon_record_in_patch(token, layer_name, box_coords, mode) + elif layer_name in self.map_api.non_geometric_line_layers: + return self._is_line_record_in_patch(token, layer_name, box_coords, mode) + else: + raise ValueError("{} is not a valid layer".format(layer_name)) + + def layers_on_point(self, x: float, y: float, layer_names: List[str] = None) -> Dict[str, str]: + """ + Returns all the polygonal layers that a particular point is on. + :param x: x coordinate of the point of interest. + :param y: y coordinate of the point of interest. + :param layer_names: The names of the layers to search for. + :return: All the polygonal layers that a particular point is on. + """ + # Default option. + if layer_names is None: + layer_names = self.map_api.non_geometric_polygon_layers + + layers_on_point = dict() + for layer_name in layer_names: + layers_on_point.update({layer_name: self.record_on_point(x, y, layer_name)}) + + return layers_on_point + + def record_on_point(self, x: float, y: float, layer_name: str) -> str: + """ + Query what record of a layer a particular point is on. + :param x: x coordinate of the point of interest. + :param y: y coordinate of the point of interest. + :param layer_name: The non geometric polygonal layer name that we are interested in. + :return: The first token of a layer a particular point is on or '' if no layer is found. + """ + if layer_name not in self.map_api.non_geometric_polygon_layers: + raise ValueError("{} is not a polygon layer".format(layer_name)) + + point = Point(x, y) + records = getattr(self.map_api, layer_name) + + if layer_name == 'drivable_area': + for record in records: + polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + for polygon in polygons: + if point.within(polygon): + return record['token'] + else: + pass + else: + for record in records: + polygon = self.map_api.extract_polygon(record['polygon_token']) + if point.within(polygon): + return record['token'] + else: + pass + + # If nothing is found, return an empty string. + return '' + + def extract_polygon(self, polygon_token: str) -> Polygon: + """ + Construct a shapely Polygon object out of a polygon token. + :param polygon_token: The token of the polygon record. + :return: The polygon wrapped in a shapely Polygon object. + """ + polygon_record = self.map_api.get('polygon', polygon_token) + + exterior_coords = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y']) + for token in polygon_record['exterior_node_tokens']] + + interiors = [] + for hole in polygon_record['holes']: + interior_coords = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y']) + for token in hole['node_tokens']] + if len(interior_coords) > 0: # Add only non-empty holes. + interiors.append(interior_coords) + + return Polygon(exterior_coords, interiors) + + def extract_line(self, line_token: str) -> LineString: + """ + Construct a shapely LineString object out of a line token. + :param line_token: The token of the line record. + :return: The line wrapped in a LineString object. 
+ """ + line_record = self.map_api.get('line', line_token) + line_nodes = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y']) + for token in line_record['node_tokens']] + + return LineString(line_nodes) + + def get_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: + """ + Get the bounds of the geometric object that corresponds to a non geometric record. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record. + :return: min_x, min_y, max_x, max_y of the line representation. + """ + if layer_name in self.map_api.non_geometric_polygon_layers: + return self._get_polygon_bounds(layer_name, token) + elif layer_name in self.map_api.non_geometric_line_layers: + return self._get_line_bounds(layer_name, token) + else: + raise ValueError("{} is not a valid layer".format(layer_name)) + + def _get_polygon_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: + """ + Get the extremities of the polygon object that corresponds to a non geometric record. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record. + :return: min_x, min_y, max_x, max_y of of the polygon or polygons (for drivable_area) representation. + """ + if layer_name not in self.map_api.non_geometric_polygon_layers: + raise ValueError("{} is not a record with polygon representation".format(token)) + + record = self.map_api.get(layer_name, token) + + if layer_name == 'drivable_area': + polygons = [self.map_api.get('polygon', polygon_token) for polygon_token in record['polygon_tokens']] + exterior_node_coords = [] + + for polygon in polygons: + nodes = [self.map_api.get('node', node_token) for node_token in polygon['exterior_node_tokens']] + node_coords = [(node['x'], node['y']) for node in nodes] + exterior_node_coords.extend(node_coords) + + exterior_node_coords = np.array(exterior_node_coords) + else: + exterior_nodes = [self.map_api.get('node', token) for token in record['exterior_node_tokens']] + exterior_node_coords = np.array([(node['x'], node['y']) for node in exterior_nodes]) + + xs = exterior_node_coords[:, 0] + ys = exterior_node_coords[:, 1] + + x2 = xs.max() + x1 = xs.min() + y2 = ys.max() + y1 = ys.min() + + return x1, y1, x2, y2 + + def _get_line_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: + """ + Get the bounds of the line object that corresponds to a non geometric record. + :param layer_name: Name of the layer that we are interested in. + :param token: Token of the record. + :return: min_x, min_y, max_x, max_y of of the line representation. + """ + if layer_name not in self.map_api.non_geometric_line_layers: + raise ValueError("{} is not a record with line representation".format(token)) + + record = self.map_api.get(layer_name, token) + nodes = [self.map_api.get('node', node_token) for node_token in record['node_tokens']] + node_coords = [(node['x'], node['y']) for node in nodes] + node_coords = np.array(node_coords) + + xs = node_coords[:, 0] + ys = node_coords[:, 1] + + x2 = xs.max() + x1 = xs.min() + y2 = ys.max() + y1 = ys.min() + + return x1, y1, x2, y2 + + def _is_polygon_record_in_patch(self, + token: str, + layer_name: str, + box_coords: Tuple[float, float, float, float], + mode: str = 'intersect') -> bool: + """ + Query whether a particular polygon record is in a rectangular patch. + :param layer_name: The layer name of the record. + :param token: The record token. 
+ :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param mode: "intersect" means it will return True if the geometric object intersects the patch and False + otherwise, "within" will return True if the geometric object is within the patch and False otherwise. + :return: Boolean value on whether a particular polygon record intersects or is within a particular patch. + """ + if layer_name not in self.map_api.lookup_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + x_min, y_min, x_max, y_max = box_coords + record = self.map_api.get(layer_name, token) + rectangular_patch = box(x_min, y_min, x_max, y_max) + + if layer_name == 'drivable_area': + polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + geom = MultiPolygon(polygons) + else: + geom = self.map_api.extract_polygon(record['polygon_token']) + + if mode == 'intersect': + return geom.intersects(rectangular_patch) + elif mode == 'within': + return geom.within(rectangular_patch) + + def _is_line_record_in_patch(self, + token: str, + layer_name: str, + box_coords: Tuple[float, float, float, float], + mode: str = 'intersect') -> bool: + """ + Query whether a particular line record is in a rectangular patch. + :param layer_name: The layer name of the record. + :param token: The record token. + :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). + :param mode: "intersect" means it will return True if the geometric object intersects the patch and False + otherwise, "within" will return True if the geometric object is within the patch and False otherwise. + :return: Boolean value on whether a particular line record intersects or is within a particular patch. + """ + if layer_name not in self.map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + # Retrieve nodes of this line. + record = self.map_api.get(layer_name, token) + node_recs = [self.map_api.get('node', node_token) for node_token in record['node_tokens']] + node_coords = [[node['x'], node['y']] for node in node_recs] + node_coords = np.array(node_coords) + + # A few lines in Queenstown have zero nodes. In this case we return False. + if len(node_coords) == 0: + return False + + # Check that nodes fall inside the path. + x_min, y_min, x_max, y_max = box_coords + cond_x = np.logical_and(node_coords[:, 0] < x_max, node_coords[:, 0] > x_min) + cond_y = np.logical_and(node_coords[:, 1] < y_max, node_coords[:, 1] > y_min) + cond = np.logical_and(cond_x, cond_y) + if mode == 'intersect': + return np.any(cond) + elif mode == 'within': + return np.all(cond) + + def _render_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None: + """ + Wrapper method that renders individual layers on an axis. + :param ax: The matplotlib axes where the layer will get rendered. + :param layer_name: Name of the layer that we are interested in. + :param alpha: The opacity of the layer to be rendered. + :param tokens: Optional list of tokens to render. None means all tokens are rendered. 
+ """ + if layer_name in self.map_api.non_geometric_polygon_layers: + self._render_polygon_layer(ax, layer_name, alpha, tokens) + elif layer_name in self.map_api.non_geometric_line_layers: + self._render_line_layer(ax, layer_name, alpha, tokens) + else: + raise ValueError("{} is not a valid layer".format(layer_name)) + + def _render_polygon_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None: + """ + Renders an individual non-geometric polygon layer on an axis. + :param ax: The matplotlib axes where the layer will get rendered. + :param layer_name: Name of the layer that we are interested in. + :param alpha: The opacity of the layer to be rendered. + :param tokens: Optional list of tokens to render. None means all tokens are rendered. + """ + if layer_name not in self.map_api.non_geometric_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + first_time = True + records = getattr(self.map_api, layer_name) + if tokens is not None: + records = [r for r in records if r['token'] in tokens] + if layer_name == 'drivable_area': + for record in records: + polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + + for polygon in polygons: + if first_time: + label = layer_name + first_time = False + else: + label = None + ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, + label=label)) + else: + for record in records: + polygon = self.map_api.extract_polygon(record['polygon_token']) + + if first_time: + label = layer_name + first_time = False + else: + label = None + + ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, + label=label)) + + def _render_line_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None: + """ + Renders an individual non-geometric line layer on an axis. + :param ax: The matplotlib axes where the layer will get rendered. + :param layer_name: Name of the layer that we are interested in. + :param alpha: The opacity of the layer to be rendered. + :param tokens: Optional list of tokens to render. None means all tokens are rendered. + """ + if layer_name not in self.map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + first_time = True + records = getattr(self.map_api, layer_name) + if tokens is not None: + records = [r for r in records if r['token'] in tokens] + for record in records: + if first_time: + label = layer_name + first_time = False + else: + label = None + line = self.map_api.extract_line(record['line_token']) + if line.is_empty: # Skip lines without nodes + continue + xs, ys = line.xy + + if layer_name == 'traffic_light': + # Draws an arrow with the physical traffic light as the starting point, pointing to the direction on + # where the traffic light points. + ax.add_patch(Arrow(xs[0], ys[0], xs[1]-xs[0], ys[1]-ys[0], color=self.color_map[layer_name], + label=label)) + else: + ax.plot(xs, ys, color=self.color_map[layer_name], alpha=alpha, label=label) + + def _get_layer_geom(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_name: str) -> List[Geometry]: + """ + Wrapper method that gets the geometries for each layer. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + :param layer_name: Name of map layer to be converted to binary map mask patch. 
+ :return: List of geometries for the given layer. + """ + if layer_name in self.map_api.non_geometric_polygon_layers: + return self._get_layer_polygon(patch_box, patch_angle, layer_name) + elif layer_name in self.map_api.non_geometric_line_layers: + return self._get_layer_line(patch_box, patch_angle, layer_name) + else: + raise ValueError("{} is not a valid layer".format(layer_name)) + + def _layer_geom_to_mask(self, + layer_name: str, + layer_geom: List[Geometry], + local_box: Tuple[float, float, float, float], + canvas_size: Tuple[int, int]) -> np.ndarray: + """ + Wrapper method that gets the mask for each layer's geometries. + :param layer_name: The name of the layer for which we get the masks. + :param layer_geom: List of the geometries of the layer specified in layer_name. + :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically + x_center = y_center = 0. + :param canvas_size: Size of the output mask (h, w). + """ + if layer_name in self.map_api.non_geometric_polygon_layers: + return self._polygon_geom_to_mask(layer_geom, local_box, layer_name, canvas_size) + elif layer_name in self.map_api.non_geometric_line_layers: + return self._line_geom_to_mask(layer_geom, local_box, layer_name, canvas_size) + else: + raise ValueError("{} is not a valid layer".format(layer_name)) + + @staticmethod + def mask_for_polygons(polygons: MultiPolygon, mask: np.ndarray) -> np.ndarray: + """ + Convert a polygon or multipolygon list to an image mask ndarray. + :param polygons: List of Shapely polygons to be converted to numpy array. + :param mask: Canvas where mask will be generated. + :return: Numpy ndarray polygon mask. + """ + if not polygons: + return mask + + def int_coords(x): + # function to round and convert to int + return np.array(x).round().astype(np.int32) + exteriors = [int_coords(poly.exterior.coords) for poly in polygons] + interiors = [int_coords(pi.coords) for poly in polygons for pi in poly.interiors] + cv2.fillPoly(mask, exteriors, 1) + cv2.fillPoly(mask, interiors, 0) + return mask + + @staticmethod + def mask_for_lines(lines: LineString, mask: np.ndarray) -> np.ndarray: + """ + Convert a Shapely LineString back to an image mask ndarray. + :param lines: List of shapely LineStrings to be converted to a numpy array. + :param mask: Canvas where mask will be generated. + :return: Numpy ndarray line mask. + """ + if lines.geom_type == 'MultiLineString': + for line in lines: + coords = np.asarray(list(line.coords), np.int32) + coords = coords.reshape((-1, 2)) + cv2.polylines(mask, [coords], False, 1, 2) + else: + coords = np.asarray(list(lines.coords), np.int32) + coords = coords.reshape((-1, 2)) + cv2.polylines(mask, [coords], False, 1, 2) + + return mask + + def _polygon_geom_to_mask(self, + layer_geom: List[Polygon], + local_box: Tuple[float, float, float, float], + layer_name: str, + canvas_size: Tuple[int, int]) -> np.ndarray: + """ + Convert polygon inside patch to binary mask and return the map patch. + :param layer_geom: list of polygons for each map layer + :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically + x_center = y_center = 0. + :param layer_name: name of map layer to be converted to binary map mask patch. + :param canvas_size: Size of the output mask (h, w). + :return: Binary map mask patch with the size canvas_size. 
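+ Note (editor's sketch of the transform below, with assumed numbers): for
+ local_box=(0, 0, 100, 100) and canvas_size=(200, 200) the geometry is translated
+ by (+50, +50) so the patch's lower-left corner lands at the origin, scaled by
+ 2.0 px/m in each axis, and then rasterised with cv2.fillPoly.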
+ """ + if layer_name not in self.map_api.non_geometric_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + patch_x, patch_y, patch_h, patch_w = local_box + + patch = self.get_patch_coord(local_box) + + canvas_h = canvas_size[0] + canvas_w = canvas_size[1] + + scale_height = canvas_h / patch_h + scale_width = canvas_w / patch_w + + trans_x = -patch_x + patch_w / 2.0 + trans_y = -patch_y + patch_h / 2.0 + + map_mask = np.zeros(canvas_size, np.uint8) + + for polygon in layer_geom: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) + new_polygon = affinity.scale(new_polygon, xfact=scale_width, yfact=scale_height, origin=(0, 0)) + + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + + # if new_polygon.area < 1000: + # continue + + if not isinstance(new_polygon, MultiPolygon): + print(new_polygon) + + continue + + map_mask = self.mask_for_polygons(new_polygon, map_mask) + + return map_mask + + def _line_geom_to_mask(self, + layer_geom: List[LineString], + local_box: Tuple[float, float, float, float], + layer_name: str, + canvas_size: Tuple[int, int]) -> Optional[np.ndarray]: + """ + Convert line inside patch to binary mask and return the map patch. + :param layer_geom: list of LineStrings for each map layer + :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically + x_center = y_center = 0. + :param layer_name: name of map layer to be converted to binary map mask patch. + :param canvas_size: Size of the output mask (h, w). + :return: Binary map mask patch in a canvas size. + """ + if layer_name not in self.map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + patch_x, patch_y, patch_h, patch_w = local_box + + patch = self.get_patch_coord(local_box) + + canvas_h = canvas_size[0] + canvas_w = canvas_size[1] + scale_height = canvas_h/patch_h + scale_width = canvas_w/patch_w + + trans_x = -patch_x + patch_w / 2.0 + trans_y = -patch_y + patch_h / 2.0 + + map_mask = np.zeros(canvas_size, np.uint8) + + if layer_name == 'traffic_light': + return None + + for line in layer_geom: + new_line = line.intersection(patch) + if not new_line.is_empty: + new_line = affinity.affine_transform(new_line, + [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) + new_line = affinity.scale(new_line, xfact=scale_width, yfact=scale_height, origin=(0, 0)) + + map_mask = self.mask_for_lines(new_line, map_mask) + return map_mask + + def _get_layer_polygon(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_name: str) -> List[Polygon]: + """ + Retrieve the polygons of a particular layer within the specified patch. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + :param layer_name: name of map layer to be extracted. + :return: List of Polygon in a patch box. 
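+ Note (editor's summary): each returned geometry is expressed in the patch frame,
+ i.e. the intersection with the patch is rotated by -patch_angle about the patch
+ centre, translated by (-patch_x, -patch_y), and wrapped into a MultiPolygon so
+ the return type is uniform.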
+ """ + if layer_name not in self.map_api.non_geometric_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.get_patch_coord(patch_box, patch_angle) + + records = getattr(self.map_api, layer_name) + + polygon_list = [] + if layer_name == 'drivable_area': + for record in records: + polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + + for polygon in polygons: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + else: + for record in records: + polygon = self.map_api.extract_polygon(record['polygon_token']) + + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def _get_layer_line(self, + patch_box: Tuple[float, float, float, float], + patch_angle: float, + layer_name: str) -> Optional[List[LineString]]: + """ + Retrieve the lines of a particular layer within the specified patch. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + :param layer_name: name of map layer to be converted to binary map mask patch. + :return: List of LineString in a patch box. + """ + if layer_name not in self.map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + if layer_name == 'traffic_light': + return None + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.get_patch_coord(patch_box, patch_angle) + + line_list = [] + records = getattr(self.map_api, layer_name) + for record in records: + line = self.map_api.extract_line(record['line_token']) + if line.is_empty: # Skip lines without nodes. + continue + + new_line = line.intersection(patch) + if not new_line.is_empty: + new_line = affinity.rotate(new_line, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_line = affinity.affine_transform(new_line, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + line_list.append(new_line) + + return line_list + + @staticmethod + def get_patch_coord(patch_box: Tuple[float, float, float, float], + patch_angle: float = 0.0) -> Polygon: + """ + Convert patch_box to shapely Polygon coordinates. + :param patch_box: Patch box defined as [x_center, y_center, height, width]. + :param patch_angle: Patch orientation in degrees. + :return: Box Polygon for patch_box. 
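+ Example (illustrative): patch_box=(0.0, 0.0, 2.0, 4.0) with patch_angle=0.0
+ yields box(-2.0, -1.0, 2.0, 1.0), i.e. a 4 m wide, 2 m tall axis-aligned
+ rectangle centred at the origin.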
+ """ + patch_x, patch_y, patch_h, patch_w = patch_box + + x_min = patch_x - patch_w / 2.0 + y_min = patch_y - patch_h / 2.0 + x_max = patch_x + patch_w / 2.0 + y_max = patch_y + patch_h / 2.0 + + patch = box(x_min, y_min, x_max, y_max) + patch = affinity.rotate(patch, patch_angle, origin=(patch_x, patch_y), use_radians=False) + + return patch + + def _get_figsize(self, figsize: Union[None, float, Tuple[float, float]]) -> Tuple[float, float]: + """ + Utility function that scales the figure size by the map canvas size. + If figsize is: + - None => Return default scale. + - Scalar => Scale canvas size. + - Two-tuple => Use the specified figure size. + :param figsize: The input figure size. + :return: The output figure size. + """ + # Divide canvas size by arbitrary scalar to get into cm range. + canvas_size = np.array(self.map_api.canvas_edge)[::-1] / 200 + + if figsize is None: + return tuple(canvas_size) + elif type(figsize) in [int, float]: + return tuple(canvas_size * figsize) + elif type(figsize) == tuple and len(figsize) == 2: + return figsize + else: + raise Exception('Error: Invalid figsize: %s' % figsize) diff --git a/mmcv/datasets/eval_utils/metric_utils.py b/mmcv/datasets/eval_utils/metric_utils.py new file mode 100644 index 0000000..1058703 --- /dev/null +++ b/mmcv/datasets/eval_utils/metric_utils.py @@ -0,0 +1,104 @@ +import torch +import math +import numpy as np +from typing import List, Dict, Tuple, Callable, Union + +def min_ade(traj: torch.Tensor, traj_gt: torch.Tensor, + masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Computes average displacement error for the best trajectory is a set, + with respect to ground truth + :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] + :param traj_gt: ground truth trajectory, shape + [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, shape + [batch_size, sequence_length] + :return errs, inds: errors and indices for modes with min error, shape + [batch_size] + """ + num_modes = traj.shape[1] + traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) + masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) + err = traj_gt_rpt - traj[:, :, :, 0:2] + err = torch.pow(err, exponent=2) + err = torch.sum(err, dim=3) + err = torch.pow(err, exponent=0.5) + err = torch.sum(err * (1 - masks_rpt), dim=2) / \ + torch.clip(torch.sum((1 - masks_rpt), dim=2), min=1) + err, inds = torch.min(err, dim=1) + + return err, inds + + +def min_fde(traj: torch.Tensor, traj_gt: torch.Tensor, + masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Computes final displacement error for the best trajectory is a set, + with respect to ground truth + :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] + :param traj_gt: ground truth trajectory, shape + [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, shape + [batch_size, sequence_length] + :return errs, inds: errors and indices for modes with min error, + shape [batch_size] + """ + num_modes = traj.shape[1] + traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) + lengths = torch.sum(1 - masks, dim=1).long() + inds = lengths.unsqueeze(1).unsqueeze( + 2).unsqueeze(3).repeat(1, num_modes, 1, 2) - 1 + + traj_last = torch.gather(traj[..., :2], dim=2, index=inds).squeeze(2) + traj_gt_last = torch.gather(traj_gt_rpt, dim=2, index=inds).squeeze(2) + + err = traj_gt_last - traj_last[..., 0:2] + err = torch.pow(err, exponent=2) + err = torch.sum(err, dim=2) + err 
= torch.pow(err, exponent=0.5) + err, inds = torch.min(err, dim=1) + + return err, inds + + +def miss_rate( + traj: torch.Tensor, + traj_gt: torch.Tensor, + masks: torch.Tensor, + dist_thresh: float = 2) -> torch.Tensor: + """ + Computes miss rate for mini batch of trajectories, + with respect to ground truth and given distance threshold + :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] + :param traj_gt: ground truth trajectory, + shape [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, + shape [batch_size, sequence_length] + :param dist_thresh: distance threshold for computing miss rate. + :return errs, inds: errors and indices for modes with min error, + shape [batch_size] + """ + num_modes = traj.shape[1] + + traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) + masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) + dist = traj_gt_rpt - traj[:, :, :, 0:2] + dist = torch.pow(dist, exponent=2) + dist = torch.sum(dist, dim=3) + dist = torch.pow(dist, exponent=0.5) + dist[masks_rpt.bool()] = -math.inf + dist, _ = torch.max(dist, dim=2) + dist, _ = torch.min(dist, dim=1) + m_r = torch.sum(torch.as_tensor(dist > dist_thresh)) / len(dist) + + return m_r + +def traj_fde(gt_box, pred_box, final_step): + if gt_box.traj.shape[0] <= 0: + return np.inf + final_step = min(gt_box.traj.shape[0], final_step) + gt_final = gt_box.traj[None, final_step-1] + pred_final = np.array(pred_box.traj)[:,final_step-1,:] + err = gt_final - pred_final + err = np.sqrt(np.sum(np.square(gt_final - pred_final), axis=-1)) + return np.min(err) \ No newline at end of file diff --git a/mmcv/datasets/eval_utils/nuscenes_eval.py b/mmcv/datasets/eval_utils/nuscenes_eval.py new file mode 100644 index 0000000..48a136c --- /dev/null +++ b/mmcv/datasets/eval_utils/nuscenes_eval.py @@ -0,0 +1,705 @@ +import argparse +import copy +import json +import numpy as np +import os +import time +from typing import Tuple, Dict, Any +import tqdm +from matplotlib import pyplot as plt +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS, TP_METRICS_UNITS, PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, DetectionMetricDataList +from nuscenes.eval.detection.evaluate import NuScenesEval +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import view_points, BoxVisibility +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.detection.utils import category_to_detection_name + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. 
+ :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. + if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. 
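+ Editor's note: in addition to the standard DetectionBox fields, the extra
+ 'token', 'visibility' and 'index' keys written by serialize() above are required.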
""" + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. 
+ sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' 
% box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
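+ # (Editor's note) This filter keeps only boxes whose 3D centre projects into
+ # more than one of the six cameras (count > 1), i.e. boxes in the overlap
+ # region between adjacent camera frustums; the tokens of kept boxes are also
+ # appended to 'center_overlap.txt'.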
+ cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. 
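+ # (Editor's note) Loading pipeline: read predictions and ground truth, attach
+ # ego-centre distances, filter both sets by the per-class range in
+ # self.cfg.class_range, and, when overlap_test is set, keep only boxes in the
+ # camera-overlap regions.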
+ if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + # assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + # "Samples in split doesn't match samples in predictions." + + # Add center distances. + self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). + + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. 
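+ # (AP is computed per class and distance threshold with calc_ap; TP errors are
+ # computed at dist_th_tp with calc_tp, skipping attribute/velocity/orientation
+ # errors for traffic cones and barriers, where they are undefined.)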
+ # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. + for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) diff --git a/mmcv/datasets/eval_utils/nuscenes_eval_motion.py b/mmcv/datasets/eval_utils/nuscenes_eval_motion.py new file mode 100644 index 0000000..8ff66f0 --- /dev/null +++ b/mmcv/datasets/eval_utils/nuscenes_eval_motion.py @@ -0,0 +1,933 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.utils.data_classes import Box +from nuscenes.eval.common.loaders import add_center_dist, filter_eval_boxes +import tqdm +from nuscenes.utils.geometry_utils import view_points, BoxVisibility +import pycocotools.mask as mask_util +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import add_center_dist, filter_eval_boxes +from nuscenes.eval.detection.algo import calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmcv.core.bbox.iou_calculators import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from 
nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +from nuscenes.utils.data_classes import LidarPointCloud +from nuscenes.utils.geometry_utils import view_points +from .eval_utils import load_prediction, load_gt, accumulate, accumulate_motion, \ + DetectionMotionBox, DetectionMotionBox_modified, DetectionMotionMetricData, \ + DetectionMotionMetrics, DetectionMotionMetricDataList +from .metric_utils import traj_fde +from prettytable import PrettyTable + +TP_METRICS = [ + 'trans_err', + 'scale_err', + 'orient_err', + 'vel_err', + 'attr_err', + 'min_ade_err', + 'min_fde_err', + 'miss_rate_err'] +TP_TRAJ_METRICS = ['min_ade_err', 'min_fde_err', 'miss_rate_err'] +Axis = Any + + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are + # plotted. + rel_metrics = [ + m for m in TP_METRICS if not np.isnan( + metrics.get_label_tp( + detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) + for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. + if ax is None: + ax = setup_axis( + title=PRETTY_DETECTION_NAMES[detection_name], + xlabel='Recall', + ylabel='Error', + xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format( + PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +def center_in_image(box, + intrinsic: np.ndarray, + imsize: Tuple[int, + int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . 
Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and( + center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + # True if a corner is at least 0.1 meter in front of the camera. + in_front = center_3d[2, :] > 0.1 + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, + intrinsic: np.ndarray, + imsize: Tuple[int, + int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and( + corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + # True if a corner is at least 0.1 meter in front of the camera. + in_front = corners_3d[2, :] > 0.1 + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
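+ # (Editor's note) Despite the copied docstring above, this helper filters on the
+ # nuScenes visibility token only: a box is kept if box.visibility equals the
+ # requested token ('1'-'4', covering 0-40% up to 80-100% visibility).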
+ eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token( + ori_eval_boxes, + valid_sample_tokens=[], + verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get( + 'calibrated_sensor', + sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box( + box.translation, + box.size, + Quaternion( + box.rotation), + name=box.detection_name, + token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image( + new_box, + cam_intrinsic, + imsize, + vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except BaseException: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +class MotionEval(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. 
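+ Editor's summary: compared to the standard detection evaluator, this class also
+ reports the trajectory TP metrics min_ade_err, min_fde_err and miss_rate_err and
+ provides 'motion_map' and 'epa' evaluation modes via evaluate_motion() and
+ evaluate_epa().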
+ """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None, + category_convert_type='motion_category', + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists( + result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. + if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionMotionBox, + verbose=verbose, category_convert_type=category_convert_type) + self.gt_boxes = load_gt( + self.nusc, + self.eval_set, + DetectionMotionBox_modified, + verbose=verbose, + category_convert_type=category_convert_type) + + assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + "Samples in split doesn't match samples in predictions." + + # Add center distances. + self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). 
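+ # (Editor's note) filter_eval_boxes applies the standard nuScenes preprocessing:
+ # boxes farther from the ego vehicle than the per-class limit in
+ # self.cfg.class_range are dropped before matching.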
+ + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes( + nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes( + nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap( + self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap( + self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility( + self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [ + key for ( + key, + value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token( + self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + def evaluate(self) -> Tuple[DetectionMotionMetrics, + DetectionMotionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMotionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md, _, _, _ = accumulate( + self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMotionMetrics(self.cfg) + + traj_metrics = {} + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap( + metric_data, + self.cfg.min_recall, + self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. 
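+ # (Editor's note) TP errors are read at the single threshold dist_th_tp; the
+ # trajectory errors in TP_TRAJ_METRICS are additionally collected into
+ # traj_metrics so print_traj_metrics can report them per class.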
+ for metric_name in TP_METRICS: + metric_data = metric_data_list[( + class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in [ + 'attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + if metric_name in TP_TRAJ_METRICS: + if class_name not in traj_metrics: + traj_metrics[class_name] = {} + traj_metrics[class_name][metric_name] = tp + metrics.add_label_tp(class_name, metric_name, tp) + print_traj_metrics(traj_metrics) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def evaluate_motion( + self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + self.cfg.dist_ths = [1.0] + self.cfg.dist_th_tp = 1.0 # center dist for detection + traj_dist_th = 2.0 # FDE for traj + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMotionMetricDataList() + + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md, _, _, _ = accumulate_motion( + self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, traj_fde, dist_th, traj_dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMotionMetrics(self.cfg) + + traj_metrics = {} + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap( + metric_data, + self.cfg.min_recall, + self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. + for metric_name in TP_METRICS: + metric_data = metric_data_list[( + class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in [ + 'attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + if metric_name in TP_TRAJ_METRICS: + if class_name not in traj_metrics: + traj_metrics[class_name] = {} + traj_metrics[class_name][metric_name] = tp + metrics.add_label_tp(class_name, metric_name, tp) + print_traj_metrics(traj_metrics) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def evaluate_epa( + self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + self.cfg.dist_ths = [2.0] + self.cfg.dist_th_tp = 2.0 # center dist for detection + traj_dist_th = 2.0 # FDE for traj + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. 
+ # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMotionMetricDataList() + + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md, N_det_tp, N_det_fp, N_det_gt = accumulate( + self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + md, N_det_traj_tp, N_det_traj_fp, N_det_traj_gt = accumulate_motion( + self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, traj_fde, dist_th, traj_dist_th) + metric_data_list.set(class_name, dist_th, md) + EPA = (N_det_traj_tp - 0.5 * N_det_fp) / (N_det_gt + 1e-5) + print(N_det_traj_tp, N_det_fp, N_det_gt) + print('EPA ', class_name, EPA) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMotionMetrics(self.cfg) + + traj_metrics = {} + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap( + metric_data, + self.cfg.min_recall, + self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. + for metric_name in TP_METRICS: + metric_data = metric_data_list[( + class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in [ + 'attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + if metric_name in TP_TRAJ_METRICS: + if class_name not in traj_metrics: + traj_metrics[class_name] = {} + traj_metrics[class_name][metric_name] = tp + metrics.add_label_tp(class_name, metric_name, tp) + print_traj_metrics(traj_metrics) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def main(self, + plot_examples: int = 0, + render_curves: bool = True, + eval_mode: str = 'standard') -> Dict[str, Any]: + """ + Main function that loads the evaluation code, visualizes samples, runs the evaluation and renders stat plots. + :param plot_examples: How many example visualizations to write to disk. + :param render_curves: Whether to render PR and TP curves to disk. + :return: A dict that stores the high-level metrics and meta data. + """ + if plot_examples > 0: + # Select a random but fixed subset to plot. + random.seed(42) + sample_tokens = list(self.sample_tokens) + random.shuffle(sample_tokens) + sample_tokens = sample_tokens[:plot_examples] + + # Visualize samples. + example_dir = os.path.join(self.output_dir, 'examples') + if not os.path.isdir(example_dir): + os.mkdir(example_dir) + for sample_token in sample_tokens: + visualize_sample(self.nusc, + sample_token, + self.gt_boxes if self.eval_set != 'test' else EvalBoxes(), + # Don't render test GT. + self.pred_boxes, + eval_range=max(self.cfg.class_range.values()), + savepath=os.path.join(example_dir, '{}.png'.format(sample_token))) + + # Run evaluation. + if eval_mode == 'motion_map': + metrics, metric_data_list = self.evaluate_motion() + elif eval_mode == 'standard': + metrics, metric_data_list = self.evaluate() + elif eval_mode == 'epa': + metrics, metric_data_list = self.evaluate_epa() + else: + raise NotImplementedError + # Render PR and TP curves. 
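+        # Editor's note: the 'epa' mode dispatched above scores joint detection and
+        # forecasting with the formula used in evaluate_epa(): a prediction only
+        # counts when both the box match (2 m center distance) and the forecast match
+        # (2 m final displacement error) succeed, and every false-positive detection
+        # costs half a hit:
+        #
+        #   def epa(n_det_traj_tp, n_det_fp, n_det_gt, eps=1e-5):
+        #       return (n_det_traj_tp - 0.5 * n_det_fp) / (n_det_gt + eps)
+        #
+        #   # e.g. 70 joint hits, 20 false positives, 100 GT boxes -> EPA ~= 0.60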
+ if render_curves: + self.render(metrics, metric_data_list) + + # Dump the metric data, meta and metrics to disk. + if self.verbose: + print('Saving metrics to: %s' % self.output_dir) + metrics_summary = metrics.serialize() + metrics_summary['meta'] = self.meta.copy() + with open(os.path.join(self.output_dir, 'metrics_summary.json'), 'w') as f: + json.dump(metrics_summary, f, indent=2) + with open(os.path.join(self.output_dir, 'metrics_details.json'), 'w') as f: + json.dump(metric_data_list.serialize(), f, indent=2) + + # Print high-level metrics. + print('mAP: %.4f' % (metrics_summary['mean_ap'])) + err_name_mapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + for tp_name, tp_val in metrics_summary['tp_errors'].items(): + print('%s: %.4f' % (err_name_mapping[tp_name], tp_val)) + print('NDS: %.4f' % (metrics_summary['nd_score'])) + print('Eval time: %.1fs' % metrics_summary['eval_time']) + + # Print per-class metrics. + print() + print('Per-class results:') + print('Object Class\tAP\tATE\tASE\tAOE\tAVE\tAAE') + class_aps = metrics_summary['mean_dist_aps'] + class_tps = metrics_summary['label_tp_errors'] + for class_name in class_aps.keys(): + print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' + % (class_name, class_aps[class_name], + class_tps[class_name]['trans_err'], + class_tps[class_name]['scale_err'], + class_tps[class_name]['orient_err'], + class_tps[class_name]['vel_err'], + class_tps[class_name]['attr_err'])) + + return metrics_summary + + def render(self, metrics: DetectionMetrics, + md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot( + md_list, + metrics, + min_precision=self.cfg.min_precision, + min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, + savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve( + md_list, + metrics, + detection_name, + self.cfg.min_precision, + self.cfg.min_recall, + savepath=savepath( + detection_name + + '_pr')) + + class_tp_curve( + md_list, + metrics, + detection_name, + self.cfg.min_recall, + self.cfg.dist_th_tp, + savepath=savepath( + detection_name + + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve( + md_list, + metrics, + dist_th, + self.cfg.min_precision, + self.cfg.min_recall, + savepath=savepath( + 'dist_pr_' + + str(dist_th))) + + +def print_traj_metrics(metrics): + class_names = metrics.keys() + x = PrettyTable() + x.field_names = ["class names"] + TP_TRAJ_METRICS + for class_name in metrics.keys(): + row_data = [class_name] + for m in TP_TRAJ_METRICS: + row_data.append('%.4f' % metrics[class_name][m]) + x.add_row(row_data) + print(x) + + +if __name__ == "__main__": + + # Settings. 
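+    # Editor's note: example standalone invocation (the script path is hypothetical;
+    # the flags match the parser defined below):
+    #
+    #   python motion_eval.py results/results_nusc.json \
+    #       --output_dir work_dirs/motion_eval \
+    #       --eval_set val --dataroot data/nuscenes --version v1.0-trainval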
+ parser = argparse.ArgumentParser( + description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + 'result_path', + type=str, + help='The submission as a JSON file.') + parser.add_argument( + '--output_dir', + type=str, + default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument( + '--eval_set', + type=str, + default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument( + '--version', + type=str, + default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument( + '--config_path', + type=str, + default='', + help='Path to the configuration file.' + 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument( + '--plot_examples', + type=int, + default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = MotionEval( + nusc_, + config=cfg_, + result_path=result_path_, + eval_set=eval_set_, + output_dir=output_dir_, + verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main( + plot_examples=plot_examples_, + render_curves=render_curves_) diff --git a/mmcv/datasets/lyft_dataset.py b/mmcv/datasets/lyft_dataset.py new file mode 100644 index 0000000..34707ee --- /dev/null +++ b/mmcv/datasets/lyft_dataset.py @@ -0,0 +1,561 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +import os +import pandas as pd +import tempfile +from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft +from lyft_dataset_sdk.utils.data_classes import Box as LyftBox +from os import path as osp +from pyquaternion import Quaternion + +from mmcv.core.evaluation.lyft_eval import lyft_eval +from mmcv.datasets import DATASETS +from mmcv.core import show_result +# from mmcv.core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes +from mmcv.core.bbox.structures.box_3d_mode import Box3DMode +from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from .custom_3d import Custom3DDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class LyftDataset(Custom3DDataset): + r"""Lyft Dataset. + + This class serves as the API for experiments on the Lyft Dataset. + + Please refer to + ``_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. 
+ pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + data_root (str): Path of dataset root. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ # noqa: E501 + NameMapping = { + 'bicycle': 'bicycle', + 'bus': 'bus', + 'car': 'car', + 'emergency_vehicle': 'emergency_vehicle', + 'motorcycle': 'motorcycle', + 'other_vehicle': 'other_vehicle', + 'pedestrian': 'pedestrian', + 'truck': 'truck', + 'animal': 'animal' + } + DefaultAttribute = { + 'car': 'is_stationary', + 'truck': 'is_stationary', + 'bus': 'is_stationary', + 'emergency_vehicle': 'is_stationary', + 'other_vehicle': 'is_stationary', + 'motorcycle': 'is_stationary', + 'bicycle': 'is_stationary', + 'pedestrian': 'is_stationary', + 'animal': 'is_stationary' + } + CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', + 'motorcycle', 'bicycle', 'pedestrian', 'animal') + + def __init__(self, + ann_file, + pipeline=None, + data_root=None, + classes=None, + load_interval=1, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False): + self.load_interval = load_interval + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode) + + if self.modality is None: + self.modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + data = mmcv.load(ann_file) + data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + return data_infos + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. 
It includes the following keys: + + - sample_idx (str): sample index + - pts_filename (str): filename of point clouds + - sweeps (list[dict]): infos of sweeps + - timestamp (float): sample timestamp + - img_filename (str, optional): image filename + - lidar2img (list[np.ndarray], optional): transformations \ + from lidar to different cameras + - ann_info (dict): annotation info + """ + info = self.data_infos[index] + + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + timestamp=info['timestamp'] / 1e6, + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes. + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. + """ + info = self.data_infos[index] + gt_bboxes_3d = info['gt_boxes'] + gt_names_3d = info['gt_names'] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if 'gt_shape' in info: + gt_shape = info['gt_shape'] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_shape], axis=-1) + + # the lyft box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + ) + return anns_results + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
+ """ + lyft_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + annos = [] + boxes = output_to_lyft_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + lyft_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + name=name, + score=box.score) + annos.append(lyft_anno) + lyft_annos[sample_token] = annos + lyft_submissions = { + 'meta': self.modality, + 'results': lyft_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_lyft.json') + print('Results writes to', res_path) + mmcv.dump(lyft_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in Lyft protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. + """ + + output_dir = osp.join(*osp.split(result_path)[:-1]) + lyft = Lyft( + data_path=osp.join(self.data_root, self.version), + json_path=osp.join(self.data_root, self.version, self.version), + verbose=True) + eval_set_map = { + 'v1.01-train': 'val', + } + metrics = lyft_eval(lyft, self.data_root, result_path, + eval_set_map[self.version], output_dir, logger) + + # record metrics + detail = dict() + metric_prefix = f'{result_name}_Lyft' + + for i, name in enumerate(metrics['class_names']): + AP = float(metrics['mAPs_cate'][i]) + detail[f'{metric_prefix}/{name}_AP'] = AP + + detail[f'{metric_prefix}/mAP'] = metrics['Final mAP'] + return detail + + def format_results(self, results, jsonfile_prefix=None, csv_savepath=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + csv_savepath (str | None): The path for saving csv files. + It includes the file path and the csv filename, + e.g., "a/b/filename.csv". If not specified, + the result will not be converted to csv file. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. 
list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on Lyft + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + if csv_savepath is not None: + self.json2csv(result_files['pts_bbox'], csv_savepath) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + csv_savepath=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in Lyft protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + csv_savepath (str | None): The path for saving csv files. + It includes the file path and the csv filename, + e.g., "a/b/filename.csv". If not specified, + the result will not be converted to csv file. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Evaluation results. + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix, + csv_savepath) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print(f'Evaluating bboxes of {name}') + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=dict(backend='disk')), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=dict(backend='disk')), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' 
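+        # The loop below converts the points and boxes from the LiDAR convention to
+        # the depth/visualizer convention (Coord3DMode / Box3DMode LIDAR -> DEPTH) and
+        # keeps predictions with score > 0.1 before calling show_result().
+        #
+        # Editor's usage sketch for the evaluation entry point defined above (paths,
+        # `eval_pipeline` and `results` are hypothetical stand-ins):
+        #
+        #   dataset = LyftDataset(ann_file='data/lyft/lyft_infos_val.pkl',
+        #                         pipeline=eval_pipeline, data_root='data/lyft/')
+        #   metrics = dataset.evaluate(results,
+        #                              jsonfile_prefix='work_dirs/lyft/results',
+        #                              csv_savepath='work_dirs/lyft/submission.csv')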
+ pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'pts_bbox' in result.keys(): + result = result['pts_bbox'] + data_info = self.data_infos[i] + pts_path = data_info['lidar_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points = self._extract_data(i, pipeline, 'points').numpy() + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + inds = result['scores_3d'] > 0.1 + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + pred_bboxes = result['boxes_3d'][inds].tensor.numpy() + show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, + file_name, show) + + def json2csv(self, json_path, csv_savepath): + """Convert the json file to csv format for submission. + + Args: + json_path (str): Path of the result json file. + csv_savepath (str): Path to save the csv file. + """ + results = mmcv.load(json_path)['results'] + sample_list_path = osp.join(self.data_root, 'sample_submission.csv') + data = pd.read_csv(sample_list_path) + Id_list = list(data['Id']) + pred_list = list(data['PredictionString']) + cnt = 0 + print('Converting the json to csv...') + for token in results.keys(): + cnt += 1 + predictions = results[token] + prediction_str = '' + for i in range(len(predictions)): + prediction_str += \ + str(predictions[i]['score']) + ' ' + \ + str(predictions[i]['translation'][0]) + ' ' + \ + str(predictions[i]['translation'][1]) + ' ' + \ + str(predictions[i]['translation'][2]) + ' ' + \ + str(predictions[i]['size'][0]) + ' ' + \ + str(predictions[i]['size'][1]) + ' ' + \ + str(predictions[i]['size'][2]) + ' ' + \ + str(Quaternion(list(predictions[i]['rotation'])) + .yaw_pitch_roll[0]) + ' ' + \ + predictions[i]['name'] + ' ' + prediction_str = prediction_str[:-1] + idx = Id_list.index(token) + pred_list[idx] = prediction_str + df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list}) + mmcv.mkdir_or_exist(os.path.dirname(csv_savepath)) + df.to_csv(csv_savepath, index=False) + + +def output_to_lyft_box(detection): + """Convert the output to the box class in the Lyft. + + Args: + detection (dict): Detection results. + + Returns: + list[:obj:`LyftBox`]: List of standard LyftBoxes. + """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + box = LyftBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i]) + box_list.append(box) + return box_list + + +def lidar_lyft_box_to_global(info, boxes): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes. + + Returns: + list: List of standard LyftBoxes in the global + coordinate. 
+ """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # Move box to global coord system + box.rotate(Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list \ No newline at end of file diff --git a/mmcv/datasets/map_utils/mean_ap.py b/mmcv/datasets/map_utils/mean_ap.py new file mode 100644 index 0000000..9b3a49b --- /dev/null +++ b/mmcv/datasets/map_utils/mean_ap.py @@ -0,0 +1,390 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from multiprocessing import Pool +from shapely.geometry import LineString, Polygon +import mmcv +import numpy as np +from mmcv.utils import print_log +from terminaltables import AsciiTable +import json +from os import path as osp +import os +from functools import partial +from .tpfp import tpfp_gen, custom_tpfp_gen +from mmcv.fileio.io import dump,load + +def average_precision(recalls, precisions, mode='area'): + """Calculate average precision (for single or multiple scales). + + Args: + recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode (str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + +def get_cls_results(gen_results, + annotations, + num_sample=100, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + class_id=0, + fix_interval=False): + """Get det results and gt information of a certain class. + + Args: + gen_results (list[list]): Same as `eval_map()`. + annotations (list[dict]): Same as `eval_map()`. + class_id (int): ID of a specific class. 
+ + Returns: + tuple[list[np.ndarray]]: detected bboxes, gt bboxes + """ + # if len(gen_results) == 0 or + + cls_gens, cls_scores = [], [] + for res in gen_results['vectors']: + if res['type'] == class_id: + if len(res['pts']) < 2: + continue + if not eval_use_same_gt_sample_num_flag: + sampled_points = np.array(res['pts']) + else: + line = res['pts'] + line = LineString(line) + + if fix_interval: + distances = list(np.arange(1., line.length, 1.)) + distances = [0,] + distances + [line.length,] + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + else: + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gens.append(sampled_points) + cls_scores.append(res['confidence_level']) + num_res = len(cls_gens) + if num_res > 0: + cls_gens = np.stack(cls_gens).reshape(num_res,-1) + cls_scores = np.array(cls_scores)[:,np.newaxis] + cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + else: + if not eval_use_same_gt_sample_num_flag: + cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1)) + else: + cls_gens = np.zeros((0,num_sample*2+1)) + # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') + + cls_gts = [] + for ann in annotations['vectors']: + if ann['type'] == class_id: + # line = ann['pts'] + np.array((1,1)) # for hdmapnet + line = ann['pts'] + # line = ann['pts'].cumsum(0) + line = LineString(line) + distances = np.linspace(0, line.length, num_sample) + sampled_points = np.array([list(line.interpolate(distance).coords) + for distance in distances]).reshape(-1, 2) + + cls_gts.append(sampled_points) + num_gts = len(cls_gts) + if num_gts > 0: + cls_gts = np.stack(cls_gts).reshape(num_gts,-1) + else: + cls_gts = np.zeros((0,num_sample*2)) + return cls_gens, cls_gts + # ones = np.ones((num_gts,1)) + # tmp_cls_gens = np.concatenate([cls_gts,ones],axis=-1) + # return tmp_cls_gens, cls_gts + +def format_res_gt_by_classes(result_path, + gen_results, + annotations, + cls_names=None, + num_pred_pts_per_instance=30, + eval_use_same_gt_sample_num_flag=False, + pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], + nproc=24): + assert cls_names is not None + timer = mmcv.Timer() + num_fixed_sample_pts = 100 + fix_interval = False + print('results path: {}'.format(result_path)) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + assert len(gen_results) == len(annotations) + + pool = Pool(nproc) + cls_gens, cls_gts = {}, {} + print('Formatting ...') + formatting_file = 'cls_formatted.pkl' + formatting_file = osp.join(output_dir,formatting_file) + + # for vis + if False: + from PIL import Image + import matplotlib.pyplot as plt + from matplotlib import transforms + from matplotlib.patches import Rectangle + + show_dir = osp.join(output_dir,'vis_json') + mmcv.mkdir_or_exist(osp.abspath(show_dir)) + # import pdb;pdb.set_trace() + car_img = Image.open('./figs/lidar_car.png') + colors_plt = ['r', 'b', 'g'] + for i in range(20): + + plt.figure(figsize=(2, 4)) + plt.xlim(pc_range[0], pc_range[3]) + plt.ylim(pc_range[1], pc_range[4]) + plt.axis('off') + + for line in gen_results[i]['vectors']: + l = np.array(line['pts']) + plt.plot(l[:,0],l[:,1],'-', + # color=colors[line['type']] + color = 'red', + ) + + for line in annotations[i]['vectors']: + # l = np.array(line['pts']) + np.array((1,1)) + l = np.array(line['pts']) + # l = line['pts'] + 
plt.plot(l[:,0],l[:,1],'-', + # color=colors[line['type']], + color = 'blue', + ) + plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5]) + map_path = osp.join(show_dir, 'COMPARE_MAP_{}.jpg'.format(i)) + plt.savefig(map_path, bbox_inches='tight', dpi=400) + plt.close() + + for i, clsname in enumerate(cls_names): + + gengts = pool.starmap( + partial(get_cls_results, num_sample=num_fixed_sample_pts, + num_pred_pts_per_instance=num_pred_pts_per_instance, + eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval), + zip(list(gen_results.values()), annotations)) + # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval), + # zip(gen_results, annotations)) + # import pdb;pdb.set_trace() + gens, gts = tuple(zip(*gengts)) + cls_gens[clsname] = gens + cls_gts[clsname] = gts + + dump([cls_gens, cls_gts],formatting_file) + print('Cls data formatting done in {:2f}s!! with {}'.format(float(timer.since_start()),formatting_file)) + pool.close() + return cls_gens, cls_gts + +def eval_map(gen_results, + annotations, + cls_gens, + cls_gts, + threshold=0.5, + cls_names=None, + logger=None, + tpfp_fn=None, + pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], + metric=None, + num_pred_pts_per_instance=30, + nproc=24): + timer = mmcv.Timer() + pool = Pool(nproc) + + eval_results = [] + + for i, clsname in enumerate(cls_names): + + # get gt and det bboxes of this class + cls_gen = cls_gens[clsname] + cls_gt = cls_gts[clsname] + # choose proper function according to datasets to compute tp and fp + # XXX + # func_name = cls2func[clsname] + # tpfp_fn = tpfp_fn_dict[tpfp_fn_name] + tpfp_fn = custom_tpfp_gen + # Trick for serialized + # only top-level function can be serized + # somehow use partitial the return function is defined + # at the top level. 
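+        # Editor's note, clarifying the comment above: multiprocessing pickles the
+        # worker function it sends to the pool, and only module-level functions
+        # pickle cleanly. custom_tpfp_gen is therefore defined at module scope in
+        # .tpfp and only wrapped with functools.partial here; a partial of a
+        # picklable function stays picklable. Minimal illustration of the pattern:
+        #
+        #   from functools import partial
+        #   from multiprocessing import Pool
+        #
+        #   def add(x, y):              # top-level, hence picklable
+        #       return x + y
+        #
+        #   with Pool(2) as p:
+        #       p.starmap(partial(add, y=10), [(1,), (2,), (3,)])   # -> [11, 12, 13]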
+ + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric) + # import pdb; pdb.set_trace() + # TODO this is a hack + tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric) + args = [] + # compute tp and fp for each image with multiple processes + tpfp = pool.starmap( + tpfp_fn, + zip(cls_gen, cls_gt, *args)) + # import pdb;pdb.set_trace() + tp, fp = tuple(zip(*tpfp)) + + + + # map_results = map( + # tpfp_fn, + # cls_gen, cls_gt) + # tp, fp = tuple(map(list, zip(*map_results))) + + + # debug and testing + # for i in range(len(cls_gen)): + # # print(i) + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # print(i) + # tpfp = (tpfp,) + # print(tpfp) + # i = 0 + # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) + # import pdb; pdb.set_trace() + + # XXX + + num_gts = 0 + for j, bbox in enumerate(cls_gt): + num_gts += bbox.shape[0] + + # sort all det bboxes by score, also sort tp and fp + # import pdb;pdb.set_trace() + cls_gen = np.vstack(cls_gen) + num_dets = cls_gen.shape[0] + sort_inds = np.argsort(-cls_gen[:, -1]) #descending, high score front + tp = np.hstack(tp)[sort_inds] + fp = np.hstack(fp)[sort_inds] + + # calculate recall and precision with tp and fp + # num_det*num_res + tp = np.cumsum(tp, axis=0) + fp = np.cumsum(fp, axis=0) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts, eps) + precisions = tp / np.maximum((tp + fp), eps) + + # calculate AP + # if dataset != 'voc07' else '11points' + mode = 'area' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + print('cls:{} done in {:2f}s!!'.format(clsname,float(timer.since_last_check()))) + pool.close() + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if len(aps) else 0.0 + + print_map_summary( + mean_ap, eval_results, class_name=cls_names, logger=logger) + + return mean_ap, eval_results + + + +def print_map_summary(mean_ap, + results, + class_name=None, + scale_ranges=None, + logger=None): + """Print mAP and results of each class. + + A table will be printed to show the gts/dets/recall/AP of each class and + the mAP. + + Args: + mean_ap (float): Calculated from `eval_map()`. + results (list[dict]): Calculated from `eval_map()`. + dataset (list[str] | str | None): Dataset name or dataset classes. + scale_ranges (list[tuple] | None): Range of scales to be evaluated. + logger (logging.Logger | str | None): The way to print the mAP + summary. See `mmcv.utils.print_log()` for details. Default: None. 
+ """ + + if logger == 'silent': + return + + if isinstance(results[0]['ap'], np.ndarray): + num_scales = len(results[0]['ap']) + else: + num_scales = 1 + + if scale_ranges is not None: + assert len(scale_ranges) == num_scales + + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + label_names = class_name + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + + header = ['class', 'gts', 'dets', 'recall', 'ap'] + for i in range(num_scales): + if scale_ranges is not None: + print_log(f'Scale range {scale_ranges[i]}', logger=logger) + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print_log('\n' + table.table, logger=logger) diff --git a/mmcv/datasets/map_utils/struct.py b/mmcv/datasets/map_utils/struct.py new file mode 100644 index 0000000..1f20fee --- /dev/null +++ b/mmcv/datasets/map_utils/struct.py @@ -0,0 +1,438 @@ +import numpy as np +import torch +from shapely.geometry import LineString +from mmcv.datasets.pipelines import to_tensor + +class LiDARInstanceLines(object): + """Line instance in LIDAR coordinates + + """ + def __init__(self, + instance_line_list, + sample_dist=1, + num_samples=250, + padding=False, + fixed_num=-1, + padding_value=-10000, + patch_size=None): + assert isinstance(instance_line_list, list) + assert patch_size is not None + if len(instance_line_list) != 0: + assert isinstance(instance_line_list[0], LineString) + self.patch_size = patch_size + self.max_x = self.patch_size[1] / 2 + self.max_y = self.patch_size[0] / 2 + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = padding + self.fixed_num = fixed_num + self.padding_value = padding_value + + self.instance_list = instance_line_list + + @property + def start_end_points(self): + """ + return torch.Tensor([N,4]), in xstart, ystart, xend, yend form + """ + assert len(self.instance_list) != 0 + instance_se_points_list = [] + for instance in self.instance_list: + se_points = [] + se_points.extend(instance.coords[0]) + se_points.extend(instance.coords[-1]) + instance_se_points_list.append(se_points) + instance_se_points_array = np.array(instance_se_points_list) + instance_se_points_tensor = to_tensor(instance_se_points_array) + instance_se_points_tensor = instance_se_points_tensor.to( + dtype=torch.float32) + instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_se_points_tensor + + @property + def bbox(self): + """ + return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form + """ + assert len(self.instance_list) != 0 
+ instance_bbox_list = [] + for instance in self.instance_list: + # bounds is bbox: [xmin, ymin, xmax, ymax] + instance_bbox_list.append(instance.bounds) + instance_bbox_array = np.array(instance_bbox_list) + instance_bbox_tensor = to_tensor(instance_bbox_array) + instance_bbox_tensor = instance_bbox_tensor.to( + dtype=torch.float32) + instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_bbox_tensor + + @property + def fixed_num_sampled_points(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def fixed_num_sampled_points_ambiguity(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + instance_points_tensor = instance_points_tensor.unsqueeze(1) + return instance_points_tensor + + @property + def fixed_num_sampled_points_torch(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + # distances = np.linspace(0, instance.length, self.fixed_num) + # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + poly_pts = to_tensor(np.array(list(instance.coords))) + poly_pts = poly_pts.unsqueeze(0).permute(0,2,1) + sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True) + sampled_pts = sampled_pts.permute(0,2,1).squeeze(0) + instance_points_list.append(sampled_pts) + # instance_points_array = np.array(instance_points_list) + 
# instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = torch.stack(instance_points_list,dim=0) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def shift_fixed_num_sampled_points(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v1(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + shift_pts_list = [] + if is_poly: + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def 
shift_fixed_num_sampled_points_v2(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + + if shifts_num > final_shift_num: + index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False) + multi_shifts_pts = multi_shifts_pts[index] + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < final_shift_num: + padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v3(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + flip_pts_to_shift = 
np.flip(pts_to_shift, axis=0) + for shift_right_i in range(shift_num): + shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + # import pdb;pdb.set_trace() + if shifts_num > 2*final_shift_num: + index = np.random.choice(shift_num, final_shift_num, replace=False) + flip0_shifts_pts = multi_shifts_pts[index] + flip1_shifts_pts = multi_shifts_pts[index+shift_num] + multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0) + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num: + padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v4(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + shift_pts_list = [] + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + flip_pts_to_shift = pts_to_shift.flip(0) + for shift_right_i in range(shift_num): + shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = 
np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_torch(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points_torch + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor \ No newline at end of file diff --git a/mmcv/datasets/map_utils/tpfp.py b/mmcv/datasets/map_utils/tpfp.py new file mode 100644 index 0000000..a40ea1d --- /dev/null +++ b/mmcv/datasets/map_utils/tpfp.py @@ -0,0 +1,363 @@ +import mmcv +import numpy as np + +from mmcv.core.evaluation.bbox_overlaps import bbox_overlaps +from .tpfp_chamfer import vec_iou, convex_iou, rbbox_iou, polyline_score, custom_polyline_score +from shapely.geometry import LineString, Polygon +# from vecmapnet_ops.ops.iou import convex_iou + +def tpfp_bbox(det_bboxes, + gt_bboxes, + gt_bbox_masks, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_dets = len(det_bboxes) + num_gts = len(gt_bboxes) + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] 
= 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) + bbox_g = gt_bboxes.reshape(num_gts,-1,2) + bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) + matrix = convex_iou(bbox_p,bbox_g,bbox_gm) + + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_rbbox(det_bboxes, + gt_bboxes, + gt_bbox_masks, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_dets = len(det_bboxes) + num_gts = len(gt_bboxes) + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) + bbox_g = gt_bboxes.reshape(num_gts,-1,2) + bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) + matrix = rbbox_iou(bbox_p,bbox_g,bbox_gm) + + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_det(det_bboxes, + gt_bboxes, + threshold=0.5): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. 
The shape of + each array is (num_scales, m). + """ + + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + + # tp and fp + tp = np.zeros((num_dets), dtype=np.float32) + fp = np.zeros((num_dets), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + # XXX + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_dets == 0: + return tp, fp + + # # distance matrix: n x m + matrix = vec_iou( + det_bboxes[:, :-1].reshape(num_dets,-1,2), + gt_bboxes.reshape(num_gts,-1,2)) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-det_bboxes[:, -1]) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='POR'): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). + gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + # matrix = custom_polyline_score( + # gen_lines[:,:-1].reshape(num_gens,-1,2), + # gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + + # TODO MAY bug here + matrix = polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + + +def custom_tpfp_gen(gen_lines, + gt_lines, + threshold=0.5, + metric='chamfer'): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). + gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). 
+ gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, + of shape (k, 4). Default: None + iou_thr (float): IoU threshold to be considered as matched. + Default: 0.5. + use_legacy_coordinate (bool): Whether to use coordinate system in + mmdet v1.x. which means width, height should be + calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. + Default: False. + + Returns: + tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of + each array is (num_scales, m). + """ + if metric == 'chamfer': + if threshold >0: + threshold= -threshold + # else: + # raise NotImplementedError + + # import pdb;pdb.set_trace() + num_gens = gen_lines.shape[0] + num_gts = gt_lines.shape[0] + + # tp and fp + tp = np.zeros((num_gens), dtype=np.float32) + fp = np.zeros((num_gens), dtype=np.float32) + + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if num_gts == 0: + fp[...] = 1 + return tp, fp + + if num_gens == 0: + return tp, fp + + gen_scores = gen_lines[:,-1] # n + # distance matrix: n x m + + matrix = custom_polyline_score( + gen_lines[:,:-1].reshape(num_gens,-1,2), + gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) + # for each det, the max iou with all gts + matrix_max = matrix.max(axis=1) + # for each det, which gt overlaps most with it + matrix_argmax = matrix.argmax(axis=1) + # sort all dets in descending order by scores + sort_inds = np.argsort(-gen_scores) + + gt_covered = np.zeros(num_gts, dtype=bool) + + # tp = 0 and fp = 0 means ignore this detected bbox, + for i in sort_inds: + if matrix_max[i] >= threshold: + matched_gt = matrix_argmax[i] + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[i] = 1 + else: + fp[i] = 1 + else: + fp[i] = 1 + + return tp, fp + diff --git a/mmcv/datasets/map_utils/tpfp_chamfer.py b/mmcv/datasets/map_utils/tpfp_chamfer.py new file mode 100644 index 0000000..db55fdd --- /dev/null +++ b/mmcv/datasets/map_utils/tpfp_chamfer.py @@ -0,0 +1,335 @@ +# from ..chamfer_dist import ChamferDistance +import numpy as np +from shapely.geometry import LineString, Polygon +from shapely.strtree import STRtree +from shapely.geometry import CAP_STYLE, JOIN_STYLE +from scipy.spatial import distance +import similaritymeasures + +# def chamfer_distance(pred_bbox, gt_bbox): + +# cd_dist_func = ChamferDistance.vec_cd_dist( +# pred, pred_mask, tgt, tgt_mask)() + + +def vec_iou(pred_lines, gt_lines): + ''' + each line with 1 meter width + pred_lines: num_preds, npts, 2 + gt_lines: num_gts, npts, 2 + ''' + + num_preds = pred_lines.shape[0] + num_gts = gt_lines.shape[0] + + pred_lines_shapely = \ + [LineString(i).buffer(1., + cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(1., + cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + for i in gt_lines] + + # construct tree + tree = STRtree(gt_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(gt_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(pred_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + gt_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[i, gt_id] = inter / union + + return iou_matrix + +def convex_iou(pred_lines, gt_lines, gt_mask): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + + num_preds = len(pred_lines) + 
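+     # Note (added): unlike vec_iou above, which buffers each polyline by 1 m,
+     # this routine compares convex hulls: every predicted point set and every
+     # masked GT point set is collapsed to its convex-hull Polygon, and IoU is
+     # computed only for hull pairs that the STRtree query reports as
+     # intersecting. The tree is built over the predictions here, so the loop
+     # below iterates over the GT hulls.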
num_gts = len(gt_lines) + + pred_lines_shapely = \ + [Polygon(i).convex_hull for i in pred_lines] + gt_lines_shapely =\ + [Polygon(i[m].reshape(-1,2)).convex_hull for i,m in zip(gt_lines,gt_mask)] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + +def rbbox_iou(pred_lines, gt_lines, gt_mask): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + + num_preds = len(pred_lines) + num_gts = len(gt_lines) + + pred_lines_shapely = \ + [Polygon(i).minimum_rotated_rectangle for i in pred_lines] + gt_lines_shapely =\ + [Polygon(i[m].reshape(-1,2)) for i,m in zip(gt_lines,gt_mask)] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + iou_matrix = np.zeros((num_preds, num_gts)) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + + +def polyline_score(pred_lines, gt_lines, linewidth=1., metric='POR'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + positive_threshold = 1. + num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + if metric=='POR': + iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + elif metric=='frechet': + iou_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='chamfer': + iou_matrix = np.full((num_preds, num_gts), -100.) + elif metric=='chamfer_v2': + iou_matrix = np.full((num_preds, num_gts), -100.) + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + if metric=='POR': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = (dist_mat.min(-1) < positive_threshold).sum() + valid_ba = (dist_mat.min(-2) < positive_threshold).sum() + + iou_matrix[pred_id, i] = min(valid_ba,valid_ab) / line_length + # iou_matrix[pred_id, i] = ((valid_ba+valid_ab)/2) / line_length + # assert iou_matrix[pred_id, i] <= 1. and iou_matrix[pred_id, i] >= 0. 
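+                 # Note (added): the 'POR' score computed above is a point
+                 # overlap ratio, i.e. the fraction of sampled points lying
+                 # within positive_threshold (1 m) of the other curve, taken in
+                 # both directions and bounded by the smaller side. Example:
+                 # with line_length = 20, 15 predicted points within 1 m of the
+                 # GT and 18 GT points within 1 m of the prediction give
+                 # min(18, 15) / 20 = 0.75. The 'frechet' branch below stores
+                 # the negative Frechet distance (larger is better) and tries
+                 # both point orders so direction does not penalise a match.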
+ elif metric=='frechet': + fdistance_1 = \ + -similaritymeasures.frechet_dist(pred_lines[pred_id], gt_lines[i]) + fdistance_2 = \ + -similaritymeasures.frechet_dist(pred_lines[pred_id][::-1], gt_lines[i]) + fdistance = max(fdistance_1,fdistance_2) + iou_matrix[pred_id, i] = fdistance + + elif metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = dist_mat.min(-1).sum() + valid_ba = dist_mat.min(-2).sum() + + iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/(2*line_length) + # if iou_matrix[pred_id, i] == 0: + # import ipdb; ipdb.set_trace() + elif metric=='chamfer_v2': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + + valid_ab = dist_mat.min(-1).sum() + valid_ba = dist_mat.min(-2).sum() + + iou_matrix[pred_id, i] = -(valid_ba/pred_lines[pred_id].shape[0] + +valid_ab/gt_lines[i].shape[0])/2 + # if iou_matrix[pred_id, i] == 0: + # import ipdb; ipdb.set_trace() + + + # if True: + # import matplotlib.pyplot as plt + # print('pred num', num_preds) + # print('gt num', num_gts) + # for i in range(num_preds): + # plt.plot(pred_lines[i][:,0],pred_lines[i][:,1],'-',color='red',alpha=0.5) + # for i in range(num_gts): + # plt.plot(gt_lines[i][:,0],gt_lines[i][:,1],'-',color='blue',alpha=0.5) + # plt.savefig('test.png') + # plt.close() + return iou_matrix + + +def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'): + ''' + each line with 1 meter width + pred_lines: num_preds, List [npts, 2] + gt_lines: num_gts, npts, 2 + gt_mask: num_gts, npts, 2 + ''' + if metric == 'iou': + linewidth = 1.0 + positive_threshold = 1. + num_preds = len(pred_lines) + num_gts = len(gt_lines) + line_length = pred_lines.shape[1] + + # gt_lines = gt_lines + np.array((1.,1.)) + + pred_lines_shapely = \ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in pred_lines] + gt_lines_shapely =\ + [LineString(i).buffer(linewidth, + cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + for i in gt_lines] + + # construct tree + tree = STRtree(pred_lines_shapely) + index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) + + + if metric=='chamfer': + iou_matrix = np.full((num_preds, num_gts), -100.) 
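+     # Note (added): the placeholder values differ because the two metrics live
+     # on different scales. Chamfer scores are stored as negative mean
+     # point-to-point distances (larger, i.e. closer to 0, is better), so
+     # non-intersecting pairs keep a very negative sentinel (-100.), while IoU
+     # scores lie in [0, 1] and default to 0. This is also why custom_tpfp_gen
+     # in tpfp.py flips a positive chamfer threshold to its negative before
+     # comparing against these scores.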
+ elif metric=='iou': + iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) + else: + raise NotImplementedError + + for i, pline in enumerate(gt_lines_shapely): + + for o in tree.query(pline): + if o.intersects(pline): + pred_id = index_by_id[id(o)] + + if metric=='chamfer': + dist_mat = distance.cdist( + pred_lines[pred_id], gt_lines[i], 'euclidean') + # import pdb;pdb.set_trace() + valid_ab = dist_mat.min(-1).mean() + valid_ba = dist_mat.min(-2).mean() + + iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/2 + elif metric=='iou': + inter = o.intersection(pline).area + union = o.union(pline).area + iou_matrix[pred_id, i] = inter / union + + return iou_matrix + +if __name__ == '__main__': + import torch + + line1 = torch.tensor([ + [1, 5], [3, 5], [5, 5] + ]) + + line0 = torch.tensor([ + [3, 6], [4, 8], [5, 6] + ]) + + line2 = torch.tensor([ + [1, 4], [3, 4], [5, 4] + ]) + + line3 = torch.tensor([ + [4, 4], [3, 3], [5, 3] + ]) + + gt = torch.stack((line2, line3), dim=0).type(torch.float32) + pred = torch.stack((line0, line1), dim=0).type(torch.float32) + + # import ipdb; ipdb.set_trace() + import mmcv + # with mmcv.Timer(): + # gt = upsampler(gt, pts=10) + # pred = upsampler(pred, pts=10) + + import matplotlib.pyplot as plt + from shapely.geometry import LineString + from descartes import PolygonPatch + + iou_matrix = vec_iou(pred,gt) + print(iou_matrix) + # import pdb;pdb.set_trace() + score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer') + print(score_matrix) + fig, ax = plt.subplots() + for i in gt: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='red') + plt.plot(i[:, 0], i[:, 1], '-', color='red') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) + patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + for i in pred: + i = i.numpy() + plt.plot(i[:, 0], i[:, 1], 'o', color='blue') + plt.plot(i[:, 0], i[:, 1], '-', color='blue') + + dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) + patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1) + ax.add_patch(patch1) + + + ax.axis('equal') + + + plt.savefig('test3.png') \ No newline at end of file diff --git a/mmcv/datasets/nuscenes_dataset.py b/mmcv/datasets/nuscenes_dataset.py new file mode 100644 index 0000000..e9c76e0 --- /dev/null +++ b/mmcv/datasets/nuscenes_dataset.py @@ -0,0 +1,658 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import pyquaternion +import tempfile +from nuscenes.utils.data_classes import Box as NuScenesBox +from os import path as osp + +from mmcv.datasets import DATASETS +from mmcv.fileio.io import load, dump +from mmcv.utils import track_iter_progress, mkdir_or_exist +from mmcv.core import show_result +from mmcv.core.bbox.structures.box_3d_mode import Box3DMode +from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from .custom_3d import Custom3DDataset +from .pipelines import Compose + + +@DATASETS.register_module() +class NuScenesDataset(Custom3DDataset): + r"""NuScenes Dataset. + + This class serves as the API for experiments on the NuScenes Dataset. + + Please refer to `NuScenes Dataset `_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + data_root (str): Path of dataset root. 
+ classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR' in this dataset. Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + eval_version (bool, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool): Whether to use `use_valid_flag` key in the info + file as mask to filter gt_boxes and gt_names. Defaults to False. + """ + NameMapping = { + 'movable_object.barrier': 'barrier', + 'vehicle.bicycle': 'bicycle', + 'vehicle.bus.bendy': 'bus', + 'vehicle.bus.rigid': 'bus', + 'vehicle.car': 'car', + 'vehicle.construction': 'construction_vehicle', + 'vehicle.motorcycle': 'motorcycle', + 'human.pedestrian.adult': 'pedestrian', + 'human.pedestrian.child': 'pedestrian', + 'human.pedestrian.construction_worker': 'pedestrian', + 'human.pedestrian.police_officer': 'pedestrian', + 'movable_object.trafficcone': 'traffic_cone', + 'vehicle.trailer': 'trailer', + 'vehicle.truck': 'truck' + } + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + AttrMapping = { + 'cycle.with_rider': 0, + 'cycle.without_rider': 1, + 'pedestrian.moving': 2, + 'pedestrian.standing': 3, + 'pedestrian.sitting_lying_down': 4, + 'vehicle.moving': 5, + 'vehicle.parked': 6, + 'vehicle.stopped': 7, + } + AttrMapping_rev = [ + 'cycle.with_rider', + 'cycle.without_rider', + 'pedestrian.moving', + 'pedestrian.standing', + 'pedestrian.sitting_lying_down', + 'vehicle.moving', + 'vehicle.parked', + 'vehicle.stopped', + ] + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + + def __init__(self, + ann_file, + pipeline=None, + data_root=None, + classes=None, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d='LiDAR', + filter_empty_gt=True, + test_mode=False, + eval_version='detection_cvpr_2019', + use_valid_flag=False): + self.load_interval = load_interval + self.use_valid_flag = use_valid_flag + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + 
filter_empty_gt=filter_empty_gt, + test_mode=test_mode) + + self.with_velocity = with_velocity + self.eval_version = eval_version + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + # self.eval_detection_configs.class_names = list(self.eval_detection_configs.class_names) + if self.modality is None: + self.modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + + def get_cat_ids(self, idx): + """Get category distribution of single scene. + + Args: + idx (int): Index of the data_info. + + Returns: + dict[list]: for each category, if the current scene + contains such boxes, store a list containing idx, + otherwise, store empty list. + """ + info = self.data_infos[idx] + if self.use_valid_flag: + mask = info['valid_flag'] + gt_names = set(info['gt_names'][mask]) + else: + gt_names = set(info['gt_names']) + + cat_ids = [] + for name in gt_names: + if name in self.CLASSES: + cat_ids.append(self.cat2id[name]) + return cat_ids + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + data = load(ann_file) + data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + return data_infos + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + timestamp=info['timestamp'] / 1e6, + ) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + )) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + return input_dict + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. 
+ + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. + """ + info = self.data_infos[index] + # filter out bbox containing no points + if self.use_valid_flag: + mask = info['valid_flag'] + else: + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d) + return anns_results + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. + """ + + # import pdb + # pdb.set_trace() + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(track_iter_progress(results)): + annos = [] + boxes = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + #pdb.set_trace() + + mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. 
+ logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. + """ + from nuscenes import NuScenes + from nuscenes.eval.detection.evaluate import NuScenesEval + + output_dir = osp.join(*osp.split(result_path)[:-1]) + nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + nusc_eval = NuScenesEval( + nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False) + nusc_eval.main(render_curves=False) + + # record metrics + metrics = load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. 
list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmcvection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + file_client_args=dict(backend='disk')), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=10, + file_client_args=dict(backend='disk')), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['points']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' 
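+         # Note (added): for each sample this method re-loads the raw points
+         # through the default (or user-supplied) pipeline, converts points and
+         # boxes from LiDAR to Depth coordinates for the visualizer, keeps only
+         # predictions with scores_3d > 0.1, and saves the visualization under
+         # a file name derived from the sample's lidar path.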
+ pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'pts_bbox' in result.keys(): + result = result['pts_bbox'] + data_info = self.data_infos[i] + pts_path = data_info['lidar_path'] + file_name = osp.split(pts_path)[-1].split('.')[0] + points = self._extract_data(i, pipeline, 'points').numpy() + # for now we convert points into depth mode + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + inds = result['scores_3d'] > 0.1 + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() + show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + pred_bboxes = result['boxes_3d'][inds].tensor.numpy() + show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, + Box3DMode.DEPTH) + show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, + file_name, show) + + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. + """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = NuScenesBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. 
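+         # Note (added): the range filter runs in the ego frame, i.e. after the
+         # lidar->ego rotation/translation above: the radial distance is the XY
+         # norm of the box centre, and the per-class limit comes from
+         # eval_configs.class_range. Boxes beyond their class range are dropped
+         # before the ego->global transform below.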
+ cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list diff --git a/mmcv/datasets/nuscenes_e2e_dataset.py b/mmcv/datasets/nuscenes_e2e_dataset.py new file mode 100644 index 0000000..38b3ffc --- /dev/null +++ b/mmcv/datasets/nuscenes_e2e_dataset.py @@ -0,0 +1,1247 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import copy +import numpy as np +import torch +from mmcv.datasets import DATASETS +from mmcv.datasets.pipelines import to_tensor +from mmcv.datasets import NuScenesDataset +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from mmcv.fileio.file_client import FileClient +from mmcv.fileio.io import load, dump +from mmcv.utils import track_iter_progress, mkdir_or_exist +from os import path as osp +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .eval_utils.nuscenes_eval import NuScenesEval_custom +from nuscenes.eval.tracking.evaluate import TrackingEval +from .eval_utils.nuscenes_eval_motion import MotionEval +from nuscenes.eval.common.config import config_factory +import tempfile +from mmcv.parallel import DataContainer as DC +import random +import pickle +from prettytable import PrettyTable + +from nuscenes import NuScenes +from mmcv.datasets.data_utils.vector_map import VectorizedLocalMap +from mmcv.datasets.data_utils.rasterize import preprocess_map +from mmcv.datasets.eval_utils.map_api import NuScenesMap +from mmcv.datasets.data_utils.trajectory_api import NuScenesTraj +from .data_utils.data_utils import lidar_nusc_box_to_global, obtain_map_info, output_to_nusc_box, output_to_nusc_box_det +from nuscenes.prediction import convert_local_coords_to_global + + +@DATASETS.register_module() +class NuScenesE2EDataset(NuScenesDataset): + r"""NuScenes E2E Dataset. + + This dataset only add camera intrinsics and extrinsics to the results. 
+ """ + + def __init__(self, + queue_length=4, + bev_size=(200, 200), + patch_size=(102.4, 102.4), + canvas_size=(200, 200), + overlap_test=False, + predict_steps=12, + planning_steps=6, + past_steps=4, + fut_steps=4, + use_nonlinear_optimizer=False, + lane_ann_file=None, + eval_mod=None, + + # For debug + is_debug=False, + len_debug=30, + + # Occ dataset + enbale_temporal_aug=False, + occ_receptive_field=3, + occ_n_future=4, + occ_filter_invalid_sample=False, + occ_filter_by_valid_flag=False, + + file_client_args=dict(backend='disk'), + *args, + **kwargs): + # init before super init since it is called in parent class + self.file_client_args = file_client_args + self.file_client = FileClient(**file_client_args) + + self.is_debug = is_debug + self.len_debug = len_debug + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + self.predict_steps = predict_steps + self.planning_steps = planning_steps + self.past_steps = past_steps + self.fut_steps = fut_steps + self.scene_token = None + self.lane_infos = self.load_annotations(lane_ann_file) \ + if lane_ann_file else None + self.eval_mod = eval_mod + + self.use_nonlinear_optimizer = use_nonlinear_optimizer + + self.nusc = NuScenes(version=self.version, + dataroot=self.data_root, verbose=True) + + self.map_num_classes = 3 + if canvas_size[0] == 50: + self.thickness = 1 + elif canvas_size[0] == 200: + self.thickness = 2 + else: + assert False + self.angle_class = 36 + self.patch_size = patch_size + self.canvas_size = canvas_size + self.nusc_maps = { + 'boston-seaport': NuScenesMap(dataroot=self.data_root, map_name='boston-seaport'), + 'singapore-hollandvillage': NuScenesMap(dataroot=self.data_root, map_name='singapore-hollandvillage'), + 'singapore-onenorth': NuScenesMap(dataroot=self.data_root, map_name='singapore-onenorth'), + 'singapore-queenstown': NuScenesMap(dataroot=self.data_root, map_name='singapore-queenstown'), + } + self.vector_map = VectorizedLocalMap( + self.data_root, + patch_size=self.patch_size, + canvas_size=self.canvas_size) + self.traj_api = NuScenesTraj(self.nusc, + self.predict_steps, + self.planning_steps, + self.past_steps, + self.fut_steps, + self.with_velocity, + self.CLASSES, + self.box_mode_3d, + self.use_nonlinear_optimizer) + + # Occ + self.enbale_temporal_aug = enbale_temporal_aug + assert self.enbale_temporal_aug is False + + self.occ_receptive_field = occ_receptive_field # past + current + self.occ_n_future = occ_n_future # future only + self.occ_filter_invalid_sample = occ_filter_invalid_sample + self.occ_filter_by_valid_flag = occ_filter_by_valid_flag + self.occ_only_total_frames = 7 # NOTE: hardcode, not influenced by planning + + def __len__(self): + if not self.is_debug: + return len(self.data_infos) + else: + return self.len_debug + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations sorted by timestamps. 
+ """ + if self.file_client_args['backend'] == 'disk': + # data_infos = mmcv.load(ann_file) + data = pickle.loads(self.file_client.get(ann_file)) + data_infos = list( + sorted(data['infos'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + elif self.file_client_args['backend'] == 'petrel': + data = pickle.loads(self.file_client.get(ann_file)) + data_infos = list( + sorted(data['infos'], key=lambda e: e['timestamp'])) + data_infos = data_infos[::self.load_interval] + self.metadata = data['metadata'] + self.version = self.metadata['version'] + else: + assert False, 'Invalid file_client_args!' + return data_infos + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + img: queue_length, 6, 3, H, W + img_metas: img_metas of each frame (list) + gt_globals_3d: gt_globals of each frame (list) + gt_bboxes_3d: gt_bboxes of each frame (list) + gt_inds: gt_inds of each frame (list) + """ + data_queue = [] + self.enbale_temporal_aug = False + if self.enbale_temporal_aug: + # temporal aug + prev_indexs_list = list(range(index-self.queue_length, index)) + random.shuffle(prev_indexs_list) + prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True) + input_dict = self.get_data_info(index) + else: + # ensure the first and final frame in same scene + final_index = index + first_index = index - self.queue_length + 1 + if first_index < 0: + return None + if self.data_infos[first_index]['scene_token'] != \ + self.data_infos[final_index]['scene_token']: + return None + # current timestamp + input_dict = self.get_data_info(final_index) + prev_indexs_list = list(reversed(range(first_index, final_index))) + if input_dict is None: + return None + frame_idx = input_dict['frame_idx'] + scene_token = input_dict['scene_token'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + + assert example['gt_labels_3d'].data.shape[0] == example['gt_fut_traj'].shape[0] + assert example['gt_labels_3d'].data.shape[0] == example['gt_past_traj'].shape[0] + + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + data_queue.insert(0, example) + + # retrieve previous infos + + for i in prev_indexs_list: + if self.enbale_temporal_aug: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token: + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()): + return None + frame_idx = input_dict['frame_idx'] + assert example['gt_labels_3d'].data.shape[0] == example['gt_fut_traj'].shape[0] + assert example['gt_labels_3d'].data.shape[0] == example['gt_past_traj'].shape[0] + data_queue.insert(0, copy.deepcopy(example)) + data_queue = self.union2one(data_queue) + return data_queue + + def prepare_test_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. 
+ img: queue_length, 6, 3, H, W + img_metas: img_metas of each frame (list) + gt_labels_3d: gt_labels of each frame (list) + gt_bboxes_3d: gt_bboxes of each frame (list) + gt_inds: gt_inds of each frame(list) + """ + + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + data_dict = {} + for key, value in example.items(): + if 'l2g' in key: + data_dict[key] = to_tensor(value[0]) + else: + data_dict[key] = value + return data_dict + + def union2one(self, queue): + """ + convert sample dict into one single sample. + """ + imgs_list = [each['img'].data for each in queue] + gt_labels_3d_list = [each['gt_labels_3d'].data for each in queue] + gt_sdc_label_list = [each['gt_sdc_label'].data for each in queue] + gt_inds_list = [to_tensor(each['gt_inds']) for each in queue] + gt_bboxes_3d_list = [each['gt_bboxes_3d'].data for each in queue] + gt_past_traj_list = [to_tensor(each['gt_past_traj']) for each in queue] + gt_past_traj_mask_list = [ + to_tensor(each['gt_past_traj_mask']) for each in queue] + gt_sdc_bbox_list = [each['gt_sdc_bbox'].data for each in queue] + l2g_r_mat_list = [to_tensor(each['l2g_r_mat']) for each in queue] + l2g_t_list = [to_tensor(each['l2g_t']) for each in queue] + timestamp_list = [to_tensor(each['timestamp']) for each in queue] + gt_fut_traj = to_tensor(queue[-1]['gt_fut_traj']) + gt_fut_traj_mask = to_tensor(queue[-1]['gt_fut_traj_mask']) + # gt_sdc_fut_traj = to_tensor(queue[-1]['gt_sdc_fut_traj']) + # gt_sdc_fut_traj_mask = to_tensor(queue[-1]['gt_sdc_fut_traj_mask']) + # gt_future_boxes_list = queue[-1]['gt_future_boxes'] + # gt_future_labels_list = [to_tensor(each) + # for each in queue[-1]['gt_future_labels']] + + metas_map = {} + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if i == 0: + metas_map[i]['prev_bev'] = False + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + + queue['gt_labels_3d'] = DC(gt_labels_3d_list) + queue['gt_sdc_label'] = DC(gt_sdc_label_list) + queue['gt_inds'] = DC(gt_inds_list) + queue['gt_bboxes_3d'] = DC(gt_bboxes_3d_list, cpu_only=True) + queue['gt_sdc_bbox'] = DC(gt_sdc_bbox_list, cpu_only=True) + queue['l2g_r_mat'] = DC(l2g_r_mat_list) + queue['l2g_t'] = DC(l2g_t_list) + queue['timestamp'] = DC(timestamp_list) + queue['gt_fut_traj'] = DC(gt_fut_traj) + queue['gt_fut_traj_mask'] = DC(gt_fut_traj_mask) + queue['gt_past_traj'] = DC(gt_past_traj_list) + queue['gt_past_traj_mask'] = DC(gt_past_traj_mask_list) + # queue['gt_future_boxes'] = DC(gt_future_boxes_list, cpu_only=True) + # queue['gt_future_labels'] = DC(gt_future_labels_list) + return queue + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. 
+ + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. + - gt_inds (np.ndarray): Instance ids of ground truths. + - gt_fut_traj (np.ndarray): . + - gt_fut_traj_mask (np.ndarray): . + """ + info = self.data_infos[index] + # filter out bbox containing no points + if self.use_valid_flag: + mask = info['valid_flag'] + else: + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_inds = info['gt_inds'][mask] + + sample = self.nusc.get('sample', info['token']) + ann_tokens = np.array(sample['anns'])[mask] + assert ann_tokens.shape[0] == gt_bboxes_3d.shape[0] + + gt_fut_traj, gt_fut_traj_mask, gt_past_traj, gt_past_traj_mask = self.traj_api.get_traj_label( + info['token'], ann_tokens) + + sdc_vel = self.traj_api.sdc_vel_info[info['token']] + gt_sdc_bbox, gt_sdc_label = self.traj_api.generate_sdc_info(sdc_vel) + gt_sdc_fut_traj, gt_sdc_fut_traj_mask = self.traj_api.get_sdc_traj_label( + info['token']) + + sdc_planning, sdc_planning_mask, command = self.traj_api.get_sdc_planning_label( + info['token']) + + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d, + gt_inds=gt_inds, + gt_fut_traj=gt_fut_traj, + gt_fut_traj_mask=gt_fut_traj_mask, + gt_past_traj=gt_past_traj, + gt_past_traj_mask=gt_past_traj_mask, + gt_sdc_bbox=gt_sdc_bbox, + gt_sdc_label=gt_sdc_label, + gt_sdc_fut_traj=gt_sdc_fut_traj, + gt_sdc_fut_traj_mask=gt_sdc_fut_traj_mask, + sdc_planning=sdc_planning, + sdc_planning_mask=sdc_planning_mask, + command=command, + ) + assert gt_fut_traj.shape[0] == gt_labels_3d.shape[0] + assert gt_past_traj.shape[0] == gt_labels_3d.shape[0] + return anns_results + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + + # semantic format + lane_info = self.lane_infos[index] if self.lane_infos else None + # panoptic format + location = self.nusc.get('log', self.nusc.get( + 'scene', info['scene_token'])['log_token'])['location'] + vectors = self.vector_map.gen_vectorized_samples(location, + info['ego2global_translation'], + info['ego2global_rotation']) + semantic_masks, instance_masks, forward_masks, backward_masks = preprocess_map(vectors, + self.patch_size, + self.canvas_size, + self.map_num_classes, + self.thickness, + self.angle_class) + instance_masks = np.rot90(instance_masks, k=-1, axes=(1, 2)) + instance_masks = torch.tensor(instance_masks.copy()) + gt_labels = [] + gt_bboxes = [] + gt_masks = [] + for cls in range(self.map_num_classes): + for i in np.unique(instance_masks[cls]): + if i == 0: + continue + gt_mask = (instance_masks[cls] == i).to(torch.uint8) + ys, xs = np.where(gt_mask) + gt_bbox = [min(xs), min(ys), max(xs), max(ys)] + gt_labels.append(cls) + gt_bboxes.append(gt_bbox) + gt_masks.append(gt_mask) + map_mask = obtain_map_info(self.nusc, + self.nusc_maps, + info, + patch_size=self.patch_size, + canvas_size=self.canvas_size, + layer_names=['lane_divider', 'road_divider']) + map_mask = np.flip(map_mask, axis=1) + map_mask = np.rot90(map_mask, k=-1, axes=(1, 2)) + map_mask = torch.tensor(map_mask.copy()) + for i, gt_mask in enumerate(map_mask[:-1]): + ys, xs = np.where(gt_mask) + gt_bbox = [min(xs), min(ys), max(xs), max(ys)] + gt_labels.append(i + self.map_num_classes) + gt_bboxes.append(gt_bbox) + gt_masks.append(gt_mask) + gt_labels = torch.tensor(gt_labels) + gt_bboxes = torch.tensor(np.stack(gt_bboxes)) + gt_masks = torch.stack(gt_masks) + + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + pts_filename=info['lidar_path'], + sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + map_filename=lane_info['maps']['map_mask'] if lane_info else None, + gt_lane_labels=gt_labels, + gt_lane_bboxes=gt_bboxes, + gt_lane_masks=gt_masks, + ) + + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + l2e_r_mat = Quaternion(l2e_r).rotation_matrix + e2g_r_mat = Quaternion(e2g_r).rotation_matrix + + l2g_r_mat = l2e_r_mat.T @ e2g_r_mat.T + l2g_t = l2e_t @ e2g_r_mat.T + e2g_t + + input_dict.update( + dict( + l2g_r_mat=l2g_r_mat.astype(np.float32), + l2g_t=l2g_t.astype(np.float32))) + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + 
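+            # For reference: `lidar2img_rt` composes the padded intrinsics with the
+            # lidar->camera extrinsics built above, so a homogeneous LiDAR point
+            # p = [x, y, z, 1] projects to pixel coordinates via, e.g.,
+            #   uvd = lidar2img_rt @ p
+            #   u, v = uvd[0] / uvd[2], uvd[1] / uvd[2]   # valid only when uvd[2] > 0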
input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + # if not self.test_mode: + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + if 'sdc_planning' in input_dict['ann_info'].keys(): + input_dict['sdc_planning'] = input_dict['ann_info']['sdc_planning'] + input_dict['sdc_planning_mask'] = input_dict['ann_info']['sdc_planning_mask'] + input_dict['command'] = input_dict['ann_info']['command'] + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + # TODO: Warp all those below occupancy-related codes into a function + prev_indices, future_indices = self.occ_get_temporal_indices( + index, self.occ_receptive_field, self.occ_n_future) + + # ego motions of all frames are needed + all_frames = prev_indices + [index] + future_indices + + # whether invalid frames is present + # + has_invalid_frame = -1 in all_frames[:self.occ_only_total_frames] + # NOTE: This can only represent 7 frames in total as it influence evaluation + input_dict['occ_has_invalid_frame'] = has_invalid_frame + input_dict['occ_img_is_valid'] = np.array(all_frames) >= 0 + + # might have None if not in the same sequence + future_frames = [index] + future_indices + + # get lidar to ego to global transforms for each curr and fut index + occ_transforms = self.occ_get_transforms( + future_frames) # might have None + input_dict.update(occ_transforms) + + # for (current and) future frames, detection labels are needed + # generate detection labels for current + future frames + input_dict['occ_future_ann_infos'] = \ + self.get_future_detection_infos(future_frames) + return input_dict + + def get_future_detection_infos(self, future_frames): + detection_ann_infos = [] + for future_frame in future_frames: + if future_frame >= 0: + detection_ann_infos.append( + self.occ_get_detection_ann_info(future_frame), + ) + else: + detection_ann_infos.append(None) + return detection_ann_infos + + def occ_get_temporal_indices(self, index, receptive_field, n_future): + current_scene_token = self.data_infos[index]['scene_token'] + + # generate the past + previous_indices = [] + + for t in range(- receptive_field + 1, 0): + index_t = index + t + if index_t >= 0 and self.data_infos[index_t]['scene_token'] == current_scene_token: + previous_indices.append(index_t) + else: + previous_indices.append(-1) # for invalid indices + + # generate the future + future_indices = [] + + for t in range(1, n_future + 1): + index_t = index + t + if index_t < len(self.data_infos) and self.data_infos[index_t]['scene_token'] == current_scene_token: + future_indices.append(index_t) + else: + # NOTE: How to deal the invalid indices??? 
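+                # Frames outside the current scene (or past the dataset end) are
+                # marked -1; occ_get_transforms() returns None for them and
+                # get_data_info() exposes the mask through occ_img_is_valid /
+                # occ_has_invalid_frame, so downstream code must skip negative indices.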
+ future_indices.append(-1) + + return previous_indices, future_indices + + def occ_get_transforms(self, indices, data_type=torch.float32): + """ + get l2e, e2g rotation and translation for each valid frame + """ + l2e_r_mats = [] + l2e_t_vecs = [] + e2g_r_mats = [] + e2g_t_vecs = [] + + for index in indices: + if index == -1: + l2e_r_mats.append(None) + l2e_t_vecs.append(None) + e2g_r_mats.append(None) + e2g_t_vecs.append(None) + else: + info = self.data_infos[index] + l2e_r = info['lidar2ego_rotation'] + l2e_t = info['lidar2ego_translation'] + e2g_r = info['ego2global_rotation'] + e2g_t = info['ego2global_translation'] + + l2e_r_mat = torch.from_numpy(Quaternion(l2e_r).rotation_matrix) + e2g_r_mat = torch.from_numpy(Quaternion(e2g_r).rotation_matrix) + + l2e_r_mats.append(l2e_r_mat.to(data_type)) + l2e_t_vecs.append(torch.tensor(l2e_t).to(data_type)) + e2g_r_mats.append(e2g_r_mat.to(data_type)) + e2g_t_vecs.append(torch.tensor(e2g_t).to(data_type)) + + res = { + 'occ_l2e_r_mats': l2e_r_mats, + 'occ_l2e_t_vecs': l2e_t_vecs, + 'occ_e2g_r_mats': e2g_r_mats, + 'occ_e2g_t_vecs': e2g_t_vecs, + } + + return res + + def occ_get_detection_ann_info(self, index): + info = self.data_infos[index].copy() + gt_bboxes_3d = info['gt_boxes'].copy() + gt_names_3d = info['gt_names'].copy() + gt_ins_inds = info['gt_inds'].copy() + + gt_vis_tokens = info.get('visibility_tokens', None) + + if self.use_valid_flag: + gt_valid_flag = info['valid_flag'] + else: + gt_valid_flag = info['num_lidar_pts'] > 0 + + assert self.occ_filter_by_valid_flag is False + if self.occ_filter_by_valid_flag: + gt_bboxes_3d = gt_bboxes_3d[gt_valid_flag] + gt_names_3d = gt_names_3d[gt_valid_flag] + gt_ins_inds = gt_ins_inds[gt_valid_flag] + gt_vis_tokens = gt_vis_tokens[gt_valid_flag] + + # cls_name to cls_id + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + # gt_names=gt_names_3d, + gt_inds=gt_ins_inds, + gt_vis_tokens=gt_vis_tokens, + ) + + return anns_results + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + Returns: + str: Path of the output json file. 
+ """ + nusc_annos = {} + nusc_map_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(track_iter_progress(results)): + annos = [] + sample_token = self.data_infos[sample_id]['token'] + + if 'map' in self.eval_mod: + map_annos = {} + for key, value in det['ret_iou'].items(): + map_annos[key] = float(value.numpy()[0]) + nusc_map_annos[sample_token] = map_annos + + if 'boxes_3d' not in det: + nusc_annos[sample_token] = annos + continue + + boxes = output_to_nusc_box(det) + boxes_ego = copy.deepcopy(boxes) + boxes, keep_idx = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + # center_ = box.center.tolist() + # change from ground height to center height + # center_[2] = center_[2] + (box.wlh.tolist()[2] / 2.0) + if name not in ['car', 'truck', 'bus', 'trailer', 'motorcycle', + 'bicycle', 'pedestrian', ]: + continue + + box_ego = boxes_ego[keep_idx[i]] + trans = box_ego.center + if 'traj' in det: + traj_local = det['traj'][keep_idx[i]].numpy()[..., :2] + traj_scores = det['traj_scores'][keep_idx[i]].numpy() + else: + traj_local = np.zeros((0,)) + traj_scores = np.zeros((0,)) + traj_ego = np.zeros_like(traj_local) + rot = Quaternion(axis=np.array([0, 0.0, 1.0]), angle=np.pi/2) + for kk in range(traj_ego.shape[0]): + traj_ego[kk] = convert_local_coords_to_global( + traj_local[kk], trans, rot) + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr, + tracking_name=name, + tracking_score=box.score, + tracking_id=box.token, + predict_traj=traj_ego, + predict_traj_score=traj_scores, + ) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + 'map_results': nusc_map_annos, + } + + mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. 
+ """ + assert isinstance(results, list), 'results must be a list' + # assert len(results) == len(self), ( + # 'The length of results is not equal to the dataset len: {} != {}'. + # format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + result_files = self._format_bbox(results, jsonfile_prefix) + + return result_files, tmp_dir + + def _format_bbox_det(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + Returns: + str: Path of the output json file. + """ + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + for sample_id, det in enumerate(track_iter_progress(results)): + annos = [] + sample_token = self.data_infos[sample_id]['token'] + + if det is None: + nusc_annos[sample_token] = annos + continue + + boxes = output_to_nusc_box_det(det) + boxes_ego = copy.deepcopy(boxes) + boxes, keep_idx = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr, + ) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc_det.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def format_results_det(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. + """ + assert isinstance(results, list), 'results must be a list' + # assert len(results) == len(self), ( + # 'The length of results is not equal to the dataset len: {} != {}'. 
+ # format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results_det') + else: + tmp_dir = None + + result_files = self._format_bbox_det(results, jsonfile_prefix) + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + Returns: + dict[str, float]: Results of each evaluation metric. + """ + if isinstance(results, dict): + if 'occ_results_computed' in results.keys(): + occ_results_computed = results['occ_results_computed'] + out_metrics = ['iou'] + + # pan_eval + if occ_results_computed.get('pq', None) is not None: + out_metrics = ['iou', 'pq', 'sq', 'rq'] + + print("Occ-flow Val Results:") + for panoptic_key in out_metrics: + print(panoptic_key) + # HERE!! connect + print(' & '.join( + [f'{x:.1f}' for x in occ_results_computed[panoptic_key]])) + + if 'num_occ' in occ_results_computed.keys() and 'ratio_occ' in occ_results_computed.keys(): + print( + f"num occ evaluated:{occ_results_computed['num_occ']}") + print( + f"ratio occ evaluated: {occ_results_computed['ratio_occ'] * 100:.1f}%") + if 'planning_results_computed' in results.keys(): + planning_results_computed = results['planning_results_computed'] + planning_tab = PrettyTable() + planning_tab.field_names = [ + "metrics", "0.5s", "1.0s", "1.5s", "2.0s", "2.5s", "3.0s"] + for key in planning_results_computed.keys(): + value = planning_results_computed[key] + row_value = [] + row_value.append(key) + for i in range(len(value)): + row_value.append('%.4f' % float(value[i])) + planning_tab.add_row(row_value) + print(planning_tab) + results = results['bbox_results'] # get bbox_results + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + result_files_det, tmp_dir = self.format_results_det( + results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single( + result_files[name], result_files_det[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single( + result_files, result_files_det) + + if 'map' in self.eval_mod: + drivable_intersection = 0 + drivable_union = 0 + lanes_intersection = 0 + lanes_union = 0 + divider_intersection = 0 + divider_union = 0 + crossing_intersection = 0 + crossing_union = 0 + contour_intersection = 0 + contour_union = 0 + for i in range(len(results)): + drivable_intersection += results[i]['ret_iou']['drivable_intersection'] + drivable_union += results[i]['ret_iou']['drivable_union'] + lanes_intersection += results[i]['ret_iou']['lanes_intersection'] + lanes_union += 
results[i]['ret_iou']['lanes_union'] + divider_intersection += results[i]['ret_iou']['divider_intersection'] + divider_union += results[i]['ret_iou']['divider_union'] + crossing_intersection += results[i]['ret_iou']['crossing_intersection'] + crossing_union += results[i]['ret_iou']['crossing_union'] + contour_intersection += results[i]['ret_iou']['contour_intersection'] + contour_union += results[i]['ret_iou']['contour_union'] + results_dict.update({'drivable_iou': float(drivable_intersection / drivable_union), + 'lanes_iou': float(lanes_intersection / lanes_union), + 'divider_iou': float(divider_intersection / divider_union), + 'crossing_iou': float(crossing_intersection / crossing_union), + 'contour_iou': float(contour_intersection / contour_union)}) + + print(results_dict) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + def _evaluate_single(self, + result_path, + result_path_det, + logger=None, + metric='bbox', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. + """ + + # TODO: fix the evaluation pipelines + + output_dir = osp.join(*osp.split(result_path)[:-1]) + output_dir_det = osp.join(output_dir, 'det') + output_dir_track = osp.join(output_dir, 'track') + output_dir_motion = osp.join(output_dir, 'motion') + mkdir_or_exist(output_dir_det) + mkdir_or_exist(output_dir_track) + mkdir_or_exist(output_dir_motion) + + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + detail = dict() + + if 'det' in self.eval_mod: + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path_det, + eval_set=eval_set_map[self.version], + output_dir=output_dir_det, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + self.nusc_eval.main(plot_examples=0, render_curves=False) + # record metrics + metrics = load( + osp.join( + output_dir_det, + 'metrics_summary.json')) + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format( + metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + + if 'track' in self.eval_mod: + cfg = config_factory("tracking_nips_2019") + self.nusc_eval_track = TrackingEval( + config=cfg, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir_track, + verbose=True, + nusc_version=self.version, + nusc_dataroot=self.data_root + ) + self.nusc_eval_track.main() + # record metrics + metrics = load( + osp.join( + output_dir_track, + 'metrics_summary.json')) + keys = ['amota', 'amotp', 'recall', 'motar', + 'gt', 'mota', 'motp', 'mt', 'ml', 
'faf', + 'tp', 'fp', 'fn', 'ids', 'frag', 'tid', 'lgd'] + for key in keys: + detail['{}/{}'.format(metric_prefix, key)] = metrics[key] + + # if 'map' in self.eval_mod: + # for i, ret_iou in enumerate(ret_ious): + # detail['iou_{}'.format(i)] = ret_iou + + if 'motion' in self.eval_mod: + self.nusc_eval_motion = MotionEval( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos, + category_convert_type='motion_category' + ) + print('-'*50) + print( + 'Evaluate on motion category, merge class for vehicles and pedestrians...') + print('evaluate standard motion metrics...') + self.nusc_eval_motion.main( + plot_examples=0, + render_curves=False, + eval_mode='standard') + print('evaluate motion mAP-minFDE metrics...') + self.nusc_eval_motion.main( + plot_examples=0, + render_curves=False, + eval_mode='motion_map') + print('evaluate EPA motion metrics...') + self.nusc_eval_motion.main( + plot_examples=0, + render_curves=False, + eval_mode='epa') + print('-'*50) + print('Evaluate on detection category...') + self.nusc_eval_motion = MotionEval( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos, + category_convert_type='detection_category' + ) + print('evaluate standard motion metrics...') + self.nusc_eval_motion.main( + plot_examples=0, + render_curves=False, + eval_mode='standard') + print('evaluate EPA motion metrics...') + self.nusc_eval_motion.main( + plot_examples=0, + render_curves=False, + eval_mode='motion_map') + print('evaluate EPA motion metrics...') + self.nusc_eval_motion.main( + plot_examples=0, + render_curves=False, + eval_mode='epa') + + return detail diff --git a/mmcv/datasets/nuscenes_eval.py b/mmcv/datasets/nuscenes_eval.py new file mode 100644 index 0000000..a0dc0b7 --- /dev/null +++ b/mmcv/datasets/nuscenes_eval.py @@ -0,0 +1,752 @@ +import argparse +import copy +import json +import os +import time +import cv2 +import argparse +import random +import tqdm +import torch +from typing import Tuple, Dict, Any +from mmcv.fileio.io import dump,load +from torchvision.transforms.functional import rotate +import numpy as np +from pyquaternion import Quaternion +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from nuscenes.eval.common.loaders import load_gt, add_center_dist, filter_eval_boxes +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, 
class_pr_curve, dist_pr_curve, visualize_sample +from matplotlib import pyplot as plt +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +import mmcv + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. + if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. 
""" + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. 
+ + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. + # with open(result_path) as f: + # data = json.load(f) + data = load(result_path) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' 
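+    # Build a 1-based frame index per sample token by walking every scene along its
+    # sample['next'] chain; the index is attached to each GT box below (see
+    # DetectionBox_modified) so results can later be filtered by position in a scene.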
+ index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
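+    # `id` is expected to be a container of annotation tokens; only ground-truth
+    # boxes whose `token` appears in it are kept, all other boxes are dropped in place.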
+ total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
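+                # Transform chain: global frame -> ego frame (ego_pose) -> camera frame
+                # (calibrated_sensor); afterwards the padded intrinsics decide whether
+                # the box center lands inside this camera's image.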
+ new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. + :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. + if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + "Samples in split doesn't match samples in predictions." + + # Add center distances. + self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). 
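+        # filter_eval_boxes applies the per-class range limits from the detection
+        # config; with overlap_test enabled, GT and predictions are additionally
+        # restricted to boxes whose center is visible in more than one camera.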
+ + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. 
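+            # TP errors (trans/scale/orient/vel/attr) are accumulated only at the
+            # single dist_th_tp matching threshold; traffic_cone and barrier get NaN
+            # for the error types that are undefined for them (handled below).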
+ for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/mmcv/datasets/nuscenes_mono_dataset.py b/mmcv/datasets/nuscenes_mono_dataset.py new file mode 100644 index 0000000..b036b87 --- /dev/null +++ b/mmcv/datasets/nuscenes_mono_dataset.py @@ -0,0 +1,777 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import mmcv +import numpy as np +import pyquaternion +import tempfile +import torch +import warnings +from nuscenes.utils.data_classes import Box as NuScenesBox +from os import path as osp + +from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr +from mmdet.datasets import DATASETS, CocoDataset +from mmdet3d.core import show_multi_modality_result +from mmdet3d.core.bbox import CameraInstance3DBoxes, get_box_type +from mmdet3d.datasets.pipelines import Compose +from mmdet3d.datasets.utils import extract_result_dict, get_loading_pipeline + + +@DATASETS.register_module() +class CustomNuScenesMonoDataset(CocoDataset): + r"""Monocular 3D detection on NuScenes Dataset. + This class serves as the API for experiments on the NuScenes Dataset. + Please refer to `NuScenes Dataset `_ + for data downloading. + Args: + ann_file (str): Path of annotation file. + data_root (str): Path of dataset root. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'Camera' in this class. Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + eval_version (str, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. 
+ use_valid_flag (bool): Whether to use `use_valid_flag` key in the info + file as mask to filter gt_boxes and gt_names. Defaults to False. + version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. + """ + CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', + 'barrier') + DefaultAttribute = { + 'car': 'vehicle.parked', + 'pedestrian': 'pedestrian.moving', + 'trailer': 'vehicle.parked', + 'truck': 'vehicle.parked', + 'bus': 'vehicle.moving', + 'motorcycle': 'cycle.without_rider', + 'construction_vehicle': 'vehicle.parked', + 'bicycle': 'cycle.without_rider', + 'barrier': '', + 'traffic_cone': '', + } + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + 'trans_err': 'mATE', + 'scale_err': 'mASE', + 'orient_err': 'mAOE', + 'vel_err': 'mAVE', + 'attr_err': 'mAAE' + } + + def __init__(self, + data_root, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d='Camera', + eval_version='detection_cvpr_2019', + use_valid_flag=False, + overlap_test=False, + version='v1.0-trainval', + **kwargs): + super().__init__(**kwargs) + # overlap_test = True + self.data_root = data_root + self.overlap_test = overlap_test + self.load_interval = load_interval + self.with_velocity = with_velocity + self.modality = modality + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + self.eval_version = eval_version + self.use_valid_flag = use_valid_flag + self.bbox_code_size = 9 + self.version = version + if self.eval_version is not None: + from nuscenes.eval.detection.config import config_factory + self.eval_detection_configs = config_factory(self.eval_version) + if self.modality is None: + self.modality = dict( + use_camera=True, + use_lidar=False, + use_radar=False, + use_map=False, + use_external=False) + + def pre_pipeline(self, results): + """Initialization before data preparation. + Args: + results (dict): Dict before data preprocessing. + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results['img_prefix'] = '' # self.img_prefix + # print('img_prefix', self.img_prefix) + results['seg_prefix'] = self.seg_prefix + results['proposal_file'] = self.proposal_file + results['img_fields'] = [] + results['bbox3d_fields'] = [] + results['pts_mask_fields'] = [] + results['pts_seg_fields'] = [] + results['bbox_fields'] = [] + results['mask_fields'] = [] + results['seg_fields'] = [] + results['box_type_3d'] = self.box_type_3d + results['box_mode_3d'] = self.box_mode_3d + + def _parse_ann_info(self, img_info, ann_info): + """Parse bbox annotation. + Args: + img_info (list[dict]): Image info. + ann_info (list[dict]): Annotation info of an image. 
+ Returns: + dict: A dict containing the following keys: bboxes, labels, \ + gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, \ + depths, bboxes_ignore, masks, seg_map + """ + gt_bboxes = [] + gt_labels = [] + attr_labels = [] + gt_bboxes_ignore = [] + gt_masks_ann = [] + gt_bboxes_cam3d = [] + centers2d = [] + depths = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) + inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) + if inter_w * inter_h == 0: + continue + if ann['area'] <= 0 or w < 1 or h < 1: + continue + if ann['category_id'] not in self.cat_ids: + continue + bbox = [x1, y1, x1 + w, y1 + h] + if ann.get('iscrowd', False): + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(self.cat2label[ann['category_id']]) + attr_labels.append(ann['attribute_id']) + gt_masks_ann.append(ann.get('segmentation', None)) + # 3D annotations in camera coordinates + bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) + velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) + nan_mask = np.isnan(velo_cam3d[:, 0]) + velo_cam3d[nan_mask] = [0.0, 0.0] + bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) + gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) + # 2.5D annotations in camera coordinates + center2d = ann['center2d'][:2] + depth = ann['center2d'][2] + centers2d.append(center2d) + depths.append(depth) + + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + attr_labels = np.array(attr_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + attr_labels = np.array([], dtype=np.int64) + + if gt_bboxes_cam3d: + gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) + centers2d = np.array(centers2d, dtype=np.float32) + depths = np.array(depths, dtype=np.float32) + else: + gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), + dtype=np.float32) + centers2d = np.zeros((0, 2), dtype=np.float32) + depths = np.zeros((0), dtype=np.float32) + + gt_bboxes_cam3d = CameraInstance3DBoxes( + gt_bboxes_cam3d, + box_dim=gt_bboxes_cam3d.shape[-1], + origin=(0.5, 0.5, 0.5)) + gt_labels_3d = copy.deepcopy(gt_labels) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + seg_map = img_info['filename'].replace('jpg', 'png') + + ann = dict( + bboxes=gt_bboxes, + labels=gt_labels, + gt_bboxes_3d=gt_bboxes_cam3d, + gt_labels_3d=gt_labels_3d, + attr_labels=attr_labels, + centers2d=centers2d, + depths=depths, + bboxes_ignore=gt_bboxes_ignore, + masks=gt_masks_ann, + seg_map=seg_map) + + return ann + + def get_attr_name(self, attr_idx, label_name): + """Get attribute from predicted index. + This is a workaround to predict attribute when the predicted velocity + is not reliable. We map the predicted attribute index to the one + in the attribute set. If it is consistent with the category, we will + keep it. Otherwise, we will use the default attribute. + Args: + attr_idx (int): Attribute index. + label_name (str): Predicted category name. + Returns: + str: Predicted attribute name. 
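As an editorial aside, the fallback rule spelled out in this docstring can be illustrated in isolation. The sketch below is not part of the patch; it compresses the per-class checks of `get_attr_name` into a single prefix test and copies the attribute names (and a subset of the defaults) from this file.

# Editor's sketch of the attribute-fallback rule documented above (not part of the patch).
ATTR_NAMES = [
    'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving',
    'pedestrian.standing', 'pedestrian.sitting_lying_down',
    'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None'
]
DEFAULTS = {'car': 'vehicle.parked', 'pedestrian': 'pedestrian.moving',
            'bicycle': 'cycle.without_rider'}
VALID_PREFIX = {'car': 'vehicle', 'pedestrian': 'pedestrian', 'bicycle': 'cycle'}

def pick_attribute(attr_idx, label_name):
    """Keep the predicted attribute only if it is consistent with the category."""
    attr = ATTR_NAMES[attr_idx]
    if attr.split('.')[0] == VALID_PREFIX.get(label_name):
        return attr                      # consistent with the category: keep it
    return DEFAULTS[label_name]          # otherwise fall back to the class default

assert pick_attribute(5, 'car') == 'vehicle.moving'   # 'vehicle.moving' is valid for car
assert pick_attribute(0, 'car') == 'vehicle.parked'   # 'cycle.with_rider' is not, so use the default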
+ """ + # TODO: Simplify the variable name + AttrMapping_rev2 = [ + 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', + 'pedestrian.standing', 'pedestrian.sitting_lying_down', + 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None' + ] + if label_name == 'car' or label_name == 'bus' \ + or label_name == 'truck' or label_name == 'trailer' \ + or label_name == 'construction_vehicle': + if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \ + AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \ + AttrMapping_rev2[attr_idx] == 'vehicle.stopped': + return AttrMapping_rev2[attr_idx] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + elif label_name == 'pedestrian': + if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \ + AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \ + AttrMapping_rev2[attr_idx] == \ + 'pedestrian.sitting_lying_down': + return AttrMapping_rev2[attr_idx] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + elif label_name == 'bicycle' or label_name == 'motorcycle': + if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \ + AttrMapping_rev2[attr_idx] == 'cycle.without_rider': + return AttrMapping_rev2[attr_idx] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + else: + return CustomNuScenesMonoDataset.DefaultAttribute[label_name] + + def _format_bbox(self, results, jsonfile_prefix=None): + """Convert the results to the standard format. + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + Returns: + str: Path of the output json file. + """ + nusc_annos = {} + mapped_class_names = self.CLASSES + + print('Start to convert detection format...') + + CAM_NUM = 6 + + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + + if sample_id % CAM_NUM == 0: + boxes_per_frame = [] + attrs_per_frame = [] + + # need to merge results from images of the same sample + annos = [] + boxes, attrs = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], + boxes, attrs, + mapped_class_names, + self.eval_detection_configs, + self.eval_version) + + boxes_per_frame.extend(boxes) + attrs_per_frame.extend(attrs) + # Remove redundant predictions caused by overlap of images + if (sample_id + 1) % CAM_NUM != 0: + continue + boxes = global_nusc_box_to_cam( + self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) + # box nms 3d over 6 images in a frame + # TODO: move this global setting into config + nms_cfg = dict( + use_rotate_nms=True, + nms_across_levels=False, + nms_pre=4096, + nms_thr=0.05, + score_thr=0.01, + min_bbox_size=0, + max_per_frame=500) + from mmcv import Config + nms_cfg = Config(nms_cfg) + cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) + boxes3d = cam_boxes3d.tensor + # generate attr scores from attr labels + attrs = labels.new_tensor([attr for attr in attrs_per_frame]) + boxes3d, scores, labels, attrs = box3d_multiclass_nms( + boxes3d, + cam_boxes3d_for_nms, + scores, + nms_cfg.score_thr, + nms_cfg.max_per_frame, + nms_cfg, + mlvl_attr_scores=attrs) + cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) + det = bbox3d2result(cam_boxes3d, scores, labels, attrs) + boxes, attrs = 
output_to_nusc_box(det) + boxes, attrs = cam_nusc_box_to_global( + self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, + mapped_class_names, self.eval_detection_configs, + self.eval_version) + + for i, box in enumerate(boxes): + name = mapped_class_names[box.label] + attr = self.get_attr_name(attrs[i], name) + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr) + annos.append(nusc_anno) + # other views results of the same frame should be concatenated + if sample_token in nusc_annos: + nusc_annos[sample_token].extend(annos) + else: + nusc_annos[sample_token] = annos + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + mmcv.dump(nusc_submissions, res_path) + return res_path + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + result_name='img_bbox'): + """Evaluation for a single model in nuScenes protocol. + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'img_bbox'. + Returns: + dict: Dictionary of evaluation details. + """ + from nuscenes import NuScenes + #from nuscenes.eval.detection.evaluate import NuScenesEval + from .nuscnes_eval import NuScenesEval_custom + output_dir = osp.join(*osp.split(result_path)[:-1]) + self.nusc = NuScenes( + version=self.version, dataroot=self.data_root, verbose=False) + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + # nusc_eval = NuScenesEval( + # nusc, + # config=self.eval_detection_configs, + # result_path=result_path, + # eval_set=eval_set_map[self.version], + # output_dir=output_dir, + # verbose=False) + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=True, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + + self.nusc_eval.main(render_curves=True) + + # record metrics + metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) + detail = dict() + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + return detail + + def format_results(self, results, jsonfile_prefix=None, **kwargs): + """Format the results to json (standard format for COCO evaluation). + Args: + results (list[tuple | numpy.ndarray]): Testing results of the + dataset. + jsonfile_prefix (str | None): The prefix of json files. 
It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + Returns: + tuple: (result_files, tmp_dir), result_files is a dict containing \ + the json filepaths, tmp_dir is the temporal directory created \ + for saving json files when jsonfile_prefix is not specified. + """ + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: {} != {}'. + format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + # not evaluate 2D predictions on nuScenes + if '2d' in name: + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + + return result_files, tmp_dir + + def evaluate(self, + results, + metric='bbox', + logger=None, + jsonfile_prefix=None, + result_names=['img_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + Returns: + dict[str, float]: Results of each evaluation metric. + """ + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name]) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + + def _extract_data(self, index, pipeline, key, load_annos=False): + """Load data using input pipeline and extract data according to key. + Args: + index (int): Index for accessing the target data. + pipeline (:obj:`Compose`): Composed data loading pipeline. + key (str | list[str]): One single or a list of data key. + load_annos (bool): Whether to load data annotations. + If True, need to set self.test_mode as False before loading. 
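The comment block in `format_results` above describes two accepted result layouts. Below is a minimal sketch of that branching, not part of the patch, with dummy placeholders standing in for real tensors.

# Editor's sketch (not part of the patch) of the two result layouts accepted above.
results_flat = [dict(boxes_3d=None, scores_3d=None, labels_3d=None)]            # layout 1
results_nested = [dict(img_bbox=dict(boxes_3d=None, scores_3d=None,
                                     labels_3d=None))]                          # layout 2

def split_by_head(results):
    """Return {head name: per-head result list}; '2d' heads are skipped, as above."""
    if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]):
        # Already flat: treated as a single result set (labelled here only for illustration).
        return {'img_bbox': results}
    return {name: [r[name] for r in results]
            for name in results[0] if '2d' not in name}

assert list(split_by_head(results_nested)) == ['img_bbox']
assert split_by_head(results_flat)['img_bbox'] is results_flat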
+ Returns: + np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: + A single or a list of loaded data. + """ + assert pipeline is not None, 'data loading pipeline is not provided' + img_info = self.data_infos[index] + input_dict = dict(img_info=img_info) + + if load_annos: + ann_info = self.get_ann_info(index) + input_dict.update(dict(ann_info=ann_info)) + + self.pre_pipeline(input_dict) + example = pipeline(input_dict) + + # extract data items according to keys + if isinstance(key, str): + data = extract_result_dict(example, key) + else: + data = [extract_result_dict(example, k) for k in key] + + return data + + def _get_pipeline(self, pipeline): + """Get data loading pipeline in self.show/evaluate function. + Args: + pipeline (list[dict] | None): Input pipeline. If None is given, \ + get from self.pipeline. + """ + if pipeline is None: + if not hasattr(self, 'pipeline') or self.pipeline is None: + warnings.warn( + 'Use default pipeline for data loading, this may cause ' + 'errors when data is on ceph') + return self._build_default_pipeline() + loading_pipeline = get_loading_pipeline(self.pipeline.transforms) + return Compose(loading_pipeline) + return Compose(pipeline) + + def _build_default_pipeline(self): + """Build the default pipeline for this dataset.""" + pipeline = [ + dict(type='LoadImageFromFileMono3D'), + dict( + type='DefaultFormatBundle3D', + class_names=self.CLASSES, + with_label=False), + dict(type='Collect3D', keys=['img']) + ] + return Compose(pipeline) + + def show(self, results, out_dir, show=True, pipeline=None): + """Results visualization. + Args: + results (list[dict]): List of bounding boxes results. + out_dir (str): Output directory of visualization result. + show (bool): Visualize the results online. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' + pipeline = self._get_pipeline(pipeline) + for i, result in enumerate(results): + if 'img_bbox' in result.keys(): + result = result['img_bbox'] + data_info = self.data_infos[i] + img_path = data_info['file_name'] + file_name = osp.split(img_path)[-1].split('.')[0] + img, img_metas = self._extract_data(i, pipeline, + ['img', 'img_metas']) + # need to transpose channel to first dim + img = img.numpy().transpose(1, 2, 0) + gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'] + pred_bboxes = result['boxes_3d'] + show_multi_modality_result( + img, + gt_bboxes, + pred_bboxes, + img_metas['cam2img'], + out_dir, + file_name, + box_mode='camera', + show=show) + + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + Args: + detection (dict): Detection results. + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + - attrs_3d (torch.Tensor, optional): Predicted attributes. + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
+ """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + attrs = None + if 'attrs_3d' in detection: + attrs = detection['attrs_3d'].numpy() + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + + # convert the dim/rot to nuscbox convention + box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]] + box_yaw = -box_yaw + + box_list = [] + for i in range(len(box3d)): + q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2) + quat = q2 * q1 + velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8]) + box = NuScenesBox( + box_gravity_center[i], + box_dims[i], + quat, + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list, attrs + + +def cam_nusc_box_to_global(info, + boxes, + attrs, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from camera to global coordinate. + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + attr_list = [] + for (box, attr) in zip(boxes, attrs): + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation'])) + box.translate(np.array(info['cam2ego_translation'])) + # filter det in ego. + cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + attr_list.append(attr) + return box_list, attr_list + + +def global_nusc_box_to_cam(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from global to camera coordinate. + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.translate(-np.array(info['ego2global_translation'])) + box.rotate( + pyquaternion.Quaternion(info['ego2global_rotation']).inverse) + # filter det in ego. + cls_range_map = eval_configs.class_range + radius = np.linalg.norm(box.center[:2], 2) + det_range = cls_range_map[classes[box.label]] + if radius > det_range: + continue + # Move box to camera coord system + box.translate(-np.array(info['cam2ego_translation'])) + box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse) + box_list.append(box) + return box_list + + +def nusc_box_to_cam_box3d(boxes): + """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`. 
+ Args: + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + Returns: + tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): \ + Converted 3D bounding boxes, scores and labels. + """ + locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) + dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) + rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] + for b in boxes]).view(-1, 1) + velocity = torch.Tensor([b.velocity[:2] for b in boxes]).view(-1, 2) + + # convert nusbox to cambox convention + dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] + rots = -rots + + boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() + cam_boxes3d = CameraInstance3DBoxes( + boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) + scores = torch.Tensor([b.score for b in boxes]).cuda() + labels = torch.LongTensor([b.label for b in boxes]).cuda() + nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) + indices = labels.new_tensor(list(range(scores.shape[0]))) + nms_scores[indices, labels] = scores + return cam_boxes3d, nms_scores, labels \ No newline at end of file diff --git a/mmcv/datasets/nuscenes_styled_eval_utils.py b/mmcv/datasets/nuscenes_styled_eval_utils.py new file mode 100644 index 0000000..a8053aa --- /dev/null +++ b/mmcv/datasets/nuscenes_styled_eval_utils.py @@ -0,0 +1,755 @@ +from collections import defaultdict +from typing import List, Dict, Tuple, Union, Callable +import abc +import numpy as np +from pyquaternion import Quaternion + + +def center_distance(gt_box, pred_box) -> float: + """ + L2 distance between the box centers (xy only). + :param gt_box: GT annotation sample. + :param pred_box: Predicted sample. + :return: L2 distance. + """ + return np.linalg.norm(np.array(pred_box.translation[:2]) - np.array(gt_box.translation[:2])) + + +def velocity_l2(gt_box, pred_box) -> float: + """ + L2 distance between the velocity vectors (xy only). + If the predicted velocities are nan, we return inf, which is subsequently clipped to 1. + :param gt_box: GT annotation sample. + :param pred_box: Predicted sample. + :return: L2 distance. + """ + return np.linalg.norm(np.array(pred_box.velocity) - np.array(gt_box.velocity)) + + +def yaw_diff(gt_box, eval_box, period: float = 2*np.pi) -> float: + """ + Returns the yaw angle difference between the orientation of two boxes. + :param gt_box: Ground truth box. + :param eval_box: Predicted box. + :param period: Periodicity in radians for assessing angle difference. + :return: Yaw angle difference in radians in [0, pi]. + """ + yaw_gt = quaternion_yaw(Quaternion(gt_box.rotation)) + yaw_est = quaternion_yaw(Quaternion(eval_box.rotation)) + + return abs(angle_diff(yaw_gt, yaw_est, period)) + + +def angle_diff(x: float, y: float, period: float) -> float: + """ + Get the smallest angle difference between 2 angles: the angle from y to x. + :param x: To angle. + :param y: From angle. + :param period: Periodicity in radians for assessing angle difference. + :return: . Signed smallest between-angle difference in range (-pi, pi). + """ + + # calculate angle difference, modulo to [0, 2*pi] + diff = (x - y + period / 2) % period - period / 2 + if diff > np.pi: + diff = diff - (2 * np.pi) # shift (pi, 2*pi] to (-pi, 0] + + return diff + + +def attr_acc(gt_box, pred_box) -> float: + """ + Computes the classification accuracy for the attribute of this class (if any). + If the GT class has no attributes or the annotation is missing attributes, we assign an accuracy of nan, which is + ignored later on. + :param gt_box: GT annotation sample. 
+ :param pred_box: Predicted sample. + :return: Attribute classification accuracy (0 or 1) or nan if GT annotation does not have any attributes. + """ + if gt_box.attribute_name == '': + # If the class does not have attributes or this particular sample is missing attributes, return nan, which is + # ignored later. Note that about 0.4% of the sample_annotations have no attributes, although they should. + acc = np.nan + else: + # Check that label is correct. + acc = float(gt_box.attribute_name == pred_box.attribute_name) + return acc + + +def scale_iou(sample_annotation, sample_result) -> float: + """ + This method compares predictions to the ground truth in terms of scale. + It is equivalent to intersection over union (IOU) between the two boxes in 3D, + if we assume that the boxes are aligned, i.e. translation and rotation are considered identical. + :param sample_annotation: GT annotation sample. + :param sample_result: Predicted sample. + :return: Scale IOU. + """ + # Validate inputs. + sa_size = np.array(sample_annotation.size) + sr_size = np.array(sample_result.size) + assert all(sa_size > 0), 'Error: sample_annotation sizes must be >0.' + assert all(sr_size > 0), 'Error: sample_result sizes must be >0.' + + # Compute IOU. + min_wlh = np.minimum(sa_size, sr_size) + volume_annotation = np.prod(sa_size) + volume_result = np.prod(sr_size) + intersection = np.prod(min_wlh) # type: float + union = volume_annotation + volume_result - intersection # type: float + iou = intersection / union + + return iou + + +def quaternion_yaw(q: Quaternion) -> float: + """ + Calculate the yaw angle from a quaternion. + Note that this only works for a quaternion that represents a box in lidar or global coordinate frame. + It does not work for a box in the camera frame. + :param q: Quaternion of interest. + :return: Yaw angle in radians. + """ + + # Project into xy plane. + v = np.dot(q.rotation_matrix, np.array([1, 0, 0])) + + # Measure yaw using arctan. + yaw = np.arctan2(v[1], v[0]) + + return yaw + + + +def cummean(x: np.array) -> np.array: + """ + Computes the cumulative mean up to each position in a NaN sensitive way + - If all values are NaN return an array of ones. + - If some values are NaN, accumulate arrays discording those entries. + """ + if sum(np.isnan(x)) == len(x): + # Is all numbers in array are NaN's. + return np.ones(len(x)) # If all errors are NaN set to error to 1 for all operating points. + else: + # Accumulate in a nan-aware manner. + sum_vals = np.nancumsum(x.astype(float)) # Cumulative sum ignoring nans. + count_vals = np.cumsum(~np.isnan(x)) # Number of non-nans up to each position. + return np.divide(sum_vals, count_vals, out=np.zeros_like(sum_vals), where=count_vals != 0) + + +class DetectionMetricData(abc.ABC): + """ This class holds accumulated and interpolated data required to calculate the detection metrics. """ + + nelem = 101 + + def __init__(self, + recall: np.array, + precision: np.array, + confidence: np.array, + trans_err: np.array, + vel_err: np.array, + scale_err: np.array, + orient_err: np.array, + attr_err: np.array): + + # Assert lengths. + assert len(recall) == self.nelem + assert len(precision) == self.nelem + assert len(confidence) == self.nelem + assert len(trans_err) == self.nelem + assert len(vel_err) == self.nelem + assert len(scale_err) == self.nelem + assert len(orient_err) == self.nelem + assert len(attr_err) == self.nelem + + # Assert ordering. + assert all(confidence == sorted(confidence, reverse=True)) # Confidences should be descending. 
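Two of the helpers above have behaviour that is easy to mis-read: `angle_diff` wraps across the period boundary, and `cummean` is a NaN-aware running mean rather than a zero-filled one. A quick standalone check (editor's sketch, not part of the patch) that restates both formulas:

# Editor's sketch (not part of the patch): sanity checks for angle_diff and cummean above.
import numpy as np

# angle_diff: two yaws 0.2 rad apart across the 2*pi boundary differ by 0.2, not ~6.08.
x, y, period = 0.1, 2 * np.pi - 0.1, 2 * np.pi
diff = (x - y + period / 2) % period - period / 2   # same core formula as angle_diff;
assert np.isclose(diff, 0.2)                        # the original also folds values above pi back

# cummean: NaN entries are skipped when accumulating, so the running mean of
# [1.0, nan, 3.0] is [1.0, 1.0, 2.0].
vals = np.array([1.0, np.nan, 3.0])
sums = np.nancumsum(vals.astype(float))
cnts = np.cumsum(~np.isnan(vals))
running = np.divide(sums, cnts, out=np.zeros_like(sums), where=cnts != 0)
assert np.allclose(running, [1.0, 1.0, 2.0])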
+ assert all(recall == sorted(recall)) # Recalls should be ascending. + + # Set attributes explicitly to help IDEs figure out what is going on. + self.recall = recall + self.precision = precision + self.confidence = confidence + self.trans_err = trans_err + self.vel_err = vel_err + self.scale_err = scale_err + self.orient_err = orient_err + self.attr_err = attr_err + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + @property + def max_recall_ind(self): + """ Returns index of max recall achieved. """ + + # Last instance of confidence > 0 is index of max achieved recall. + non_zero = np.nonzero(self.confidence)[0] + if len(non_zero) == 0: # If there are no matches, all the confidence values will be zero. + max_recall_ind = 0 + else: + max_recall_ind = non_zero[-1] + + return max_recall_ind + + @property + def max_recall(self): + """ Returns max recall achieved. """ + + return self.recall[self.max_recall_ind] + + def serialize(self): + """ Serialize instance into json-friendly format. """ + return { + 'recall': self.recall.tolist(), + 'precision': self.precision.tolist(), + 'confidence': self.confidence.tolist(), + 'trans_err': self.trans_err.tolist(), + 'vel_err': self.vel_err.tolist(), + 'scale_err': self.scale_err.tolist(), + 'orient_err': self.orient_err.tolist(), + 'attr_err': self.attr_err.tolist(), + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(recall=np.array(content['recall']), + precision=np.array(content['precision']), + confidence=np.array(content['confidence']), + trans_err=np.array(content['trans_err']), + vel_err=np.array(content['vel_err']), + scale_err=np.array(content['scale_err']), + orient_err=np.array(content['orient_err']), + attr_err=np.array(content['attr_err'])) + + @classmethod + def no_predictions(cls): + """ Returns a md instance corresponding to having no predictions. """ + return cls(recall=np.linspace(0, 1, cls.nelem), + precision=np.zeros(cls.nelem), + confidence=np.zeros(cls.nelem), + trans_err=np.ones(cls.nelem), + vel_err=np.ones(cls.nelem), + scale_err=np.ones(cls.nelem), + orient_err=np.ones(cls.nelem), + attr_err=np.ones(cls.nelem)) + + @classmethod + def random_md(cls): + """ Returns an md instance corresponding to a random results. """ + return cls(recall=np.linspace(0, 1, cls.nelem), + precision=np.random.random(cls.nelem), + confidence=np.linspace(0, 1, cls.nelem)[::-1], + trans_err=np.random.random(cls.nelem), + vel_err=np.random.random(cls.nelem), + scale_err=np.random.random(cls.nelem), + orient_err=np.random.random(cls.nelem), + attr_err=np.random.random(cls.nelem)) + + +class DetectionMetricDataList: + """ This stores a set of MetricData in a dict indexed by (name, match-distance). """ + + def __init__(self): + self.md = {} + + def __getitem__(self, key): + return self.md[key] + + def __eq__(self, other): + eq = True + for key in self.md.keys(): + eq = eq and self[key] == other[key] + return eq + + def get_class_data(self, detection_name: str) -> List[Tuple[DetectionMetricData, float]]: + """ Get all the MetricData entries for a certain detection_name. """ + return [(md, dist_th) for (name, dist_th), md in self.md.items() if name == detection_name] + + def get_dist_data(self, dist_th: float) -> List[Tuple[DetectionMetricData, str]]: + """ Get all the MetricData entries for a certain match_distance. 
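The container above keys per-class metric curves by `(detection_name, dist_th)`. A usage sketch, not part of the patch, assuming the module is importable from the path this diff introduces (`mmcv/datasets/nuscenes_styled_eval_utils.py`) and using the usual nuScenes distance thresholds:

# Editor's sketch (not part of the patch): filling and querying a DetectionMetricDataList.
from mmcv.datasets.nuscenes_styled_eval_utils import (DetectionMetricData,
                                                      DetectionMetricDataList)

mdl = DetectionMetricDataList()
for dist_th in (0.5, 1.0, 2.0, 4.0):
    mdl.set('car', dist_th, DetectionMetricData.random_md())

car_entries = mdl.get_class_data('car')            # [(md, dist_th), ...] for every threshold
assert [d for _, d in car_entries] == [0.5, 1.0, 2.0, 4.0]

roundtrip = DetectionMetricDataList.deserialize(mdl.serialize())
assert roundtrip == mdl                            # serialization is lossless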
""" + return [(md, detection_name) for (detection_name, dist), md in self.md.items() if dist == dist_th] + + def set(self, detection_name: str, match_distance: float, data: DetectionMetricData): + """ Sets the MetricData entry for a certain detection_name and match_distance. """ + self.md[(detection_name, match_distance)] = data + + def serialize(self) -> dict: + return {key[0] + ':' + str(key[1]): value.serialize() for key, value in self.md.items()} + + @classmethod + def deserialize(cls, content: dict): + mdl = cls() + for key, md in content.items(): + name, distance = key.split(':') + mdl.set(name, float(distance), DetectionMetricData.deserialize(md)) + return mdl + +class DetectionMetrics: + """ Stores average precision and true positive metric results. Provides properties to summarize. """ + + def __init__(self, cfg: dict): + + self.cfg = cfg + self._label_aps = defaultdict(lambda: defaultdict(float)) + self._label_tp_errors = defaultdict(lambda: defaultdict(float)) + self.eval_time = None + + def add_label_ap(self, detection_name: str, dist_th: float, ap: float) -> None: + self._label_aps[detection_name][dist_th] = ap + + def get_label_ap(self, detection_name: str, dist_th: float) -> float: + return self._label_aps[detection_name][dist_th] + + def add_label_tp(self, detection_name: str, metric_name: str, tp: float): + self._label_tp_errors[detection_name][metric_name] = tp + + def get_label_tp(self, detection_name: str, metric_name: str) -> float: + return self._label_tp_errors[detection_name][metric_name] + + def add_runtime(self, eval_time: float) -> None: + self.eval_time = eval_time + + @property + def mean_dist_aps(self) -> Dict[str, float]: + """ Calculates the mean over distance thresholds for each label. """ + return {class_name: np.mean(list(d.values())) for class_name, d in self._label_aps.items()} + + @property + def mean_ap(self) -> float: + """ Calculates the mean AP by averaging over distance thresholds and classes. """ + return float(np.mean(list(self.mean_dist_aps.values()))) + + @property + def tp_errors(self) -> Dict[str, float]: + """ Calculates the mean true positive error across all classes for each metric. """ + errors = {} + for metric_name in self.cfg['tp_metrics']: + class_errors = [] + for detection_name in self.cfg['class_names']: + class_errors.append(self.get_label_tp(detection_name, metric_name)) + + errors[metric_name] = float(np.nanmean(class_errors)) + + return errors + + @property + def tp_scores(self) -> Dict[str, float]: + scores = {} + tp_errors = self.tp_errors + for metric_name in self.cfg['tp_metrics']: + + # We convert the true positive errors to "scores" by 1-error. + score = 1.0 - tp_errors[metric_name] + + # Some of the true positive errors are unbounded, so we bound the scores to min 0. + score = max(0.0, score) + + scores[metric_name] = score + + return scores + + @property + def nd_score(self) -> float: + """ + Compute the nuScenes detection score (NDS, weighted sum of the individual scores). + :return: The NDS. + """ + # Summarize. + total = float(self.cfg['mean_ap_weight'] * self.mean_ap + np.sum(list(self.tp_scores.values()))) + + # Normalize. 
+ total = total / float(self.cfg['mean_ap_weight'] + len(self.tp_scores.keys())) + + return total + + + def serialize(self): + return { + 'label_aps': self._label_aps, + 'mean_dist_aps': self.mean_dist_aps, + 'mean_ap': self.mean_ap, + 'label_tp_errors': self._label_tp_errors, + 'tp_errors': self.tp_errors, + 'tp_scores': self.tp_scores, + 'nd_score': self.nd_score, + 'eval_time': self.eval_time, + 'cfg': self.cfg + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. """ + + cfg = content['cfg'] + metrics = cls(cfg=cfg) + metrics.add_runtime(content['eval_time']) + + for detection_name, label_aps in content['label_aps'].items(): + for dist_th, ap in label_aps.items(): + metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap)) + + for detection_name, label_tps in content['label_tp_errors'].items(): + for metric_name, tp in label_tps.items(): + metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp)) + + return metrics + + def __eq__(self, other): + eq = True + eq = eq and self._label_aps == other._label_aps + eq = eq and self._label_tp_errors == other._label_tp_errors + eq = eq and self.eval_time == other.eval_time + eq = eq and self.cfg == other.cfg + + return eq + + +class DetectionBox(abc.ABC): + """ Data class used during detection evaluation. Can be a prediction or ground truth.""" + + def __init__(self, + sample_token: str = "", + translation: Tuple[float, float, float] = (0, 0, 0), + size: Tuple[float, float, float] = (0, 0, 0), + rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), + velocity: Tuple[float, float] = (0, 0), + ego_translation: Tuple[float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. + num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. + detection_name: str = 'car', # The class name used in the detection challenge. + detection_score: float = -1.0, # GT samples do not have a score. + attribute_name: str = ''): # Box attribute. Each box can have at most 1 attribute. + + + assert detection_name is not None, 'Error: detection_name cannot be empty!' + # assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name + + # assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ + # 'Error: Unknown attribute_name %s' % attribute_name + + assert type(detection_score) == float, 'Error: detection_score must be a float!' + assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' + self.sample_token = sample_token + self.translation = translation + self.size = size + self.rotation = rotation + self.velocity = velocity + self.ego_translation = ego_translation + self.num_pts = num_pts + self.detection_name = detection_name + self.detection_score = detection_score + self.attribute_name = attribute_name + + def __eq__(self, other): + return (self.sample_token == other.sample_token and + self.translation == other.translation and + self.size == other.size and + self.rotation == other.rotation and + self.velocity == other.velocity and + self.ego_translation == other.ego_translation and + self.num_pts == other.num_pts and + self.detection_name == other.detection_name and + self.detection_score == other.detection_score and + self.attribute_name == other.attribute_name) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. 
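`DetectionBox` above round-trips through `serialize`/`deserialize` and is deliberately strict about `detection_score` being a Python float. A short illustration, not part of the patch, with made-up box values; the import path again assumes the module location introduced by this diff:

# Editor's sketch (not part of the patch): DetectionBox round trip and score strictness.
from mmcv.datasets.nuscenes_styled_eval_utils import DetectionBox

box = DetectionBox(sample_token='dummy_token',
                   translation=(10.0, 5.0, 1.0),
                   size=(1.9, 4.6, 1.7),
                   rotation=(1.0, 0.0, 0.0, 0.0),
                   velocity=(0.5, 0.0),
                   ego_translation=(3.0, 4.0, 0.0),
                   detection_name='car',
                   detection_score=0.9)

assert DetectionBox.deserialize(box.serialize()) == box   # lossless round trip
assert box.ego_dist == 5.0                                # 2D distance from the ego vehicle

try:
    DetectionBox(detection_score=1)    # an int, not a float
except AssertionError:
    pass                               # the constructor insists on a Python float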
""" + return { + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls(sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name']) + @property + def ego_dist(self) -> float: + """ Compute the distance from this box to the ego vehicle in 2D. """ + return np.sqrt(np.sum(np.array(self.ego_translation[:2]) ** 2)) + + + + + +class EvalBoxes: + """ Data class that groups EvalBox instances by sample. """ + + def __init__(self): + """ + Initializes the EvalBoxes for GT or predictions. + """ + self.boxes = defaultdict(list) + + def __repr__(self): + return "EvalBoxes with {} boxes across {} samples".format(len(self.all), len(self.sample_tokens)) + + def __getitem__(self, item) -> List[DetectionBox]: + return self.boxes[item] + + def __eq__(self, other): + if not set(self.sample_tokens) == set(other.sample_tokens): + return False + for token in self.sample_tokens: + if not len(self[token]) == len(other[token]): + return False + for box1, box2 in zip(self[token], other[token]): + if box1 != box2: + return False + return True + + def __len__(self): + return len(self.boxes) + + @property + def all(self) -> List[DetectionBox]: + """ Returns all EvalBoxes in a list. """ + ab = [] + for sample_token in self.sample_tokens: + ab.extend(self[sample_token]) + return ab + + @property + def sample_tokens(self) -> List[str]: + """ Returns a list of all keys. """ + return list(self.boxes.keys()) + + def add_boxes(self, sample_token: str, boxes: List[DetectionBox]) -> None: + """ Adds a list of boxes. """ + self.boxes[sample_token].extend(boxes) + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return {key: [box.serialize() for box in boxes] for key, boxes in self.boxes.items()} + + @classmethod + def deserialize(cls, content: dict, box_cls): + """ + Initialize from serialized content. + :param content: A dictionary with the serialized content of the box. + :param box_cls: The class of the boxes, DetectionBox or TrackingBox. + """ + eb = cls() + for sample_token, boxes in content.items(): + eb.add_boxes(sample_token, [box_cls.deserialize(box) for box in boxes]) + return eb + + +def accumulate(gt_boxes, + pred_boxes, + class_name: str, + dist_fcn: Callable, + dist_th: float, + verbose: bool = False) -> DetectionMetricData: + """ + Average Precision over predefined different recall thresholds for a single distance threshold. + The recall/conf thresholds and other raw metrics will be used in secondary metrics. + :param gt_boxes: Maps every sample_token to a list of its sample_annotations. + :param pred_boxes: Maps every sample_token to a list of its sample_results. 
+ :param class_name: Class to compute AP on. + :param dist_fcn: Distance function used to match detections and ground truths. + :param dist_th: Distance threshold for a match. + :param verbose: If true, print debug messages. + :return: (average_prec, metrics). The average precision value and raw data for a number of metrics. + """ + # --------------------------------------------- + # Organize input and initialize accumulators. + # --------------------------------------------- + + # Count the positives. + npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name]) + if verbose: + print("Found {} GT of class {} out of {} total across {} samples.". + format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens))) + + # For missing classes in the GT, return a data structure corresponding to no predictions. + if npos == 0: + return DetectionMetricData.no_predictions() + + # Organize the predictions in a single list. + pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name] + pred_confs = [box.detection_score for box in pred_boxes_list] + + if verbose: + print("Found {} PRED of class {} out of {} total across {} samples.". + format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens))) + + # Sort by confidence. + sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1] + + # Do the actual matching. + tp = [] # Accumulator of true positives. + fp = [] # Accumulator of false positives. + conf = [] # Accumulator of confidences. + + # match_data holds the extra metrics we calculate for each match. + match_data = {'trans_err': [], + 'vel_err': [], + 'scale_err': [], + 'orient_err': [], + 'attr_err': [], + 'conf': []} + + # --------------------------------------------- + # Match and accumulate match data. + # --------------------------------------------- + + taken = set() # Initially no gt bounding box is matched. + for ind in sortind: + pred_box = pred_boxes_list[ind] + min_dist = np.inf + match_gt_idx = None + + for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]): + + # Find closest match among ground truth boxes + if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken: + this_distance = dist_fcn(gt_box, pred_box) + if this_distance < min_dist: + min_dist = this_distance + match_gt_idx = gt_idx + + # If the closest match is close enough according to threshold we have a match! + is_match = min_dist < dist_th + + if is_match: + taken.add((pred_box.sample_token, match_gt_idx)) + + # Update tp, fp and confs. + tp.append(1) + fp.append(0) + conf.append(pred_box.detection_score) + + # Since it is a match, update match data also. + gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx] + + match_data['trans_err'].append(center_distance(gt_box_match, pred_box)) + match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box)) + match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box)) + + # Barrier orientation is only determined up to 180 degree. (For cones orientation is discarded later) + period = np.pi if class_name == 'barrier' else 2 * np.pi + match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period)) + + match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box)) + match_data['conf'].append(pred_box.detection_score) + + else: + # No match. Mark this as a false positive. + tp.append(0) + fp.append(1) + conf.append(pred_box.detection_score) + + # Check if we have any matches. 
If not, just return a "no predictions" array. + if len(match_data['trans_err']) == 0: + return DetectionMetricData.no_predictions() + + # --------------------------------------------- + # Calculate and interpolate precision and recall + # --------------------------------------------- + + # Accumulate. + tp = np.cumsum(tp).astype(float) + fp = np.cumsum(fp).astype(float) + conf = np.array(conf) + + # Calculate precision and recall. + prec = tp / (fp + tp) + rec = tp / float(npos) + + rec_interp = np.linspace(0, 1, DetectionMetricData.nelem) # 101 steps, from 0% to 100% recall. + prec = np.interp(rec_interp, rec, prec, right=0) + conf = np.interp(rec_interp, rec, conf, right=0) + rec = rec_interp + + # --------------------------------------------- + # Re-sample the match-data to match, prec, recall and conf. + # --------------------------------------------- + + for key in match_data.keys(): + if key == "conf": + continue # Confidence is used as reference to align with fp and tp. So skip in this step. + + else: + # For each match_data, we first calculate the accumulated mean. + tmp = cummean(np.array(match_data[key])) + + # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays) + match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1] + + # --------------------------------------------- + # Done. Instantiate MetricData and return + # --------------------------------------------- + return DetectionMetricData(recall=rec, + precision=prec, + confidence=conf, + trans_err=match_data['trans_err'], + vel_err=match_data['vel_err'], + scale_err=match_data['scale_err'], + orient_err=match_data['orient_err'], + attr_err=match_data['attr_err']) + + + +def calc_ap(md: DetectionMetricData, min_recall: float, min_precision: float) -> float: + """ Calculated average precision. """ + + assert 0 <= min_precision < 1 + assert 0 <= min_recall <= 1 + + prec = np.copy(md.precision) + prec = prec[round(100 * min_recall) + 1:] # Clip low recalls. +1 to exclude the min recall bin. + prec -= min_precision # Clip low precision + prec[prec < 0] = 0 + return float(np.mean(prec)) / (1.0 - min_precision) + + +def calc_tp(md: DetectionMetricData, min_recall: float, metric_name: str) -> float: + """ Calculates true positive errors. """ + + first_ind = round(100 * min_recall) + 1 # +1 to exclude the error at min recall. + last_ind = md.max_recall_ind # First instance of confidence = 0 is index of max achieved recall. + if last_ind < first_ind: + return 1.0 # Assign 1 here. If this happens for all classes, the score for that TP metric will be 0. + else: + return float(np.mean(getattr(md, metric_name)[first_ind: last_ind + 1])) # +1 to include error at max recall. + + +def quaternion_yaw(q: Quaternion) -> float: + """ + Calculate the yaw angle from a quaternion. + Note that this only works for a quaternion that represents a box in lidar or global coordinate frame. + It does not work for a box in the camera frame. + :param q: Quaternion of interest. + :return: Yaw angle in radians. + """ + + # Project into xy plane. + v = np.dot(q.rotation_matrix, np.array([1, 0, 0])) + + # Measure yaw using arctan. 
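`calc_ap` above clips away the low-recall and low-precision regions before averaging. A worked example, not part of the patch: with a flat precision of 0.55 on the 101-point recall grid and the min_recall = min_precision = 0.1 values commonly used with the CVPR 2019 detection config, the AP comes out to 0.5.

# Editor's sketch (not part of the patch): the arithmetic inside calc_ap on a flat curve.
import numpy as np

min_recall, min_precision = 0.1, 0.1
prec = np.full(101, 0.55)                        # md.precision on the interpolated recall grid
prec = prec[round(100 * min_recall) + 1:]        # drop the low-recall bins (indices 0..10)
prec -= min_precision                            # subtract the low-precision band ...
prec[prec < 0] = 0                               # ... and floor at zero
ap = float(np.mean(prec)) / (1.0 - min_precision)
assert np.isclose(ap, 0.5)                       # (0.55 - 0.1) / (1 - 0.1)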
+ yaw = np.arctan2(v[1], v[0]) + + return yaw \ No newline at end of file diff --git a/mmcv/datasets/nuscenes_vad_dataset.py b/mmcv/datasets/nuscenes_vad_dataset.py new file mode 100644 index 0000000..a552afb --- /dev/null +++ b/mmcv/datasets/nuscenes_vad_dataset.py @@ -0,0 +1,1933 @@ +import os +import json +import copy +import tempfile +from typing import Dict, List +from mmcv.fileio.io import dump,load +import numpy as np +from .builder import DATASETS +from mmcv.datasets import NuScenesDataset +import pyquaternion +import mmcv +from os import path as osp +import torch +import numpy as np +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from .vad_custom_nuscenes_eval import NuScenesEval_custom +from nuscenes.eval.common.utils import center_distance +from mmcv.utils.visual import save_tensor +from mmcv.parallel import DataContainer as DC +import random +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from nuscenes.utils.data_classes import Box as NuScenesBox +from mmcv.core.bbox.structures.nuscenes_box import CustomNuscenesBox +from shapely import affinity, ops +from shapely.geometry import LineString, box, MultiPolygon, MultiLineString +from mmcv.datasets.pipelines import to_tensor +from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer +from nuscenes.eval.detection.constants import DETECTION_NAMES + + +class LiDARInstanceLines(object): + """Line instance in LIDAR coordinates + + """ + def __init__(self, + instance_line_list, + sample_dist=1, + num_samples=250, + padding=False, + fixed_num=-1, + padding_value=-10000, + patch_size=None): + assert isinstance(instance_line_list, list) + assert patch_size is not None + if len(instance_line_list) != 0: + assert isinstance(instance_line_list[0], LineString) + self.patch_size = patch_size + self.max_x = self.patch_size[1] / 2 + self.max_y = self.patch_size[0] / 2 + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = padding + self.fixed_num = fixed_num + self.padding_value = padding_value + + self.instance_list = instance_line_list + + @property + def start_end_points(self): + """ + return torch.Tensor([N,4]), in xstart, ystart, xend, yend form + """ + assert len(self.instance_list) != 0 + instance_se_points_list = [] + for instance in self.instance_list: + se_points = [] + se_points.extend(instance.coords[0]) + se_points.extend(instance.coords[-1]) + instance_se_points_list.append(se_points) + instance_se_points_array = np.array(instance_se_points_list) + instance_se_points_tensor = to_tensor(instance_se_points_array) + instance_se_points_tensor = instance_se_points_tensor.to( + dtype=torch.float32) + instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_se_points_tensor + + @property + def bbox(self): + """ + return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form + """ + assert len(self.instance_list) != 0 + instance_bbox_list = [] + for instance in self.instance_list: + # bounds is bbox: [xmin, ymin, xmax, ymax] + instance_bbox_list.append(instance.bounds) + instance_bbox_array = np.array(instance_bbox_list) + instance_bbox_tensor = 
to_tensor(instance_bbox_array) + instance_bbox_tensor = instance_bbox_tensor.to( + dtype=torch.float32) + instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y) + instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x) + instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y) + return instance_bbox_tensor + + @property + def fixed_num_sampled_points(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def fixed_num_sampled_points_ambiguity(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + instance_points_list.append(sampled_points) + instance_points_array = np.array(instance_points_list) + instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + instance_points_tensor = instance_points_tensor.unsqueeze(1) + return instance_points_tensor + + @property + def fixed_num_sampled_points_torch(self): + """ + return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form + N means the num of instances + """ + assert len(self.instance_list) != 0 + instance_points_list = [] + for instance in self.instance_list: + # distances = np.linspace(0, instance.length, self.fixed_num) + # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + poly_pts = to_tensor(np.array(list(instance.coords))) + poly_pts = poly_pts.unsqueeze(0).permute(0,2,1) + sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True) + sampled_pts = sampled_pts.permute(0,2,1).squeeze(0) + instance_points_list.append(sampled_pts) + # instance_points_array = np.array(instance_points_list) + # instance_points_tensor = to_tensor(instance_points_array) + instance_points_tensor = torch.stack(instance_points_list,dim=0) + instance_points_tensor = instance_points_tensor.to( + dtype=torch.float32) + 
instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) + instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) + return instance_points_tensor + + @property + def shift_fixed_num_sampled_points(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v1(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + shift_pts_list = [] + if is_poly: + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v2(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, 
instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + + if shifts_num > final_shift_num: + index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False) + multi_shifts_pts = multi_shifts_pts[index] + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < final_shift_num: + padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v3(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + assert len(self.instance_list) != 0 + instances_list = [] + for instance in self.instance_list: + distances = np.linspace(0, instance.length, self.fixed_num) + poly_pts = np.array(list(instance.coords)) + start_pts = poly_pts[0] + end_pts = poly_pts[-1] + is_poly = np.equal(start_pts, end_pts) + is_poly = is_poly.all() + shift_pts_list = [] + pts_num, coords_num = poly_pts.shape + shift_num = pts_num - 1 + final_shift_num = self.fixed_num - 1 + if is_poly: + pts_to_shift = poly_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + flip_pts_to_shift = np.flip(pts_to_shift, axis=0) + for shift_right_i in range(shift_num): + shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0) + pts_to_concat = shift_pts[0] + pts_to_concat = np.expand_dims(pts_to_concat,axis=0) + shift_pts = 
np.concatenate((shift_pts,pts_to_concat),axis=0) + shift_instance = LineString(shift_pts) + shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + shift_pts_list.append(shift_sampled_points) + # import pdb;pdb.set_trace() + else: + sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + flip_sampled_points = np.flip(sampled_points, axis=0) + shift_pts_list.append(sampled_points) + shift_pts_list.append(flip_sampled_points) + + multi_shifts_pts = np.stack(shift_pts_list,axis=0) + shifts_num,_,_ = multi_shifts_pts.shape + # import pdb;pdb.set_trace() + if shifts_num > 2*final_shift_num: + index = np.random.choice(shift_num, final_shift_num, replace=False) + flip0_shifts_pts = multi_shifts_pts[index] + flip1_shifts_pts = multi_shifts_pts[index+shift_num] + multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0) + + multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) + multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( + dtype=torch.float32) + + multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) + multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) + # if not is_poly: + if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num: + padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) + multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) + instances_list.append(multi_shifts_pts_tensor) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_v4(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + pts_num = fixed_num_pts.shape[0] + shift_num = pts_num - 1 + shift_pts_list = [] + if is_poly: + pts_to_shift = fixed_num_pts[:-1,:] + for shift_right_i in range(shift_num): + shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) + flip_pts_to_shift = pts_to_shift.flip(0) + for shift_right_i in range(shift_num): + shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + if is_poly: + _, _, num_coords = shift_pts.shape + tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords)) + tmp_shift_pts[:,:-1,:] = shift_pts + tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] + shift_pts = tmp_shift_pts + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = 
instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + @property + def shift_fixed_num_sampled_points_torch(self): + """ + return [instances_num, num_shifts, fixed_num, 2] + """ + fixed_num_sampled_points = self.fixed_num_sampled_points_torch + instances_list = [] + is_poly = False + # is_line = False + # import pdb;pdb.set_trace() + for fixed_num_pts in fixed_num_sampled_points: + # [fixed_num, 2] + is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) + fixed_num = fixed_num_pts.shape[0] + shift_pts_list = [] + if is_poly: + # import pdb;pdb.set_trace() + for shift_right_i in range(fixed_num): + shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) + else: + shift_pts_list.append(fixed_num_pts) + shift_pts_list.append(fixed_num_pts.flip(0)) + shift_pts = torch.stack(shift_pts_list,dim=0) + + shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) + shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) + + if not is_poly: + padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) + shift_pts = torch.cat([shift_pts,padding],dim=0) + # padding = np.zeros((self.num_samples - len(sampled_points), 2)) + # sampled_points = np.concatenate([sampled_points, padding], axis=0) + instances_list.append(shift_pts) + instances_tensor = torch.stack(instances_list, dim=0) + instances_tensor = instances_tensor.to( + dtype=torch.float32) + return instances_tensor + + # @property + # def polyline_points(self): + # """ + # return [[x0,y0],[x1,y1],...] + # """ + # assert len(self.instance_list) != 0 + # for instance in self.instance_list: + + +class VectorizedLocalMap(object): + CLASS2LABEL = { + 'road_divider': 0, + 'lane_divider': 0, + 'ped_crossing': 1, + 'contours': 2, + 'others': -1 + } + def __init__(self, + dataroot, + patch_size, + map_classes=['divider','ped_crossing','boundary'], + line_classes=['road_divider', 'lane_divider'], + ped_crossing_classes=['ped_crossing'], + contour_classes=['road_segment', 'lane'], + sample_dist=1, + num_samples=250, + padding=False, + fixed_ptsnum_per_line=-1, + padding_value=-10000,): + ''' + Args: + fixed_ptsnum_per_line = -1 : no fixed num + ''' + super().__init__() + self.data_root = dataroot + self.MAPS = ['boston-seaport', 'singapore-hollandvillage', + 'singapore-onenorth', 'singapore-queenstown'] + self.vec_classes = map_classes + self.line_classes = line_classes + self.ped_crossing_classes = ped_crossing_classes + self.polygon_classes = contour_classes + self.nusc_maps = {} + self.map_explorer = {} + for loc in self.MAPS: + self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc) + self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc]) + + self.patch_size = patch_size + self.sample_dist = sample_dist + self.num_samples = num_samples + self.padding = padding + self.fixed_num = fixed_ptsnum_per_line + self.padding_value = padding_value + + def gen_vectorized_samples(self, location, lidar2global_translation, lidar2global_rotation): + ''' + use lidar2global to get gt map layers + ''' + + map_pose = lidar2global_translation[:2] + rotation = Quaternion(lidar2global_rotation) + + patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + # import pdb;pdb.set_trace() + vectors = [] + for vec_class in self.vec_classes: + if vec_class == 'divider': + line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location) + line_instances_dict = 
self.line_geoms_to_instances(line_geom) + for line_type, instances in line_instances_dict.items(): + for instance in instances: + vectors.append((instance, self.CLASS2LABEL.get(line_type, -1))) + elif vec_class == 'ped_crossing': + ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location) + # ped_vector_list = self.ped_geoms_to_vectors(ped_geom) + ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) + # import pdb;pdb.set_trace() + for instance in ped_instance_list: + vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) + elif vec_class == 'boundary': + polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location) + # import pdb;pdb.set_trace() + poly_bound_list = self.poly_geoms_to_instances(polygon_geom) + # import pdb;pdb.set_trace() + for contour in poly_bound_list: + vectors.append((contour, self.CLASS2LABEL.get('contours', -1))) + else: + raise ValueError(f'WRONG vec_class: {vec_class}') + + # filter out -1 + filtered_vectors = [] + gt_pts_loc_3d = [] + gt_pts_num_3d = [] + gt_labels = [] + gt_instance = [] + for instance, type in vectors: + if type != -1: + gt_instance.append(instance) + gt_labels.append(type) + + gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist, + self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size) + + anns_results = dict( + gt_vecs_pts_loc=gt_instance, + gt_vecs_label=gt_labels, + + ) + # import pdb;pdb.set_trace() + return anns_results + + def get_map_geom(self, patch_box, patch_angle, layer_names, location): + map_geom = [] + for layer_name in layer_names: + if layer_name in self.line_classes: + # import pdb;pdb.set_trace() + geoms = self.get_divider_line(patch_box, patch_angle, layer_name, location) + # import pdb;pdb.set_trace() + # geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.polygon_classes: + geoms = self.get_contour_line(patch_box, patch_angle, layer_name, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + elif layer_name in self.ped_crossing_classes: + geoms = self.get_ped_crossing_line(patch_box, patch_angle, location) + # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) + map_geom.append((layer_name, geoms)) + return map_geom + + def _one_type_line_geom_to_vectors(self, line_geom): + line_vectors = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_vectors.append(self.sample_pts_from_line(single_line)) + elif line.geom_type == 'LineString': + line_vectors.append(self.sample_pts_from_line(line)) + else: + raise NotImplementedError + return line_vectors + + def _one_type_line_geom_to_instances(self, line_geom): + line_instances = [] + + for line in line_geom: + if not line.is_empty: + if line.geom_type == 'MultiLineString': + for single_line in line.geoms: + line_instances.append(single_line) + elif line.geom_type == 'LineString': + line_instances.append(line) + else: + raise NotImplementedError + return line_instances + + def poly_geoms_to_vectors(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] 
/ 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def ped_poly_geoms_to_instances(self, ped_geom): + # import pdb;pdb.set_trace() + ped = ped_geom[0][1] + union_segments = ops.unary_union(ped) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + + def poly_geoms_to_instances(self, polygon_geom): + roads = polygon_geom[0][1] + lanes = polygon_geom[1][1] + union_roads = ops.unary_union(roads) + union_lanes = ops.unary_union(lanes) + union_segments = ops.unary_union([union_roads, union_lanes]) + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + exteriors = [] + interiors = [] + if union_segments.geom_type != 'MultiPolygon': + union_segments = MultiPolygon([union_segments]) + for poly in union_segments.geoms: + exteriors.append(poly.exterior) + for inter in poly.interiors: + interiors.append(inter) + + results = [] + for ext in exteriors: + if ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + for inter in interiors: + if not inter.is_ccw: + inter.coords = list(inter.coords)[::-1] + lines = inter.intersection(local_patch) + if isinstance(lines, MultiLineString): + lines = ops.linemerge(lines) + results.append(lines) + + return self._one_type_line_geom_to_instances(results) + + def line_geoms_to_vectors(self, line_geom): + line_vectors_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines) + line_vectors_dict[line_type] = one_type_vectors + + return line_vectors_dict + def line_geoms_to_instances(self, line_geom): + 
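# ----------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the poly_geoms_to_* helpers above
# turn road/lane polygons into boundary polylines by (1) merging them with
# ops.unary_union, (2) walking every exterior/interior ring, and (3) clipping
# each ring to the local BEV patch and re-merging the clipped pieces. A toy,
# self-contained version of that pipeline (the is_ccw orientation fix-up is
# omitted; the patch size and polygons are made up):
import shapely.ops as ops
from shapely.geometry import box, MultiPolygon, MultiLineString

demo_roads = [box(-5, -5, 5, 5), box(3, -2, 12, 2)]    # two overlapping toy polygons
demo_union = ops.unary_union(demo_roads)
if demo_union.geom_type != 'MultiPolygon':
    demo_union = MultiPolygon([demo_union])

demo_local_patch = box(-8, -8, 8, 8)                   # clipping window around the ego
demo_boundaries = []
for demo_poly in demo_union.geoms:
    for demo_ring in [demo_poly.exterior, *demo_poly.interiors]:
        demo_clipped = demo_ring.intersection(demo_local_patch)
        if isinstance(demo_clipped, MultiLineString):
            demo_clipped = ops.linemerge(demo_clipped)
        if not demo_clipped.is_empty:
            demo_boundaries.append(demo_clipped)
# demo_boundaries plays the role of the instance list later wrapped in LiDARInstanceLines.
# ----------------------------------------------------------------------------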
line_instances_dict = dict() + for line_type, a_type_of_lines in line_geom: + one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines) + line_instances_dict[line_type] = one_type_instances + + return line_instances_dict + + def ped_geoms_to_vectors(self, ped_geom): + ped_geom = ped_geom[0][1] + union_ped = ops.unary_union(ped_geom) + if union_ped.geom_type != 'MultiPolygon': + union_ped = MultiPolygon([union_ped]) + + max_x = self.patch_size[1] / 2 + max_y = self.patch_size[0] / 2 + local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) + results = [] + for ped_poly in union_ped: + # rect = ped_poly.minimum_rotated_rectangle + ext = ped_poly.exterior + if not ext.is_ccw: + ext.coords = list(ext.coords)[::-1] + lines = ext.intersection(local_patch) + results.append(lines) + + return self._one_type_line_geom_to_vectors(results) + + def get_contour_line(self,patch_box,patch_angle,layer_name,location): + if layer_name not in self.map_explorer[location].map_api.non_geometric_polygon_layers: + raise ValueError('{} is not a polygonal layer'.format(layer_name)) + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + + records = getattr(self.map_explorer[location].map_api, layer_name) + + polygon_list = [] + if layer_name == 'drivable_area': + for record in records: + polygons = [self.map_explorer[location].map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] + + for polygon in polygons: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + else: + for record in records: + polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) + + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def get_divider_line(self,patch_box,patch_angle,layer_name,location): + if layer_name not in self.map_explorer[location].map_api.non_geometric_line_layers: + raise ValueError("{} is not a line layer".format(layer_name)) + + if layer_name == 'traffic_light': + return None + + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + + line_list = [] + records = getattr(self.map_explorer[location].map_api, layer_name) + for record in records: + line = self.map_explorer[location].map_api.extract_line(record['line_token']) + if line.is_empty: # Skip lines without nodes. 
+ continue + + new_line = line.intersection(patch) + if not new_line.is_empty: + new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) + new_line = affinity.affine_transform(new_line, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + line_list.append(new_line) + + return line_list + + def get_ped_crossing_line(self, patch_box, patch_angle, location): + patch_x = patch_box[0] + patch_y = patch_box[1] + + patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) + polygon_list = [] + records = getattr(self.map_explorer[location].map_api, 'ped_crossing') + # records = getattr(self.nusc_maps[location], 'ped_crossing') + for record in records: + polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) + if polygon.is_valid: + new_polygon = polygon.intersection(patch) + if not new_polygon.is_empty: + new_polygon = affinity.rotate(new_polygon, -patch_angle, + origin=(patch_x, patch_y), use_radians=False) + new_polygon = affinity.affine_transform(new_polygon, + [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) + if new_polygon.geom_type == 'Polygon': + new_polygon = MultiPolygon([new_polygon]) + polygon_list.append(new_polygon) + + return polygon_list + + def sample_pts_from_line(self, line): + if self.fixed_num < 0: + distances = np.arange(0, line.length, self.sample_dist) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + else: + # fixed number of points, so distance is line.length / self.fixed_num + distances = np.linspace(0, line.length, self.fixed_num) + sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) + + # tmpdistances = np.linspace(0, line.length, 2) + # tmpsampled_points = np.array([list(line.interpolate(tmpdistance).coords) for tmpdistance in tmpdistances]).reshape(-1, 2) + # import pdb;pdb.set_trace() + # if self.normalize: + # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + + num_valid = len(sampled_points) + + if not self.padding or self.fixed_num > 0: + # fixed num sample can return now! + return sampled_points, num_valid + + # fixed distance sampling need padding! + num_valid = len(sampled_points) + + if self.fixed_num < 0: + if num_valid < self.num_samples: + padding = np.zeros((self.num_samples - len(sampled_points), 2)) + sampled_points = np.concatenate([sampled_points, padding], axis=0) + else: + sampled_points = sampled_points[:self.num_samples, :] + num_valid = self.num_samples + + # if self.normalize: + # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) + # num_valid = len(sampled_points) + + return sampled_points, num_valid + + +############################################################################################################### +############################################################################################################### +############################################################################################################### + +class v1CustomDetectionConfig: + """ Data class that specifies the detection evaluation settings. """ + + def __init__(self, + class_range_x: Dict[str, int], + class_range_y: Dict[str, int], + dist_fcn: str, + dist_ths: List[float], + dist_th_tp: float, + min_recall: float, + min_precision: float, + max_boxes_per_sample: int, + mean_ap_weight: int): + + assert set(class_range_x.keys()) == set(DETECTION_NAMES), "Class count mismatch." 
+ assert dist_th_tp in dist_ths, "dist_th_tp must be in set of dist_ths." + + self.class_range_x = class_range_x + self.class_range_y = class_range_y + self.dist_fcn = dist_fcn + self.dist_ths = dist_ths + self.dist_th_tp = dist_th_tp + self.min_recall = min_recall + self.min_precision = min_precision + self.max_boxes_per_sample = max_boxes_per_sample + self.mean_ap_weight = mean_ap_weight + + self.class_names = self.class_range_y.keys() + + def __eq__(self, other): + eq = True + for key in self.serialize().keys(): + eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) + return eq + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'class_range_x': self.class_range_x, + 'class_range_y': self.class_range_y, + 'dist_fcn': self.dist_fcn, + 'dist_ths': self.dist_ths, + 'dist_th_tp': self.dist_th_tp, + 'min_recall': self.min_recall, + 'min_precision': self.min_precision, + 'max_boxes_per_sample': self.max_boxes_per_sample, + 'mean_ap_weight': self.mean_ap_weight + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized dictionary. """ + return cls(content['class_range_x'], + content['class_range_y'], + content['dist_fcn'], + content['dist_ths'], + content['dist_th_tp'], + content['min_recall'], + content['min_precision'], + content['max_boxes_per_sample'], + content['mean_ap_weight']) + + @property + def dist_fcn_callable(self): + """ Return the distance function corresponding to the dist_fcn string. """ + if self.dist_fcn == 'center_distance': + return center_distance + else: + raise Exception('Error: Unknown distance function %s!' % self.dist_fcn) + +@DATASETS.register_module() +class VADCustomNuScenesDataset(NuScenesDataset): + r"""Custom NuScenes Dataset. + """ + MAPCLASSES = ('divider',) + def __init__( + self, + queue_length=4, + bev_size=(200, 200), + overlap_test=False, + with_attr=True, + fut_ts=6, + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + map_classes=None, + map_ann_file=None, + map_fixed_ptsnum_per_line=-1, + map_eval_use_same_gt_sample_num_flag=False, + padding_value=-10000, + use_pkl_result=False, + custom_eval_version='vad_nusc_detection_cvpr_2019', + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + self.queue_length = queue_length + self.overlap_test = overlap_test + self.bev_size = bev_size + self.with_attr = with_attr + self.fut_ts = fut_ts + self.use_pkl_result = use_pkl_result + + self.custom_eval_version = custom_eval_version + # Check if config exists. + this_dir = os.path.dirname(os.path.abspath(__file__)) + cfg_path = os.path.join(this_dir, '%s.json' % self.custom_eval_version) + assert os.path.exists(cfg_path), \ + 'Requested unknown configuration {}'.format(self.custom_eval_version) + # Load config file and deserialize it. 
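# ----------------------------------------------------------------------------
# Editorial sketch (not part of the patch): the JSON read below is expected to
# serialize a v1CustomDetectionConfig, i.e. per-class x/y evaluation ranges plus
# the usual nuScenes detection-eval fields. The values here are placeholders for
# illustration only, not the configuration shipped with the repository:
from nuscenes.eval.detection.constants import DETECTION_NAMES

demo_cfg_dict = {
    'class_range_x': {name: 30 for name in DETECTION_NAMES},  # +/- metres along x (placeholder)
    'class_range_y': {name: 30 for name in DETECTION_NAMES},  # +/- metres along y (placeholder)
    'dist_fcn': 'center_distance',
    'dist_ths': [0.5, 1.0, 2.0, 4.0],
    'dist_th_tp': 2.0,
    'min_recall': 0.1,
    'min_precision': 0.1,
    'max_boxes_per_sample': 500,
    'mean_ap_weight': 5,
}
demo_cfg = v1CustomDetectionConfig.deserialize(demo_cfg_dict)  # satisfies both asserts in __init__
# ----------------------------------------------------------------------------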
+ with open(cfg_path, 'r') as f: + data = json.load(f) + self.custom_eval_detection_configs = v1CustomDetectionConfig.deserialize(data) + + self.map_ann_file = map_ann_file + self.MAPCLASSES = self.get_map_classes(map_classes) + self.NUM_MAPCLASSES = len(self.MAPCLASSES) + self.pc_range = pc_range + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + self.patch_size = (patch_h, patch_w) + self.padding_value = padding_value + self.fixed_num = map_fixed_ptsnum_per_line + self.eval_use_same_gt_sample_num_flag = map_eval_use_same_gt_sample_num_flag + self.vector_map = VectorizedLocalMap(kwargs['data_root'], + patch_size=self.patch_size, map_classes=self.MAPCLASSES, + fixed_ptsnum_per_line=map_fixed_ptsnum_per_line, + padding_value=self.padding_value) + self.is_vis_on_test = True + + @classmethod + def get_map_classes(cls, map_classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. + """ + if map_classes is None: + return cls.MAPCLASSES + + if isinstance(map_classes, str): + # take it as a file path + class_names = mmcv.list_from_file(map_classes) + elif isinstance(map_classes, (tuple, list)): + class_names = map_classes + else: + raise ValueError(f'Unsupported type {type(map_classes)} of map classes.') + + return class_names + + def vectormap_pipeline(self, example, input_dict): + ''' + `example` type: + keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; + all keys type is 'DataContainer'; + 'img_metas' cpu_only=True, type is dict, others are false; + 'gt_labels_3d' shape torch.size([num_samples]), stack=False, + padding_value=0, cpu_only=False + 'gt_bboxes_3d': stack=False, cpu_only=True + ''' + # import pdb;pdb.set_trace() + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + + lidar2global = ego2global @ lidar2ego + + lidar2global_translation = list(lidar2global[:3,3]) + lidar2global_rotation = list(Quaternion(matrix=lidar2global).q) + + location = input_dict['map_location'] + ego2global_translation = input_dict['ego2global_translation'] + ego2global_rotation = input_dict['ego2global_rotation'] + anns_results = self.vector_map.gen_vectorized_samples( + location, lidar2global_translation, lidar2global_rotation + ) + + ''' + anns_results, type: dict + 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates + 'gt_vecs_pts_num': list[num_vecs], vec with num_points + 'gt_vecs_label': list[num_vecs], vec with cls index + ''' + gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) + if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): + gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] + else: + gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) + try: + gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) + except: + # empty tensor, will be passed in train, + # but we preserve it for test + gt_vecs_pts_loc = gt_vecs_pts_loc + + example['map_gt_labels_3d'] = 
DC(gt_vecs_label, cpu_only=False) + example['map_gt_bboxes_3d'] = DC(gt_vecs_pts_loc, cpu_only=True) + + return example + + def prepare_train_data(self, index): + """ + Training data preparation. + Args: + index (int): Index for accessing the target data. + Returns: + dict: Training data dict of the corresponding index. + """ + data_queue = [] + + # temporal aug + prev_indexs_list = list(range(index-self.queue_length, index)) + random.shuffle(prev_indexs_list) + prev_indexs_list = sorted(prev_indexs_list[1:], reverse=True) + ## + + input_dict = self.get_data_info(index) + if input_dict is None: + return None + frame_idx = input_dict['frame_idx'] + scene_token = input_dict['scene_token'] + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + ((example is None or ~(example['gt_labels_3d']._data != -1).any()) or \ + (example is None or ~(example['map_gt_labels_3d']._data != -1).any())): + return None + data_queue.insert(0, example) + for i in prev_indexs_list: + i = max(0, i) + input_dict = self.get_data_info(i) + if input_dict is None: + return None + if input_dict['frame_idx'] < frame_idx and input_dict['scene_token'] == scene_token: + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + example = self.vectormap_pipeline(example,input_dict) + if self.filter_empty_gt and \ + (example is None or ~(example['gt_labels_3d']._data != -1).any()) and \ + (example is None or ~(example['map_gt_labels_3d']._data != -1).any()): + return None + frame_idx = input_dict['frame_idx'] + data_queue.insert(0, copy.deepcopy(example)) + return self.union2one(data_queue) + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. + """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + if self.is_vis_on_test: + example = self.vectormap_pipeline(example, input_dict) + return example + + def union2one(self, queue): + """ + convert sample queue into one single sample. + """ + imgs_list = [each['img'].data for each in queue] + metas_map = {} + prev_pos = None + prev_angle = None + for i, each in enumerate(queue): + metas_map[i] = each['img_metas'].data + if i == 0: + metas_map[i]['prev_bev'] = False + prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] = 0 + metas_map[i]['can_bus'][-1] = 0 + else: + metas_map[i]['prev_bev'] = True + tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) + tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) + metas_map[i]['can_bus'][:3] -= prev_pos + metas_map[i]['can_bus'][-1] -= prev_angle + prev_pos = copy.deepcopy(tmp_pos) + prev_angle = copy.deepcopy(tmp_angle) + + queue[-1]['img'] = DC(torch.stack(imgs_list), + cpu_only=False, stack=True) + queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) + queue = queue[-1] + return queue + + def get_ann_info(self, index): + """Get annotation info according to the given index. + + Args: + index (int): Index of the annotation data to get. + + Returns: + dict: Annotation information consists of the following keys: + + - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \ + 3D ground truth bboxes + - gt_labels_3d (np.ndarray): Labels of ground truths. + - gt_names (list[str]): Class names of ground truths. 
+ """ + info = self.data_infos[index] + # filter out bbox containing no points + if self.use_valid_flag: + mask = info['valid_flag'] + else: + mask = info['num_lidar_pts'] > 0 + gt_bboxes_3d = info['gt_boxes'][mask] + gt_names_3d = info['gt_names'][mask] + gt_labels_3d = [] + for cat in gt_names_3d: + if cat in self.CLASSES: + gt_labels_3d.append(self.CLASSES.index(cat)) + else: + gt_labels_3d.append(-1) + gt_labels_3d = np.array(gt_labels_3d) + + if self.with_velocity: + gt_velocity = info['gt_velocity'][mask] + nan_mask = np.isnan(gt_velocity[:, 0]) + gt_velocity[nan_mask] = [0.0, 0.0] + gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) + + if self.with_attr: + gt_fut_trajs = info['gt_agent_fut_trajs'][mask] + gt_fut_masks = info['gt_agent_fut_masks'][mask] + gt_fut_goal = info['gt_agent_fut_goal'][mask] + gt_lcf_feat = info['gt_agent_lcf_feat'][mask] + gt_fut_yaw = info['gt_agent_fut_yaw'][mask] + attr_labels = np.concatenate( + [gt_fut_trajs, gt_fut_masks, gt_fut_goal[..., None], gt_lcf_feat, gt_fut_yaw], axis=-1 + ).astype(np.float32) + + # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be + # the same as KITTI (0.5, 0.5, 0) + gt_bboxes_3d = LiDARInstance3DBoxes( + gt_bboxes_3d, + box_dim=gt_bboxes_3d.shape[-1], + origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) + + anns_results = dict( + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + gt_names=gt_names_3d, + attr_labels=attr_labels) + + return anns_results + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - sweeps (list[dict]): Infos of sweeps. + - timestamp (float): Sample timestamp. + - img_filename (str, optional): Image filename. + - lidar2img (list[np.ndarray], optional): Transformations \ + from lidar to different cameras. + - ann_info (dict): Annotation info. 
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info['token'], + #pts_filename=info['lidar_path'], + #sweeps=info['sweeps'], + ego2global_translation=info['ego2global_translation'], + ego2global_rotation=info['ego2global_rotation'], + lidar2ego_translation=info['lidar2ego_translation'], + lidar2ego_rotation=info['lidar2ego_rotation'], + prev_idx=info['prev'], + next_idx=info['next'], + scene_token=info['scene_token'], + can_bus=info['can_bus'], + frame_idx=info['frame_idx'], + timestamp=info['timestamp'] / 1e6, + fut_valid_flag=info['fut_valid_flag'], + map_location=info['map_location'], + ego_his_trajs=info['gt_ego_his_trajs'], + ego_fut_trajs=info['gt_ego_fut_trajs'], + ego_fut_masks=info['gt_ego_fut_masks'], + ego_fut_cmd=info['gt_ego_fut_cmd'], + ego_lcf_feat=info['gt_ego_lcf_feat'] + ) + # lidar to ego transform + lidar2ego = np.eye(4).astype(np.float32) + lidar2ego[:3, :3] = Quaternion(info["lidar2ego_rotation"]).rotation_matrix + lidar2ego[:3, 3] = info["lidar2ego_translation"] + input_dict["lidar2ego"] = lidar2ego + + if self.modality['use_camera']: + image_paths = [] + lidar2img_rts = [] + lidar2cam_rts = [] + cam_intrinsics = [] + input_dict["camera2ego"] = [] + input_dict["camera_intrinsics"] = [] + for cam_type, cam_info in info['cams'].items(): + image_paths.append(cam_info['data_path']) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) + lidar2cam_t = cam_info[ + 'sensor2lidar_translation'] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info['cam_intrinsic'] + viewpad = np.eye(4) + viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic + lidar2img_rt = (viewpad @ lidar2cam_rt.T) + lidar2img_rts.append(lidar2img_rt) + + cam_intrinsics.append(viewpad) + lidar2cam_rts.append(lidar2cam_rt.T) + + # camera to ego transform + camera2ego = np.eye(4).astype(np.float32) + camera2ego[:3, :3] = Quaternion( + cam_info["sensor2ego_rotation"] + ).rotation_matrix + camera2ego[:3, 3] = cam_info["sensor2ego_translation"] + input_dict["camera2ego"].append(camera2ego) + # camera intrinsics + camera_intrinsics = np.eye(4).astype(np.float32) + camera_intrinsics[:3, :3] = cam_info["cam_intrinsic"] + input_dict["camera_intrinsics"].append(camera_intrinsics) + + input_dict.update( + dict( + img_filename=image_paths, + lidar2img=lidar2img_rts, + cam_intrinsic=cam_intrinsics, + lidar2cam=lidar2cam_rts, + )) + + # NOTE: now we load gt in test_mode for evaluating + # if not self.test_mode: + # annos = self.get_ann_info(index) + # input_dict['ann_info'] = annos + + annos = self.get_ann_info(index) + input_dict['ann_info'] = annos + + rotation = Quaternion(input_dict['ego2global_rotation']) + translation = input_dict['ego2global_translation'] + can_bus = input_dict['can_bus'] + can_bus[:3] = translation + can_bus[3:7] = rotation + patch_angle = quaternion_yaw(rotation) / np.pi * 180 + if patch_angle < 0: + patch_angle += 360 + can_bus[-2] = patch_angle / 180 * np.pi + can_bus[-1] = patch_angle + + lidar2ego = np.eye(4) + lidar2ego[:3,:3] = Quaternion(input_dict['lidar2ego_rotation']).rotation_matrix + lidar2ego[:3, 3] = input_dict['lidar2ego_translation'] + ego2global = np.eye(4) + ego2global[:3,:3] = Quaternion(input_dict['ego2global_rotation']).rotation_matrix + ego2global[:3, 3] = input_dict['ego2global_translation'] + lidar2global = ego2global @ lidar2ego + 
input_dict['lidar2global'] = lidar2global + + return input_dict + + def __getitem__(self, idx): + """Get item from infos according to the given index. + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _format_gt(self): + gt_annos = [] + print('Start to convert gt map format...') + # assert self.map_ann_file is not None + if (not os.path.exists(self.map_ann_file)) : + dataset_length = len(self) + prog_bar = mmcv.ProgressBar(dataset_length) + mapped_class_names = self.MAPCLASSES + for sample_id in range(dataset_length): + sample_token = self.data_infos[sample_id]['token'] + gt_anno = {} + gt_anno['sample_token'] = sample_token + # gt_sample_annos = [] + gt_sample_dict = {} + gt_sample_dict = self.vectormap_pipeline(gt_sample_dict, self.data_infos[sample_id]) + gt_labels = gt_sample_dict['map_gt_labels_3d'].data.numpy() + gt_vecs = gt_sample_dict['map_gt_bboxes_3d'].data.instance_list + gt_vec_list = [] + for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)): + name = mapped_class_names[gt_label] + anno = dict( + pts=np.array(list(gt_vec.coords)), + pts_num=len(list(gt_vec.coords)), + cls_name=name, + type=gt_label, + ) + gt_vec_list.append(anno) + gt_anno['vectors']=gt_vec_list + gt_annos.append(gt_anno) + + prog_bar.update() + nusc_submissions = { + 'GTs': gt_annos + } + print('\n GT anns writes to', self.map_ann_file) + dump(nusc_submissions, self.map_ann_file) + else: + print(f'{self.map_ann_file} exist, not update') + + def _format_bbox(self, results, jsonfile_prefix=None, score_thresh=0.2): + """Convert the results to the standard format. + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str): The prefix of the output jsonfile. + You can specify the output directory/filename by + modifying the jsonfile_prefix. Default: None. + + Returns: + str: Path of the output json file. 
+ """ + nusc_annos = {} + det_mapped_class_names = self.CLASSES + + # assert self.map_ann_file is not None + map_pred_annos = {} + map_mapped_class_names = self.MAPCLASSES + + plan_annos = {} + + print('Start to convert detection format...') + for sample_id, det in enumerate(mmcv.track_iter_progress(results)): + annos = [] + boxes = output_to_nusc_box(det) + sample_token = self.data_infos[sample_id]['token'] + + plan_annos[sample_token] = [det['ego_fut_preds'], det['ego_fut_cmd']] + + boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes, + det_mapped_class_names, + self.custom_eval_detection_configs, + self.eval_version) + for i, box in enumerate(boxes): + if box.score < score_thresh: + continue + name = det_mapped_class_names[box.label] + if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2: + if name in [ + 'car', + 'construction_vehicle', + 'bus', + 'truck', + 'trailer', + ]: + attr = 'vehicle.moving' + elif name in ['bicycle', 'motorcycle']: + attr = 'cycle.with_rider' + else: + attr = NuScenesDataset.DefaultAttribute[name] + else: + if name in ['pedestrian']: + attr = 'pedestrian.standing' + elif name in ['bus']: + attr = 'vehicle.stopped' + else: + attr = NuScenesDataset.DefaultAttribute[name] + + nusc_anno = dict( + sample_token=sample_token, + translation=box.center.tolist(), + size=box.wlh.tolist(), + rotation=box.orientation.elements.tolist(), + velocity=box.velocity[:2].tolist(), + detection_name=name, + detection_score=box.score, + attribute_name=attr, + fut_traj=box.fut_trajs.tolist()) + annos.append(nusc_anno) + nusc_annos[sample_token] = annos + + + map_pred_anno = {} + vecs = output_to_vecs(det) + sample_token = self.data_infos[sample_id]['token'] + map_pred_anno['sample_token'] = sample_token + pred_vec_list=[] + for i, vec in enumerate(vecs): + name = map_mapped_class_names[vec['label']] + anno = dict( + # sample_token=sample_token, + pts=vec['pts'], + pts_num=len(vec['pts']), + cls_name=name, + type=vec['label'], + confidence_level=vec['score']) + pred_vec_list.append(anno) + # annos.append(nusc_anno) + # nusc_annos[sample_token] = annos + map_pred_anno['vectors'] = pred_vec_list + map_pred_annos[sample_token] = map_pred_anno + + if not os.path.exists(self.map_ann_file): + self._format_gt() + else: + print(f'{self.map_ann_file} exist, not update') + # with open(self.map_ann_file,'r') as f: + # GT_anns = json.load(f) + # gt_annos = GT_anns['GTs'] + + nusc_submissions = { + 'meta': self.modality, + 'results': nusc_annos, + 'map_results': map_pred_annos, + 'plan_results': plan_annos + # 'GTs': gt_annos + } + + mmcv.mkdir_or_exist(jsonfile_prefix) + if self.use_pkl_result: + res_path = osp.join(jsonfile_prefix, 'results_nusc.pkl') + else: + res_path = osp.join(jsonfile_prefix, 'results_nusc.json') + print('Results writes to', res_path) + dump(nusc_submissions, res_path) + return res_path + + def format_results(self, results, jsonfile_prefix=None): + """Format the results to json (standard format for COCO evaluation). + + Args: + results (list[dict]): Testing results of the dataset. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + + Returns: + tuple: Returns (result_files, tmp_dir), where `result_files` is a \ + dict containing the json filepaths, `tmp_dir` is the temporal \ + directory created for saving json files when \ + `jsonfile_prefix` is not specified. 
+ """ + if isinstance(results, dict): + # print(f'results must be a list, but get dict, keys={results.keys()}') + # assert isinstance(results, list) + results = results['bbox_results'] + assert isinstance(results, list) + # assert len(results) == len(self), ( + # 'The length of results is not equal to the dataset len: {} != {}'. + # format(len(results), len(self))) + + if jsonfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + jsonfile_prefix = osp.join(tmp_dir.name, 'results') + else: + tmp_dir = None + + # currently the output prediction results could be in two formats + # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) + # 2. list of dict('pts_bbox' or 'img_bbox': + # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) + # this is a workaround to enable evaluation of both formats on nuScenes + # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 + if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): + result_files = self._format_bbox(results, jsonfile_prefix) + else: + # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict + result_files = dict() + for name in results[0]: + if name == 'metric_results': + continue + print(f'\nFormating bboxes of {name}') + results_ = [out[name] for out in results] + tmp_file_ = osp.join(jsonfile_prefix, name) + result_files.update( + {name: self._format_bbox(results_, tmp_file_)}) + return result_files, tmp_dir + + def _evaluate_single(self, + result_path, + logger=None, + metric='bbox', + map_metric='chamfer', + result_name='pts_bbox'): + """Evaluation for a single model in nuScenes protocol. + + Args: + result_path (str): Path of the result file. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + metric (str): Metric name used for evaluation. Default: 'bbox'. + result_name (str): Result name in the metric prefix. + Default: 'pts_bbox'. + + Returns: + dict: Dictionary of evaluation details. 
+ """ + detail = dict() + from nuscenes import NuScenes + self.nusc = NuScenes(version=self.version, dataroot=self.data_root, + verbose=False) + + output_dir = osp.join(*osp.split(result_path)[:-1]) + + eval_set_map = { + 'v1.0-mini': 'mini_val', + 'v1.0-trainval': 'val', + } + self.nusc_eval = NuScenesEval_custom( + self.nusc, + config=self.custom_eval_detection_configs, + result_path=result_path, + eval_set=eval_set_map[self.version], + output_dir=output_dir, + verbose=False, + overlap_test=self.overlap_test, + data_infos=self.data_infos + ) + self.nusc_eval.main(plot_examples=0, render_curves=False) + # record metrics + metrics = load(osp.join(output_dir, 'metrics_summary.json')) + metric_prefix = f'{result_name}_NuScenes' + for name in self.CLASSES: + for k, v in metrics['label_aps'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['label_tp_errors'][name].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}_{}'.format(metric_prefix, name, k)] = val + for k, v in metrics['tp_errors'].items(): + val = float('{:.4f}'.format(v)) + detail['{}/{}'.format(metric_prefix, + self.ErrNameMapping[k])] = val + detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] + detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] + + + from mmcv.datasets.map_utils.mean_ap import eval_map + from mmcv.datasets.map_utils.mean_ap import format_res_gt_by_classes + result_path = osp.abspath(result_path) + + print('Formating results & gts by classes') + pred_results = load(result_path) + map_results = pred_results['map_results'] + gt_anns = load(self.map_ann_file) + map_annotations = gt_anns['GTs'] + cls_gens, cls_gts = format_res_gt_by_classes(result_path, + map_results, + map_annotations, + cls_names=self.MAPCLASSES, + num_pred_pts_per_instance=self.fixed_num, + eval_use_same_gt_sample_num_flag=self.eval_use_same_gt_sample_num_flag, + pc_range=self.pc_range) + map_metrics = map_metric if isinstance(map_metric, list) else [map_metric] + allowed_metrics = ['chamfer', 'iou'] + for metric in map_metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + for metric in map_metrics: + print('-*'*10+f'use metric:{metric}'+'-*'*10) + if metric == 'chamfer': + thresholds = [0.5,1.0,1.5] + elif metric == 'iou': + thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES)) + for i, thr in enumerate(thresholds): + print('-*'*10+f'threshhold:{thr}'+'-*'*10) + mAP, cls_ap = eval_map( + map_results, + map_annotations, + cls_gens, + cls_gts, + threshold=thr, + cls_names=self.MAPCLASSES, + logger=logger, + num_pred_pts_per_instance=self.fixed_num, + pc_range=self.pc_range, + metric=metric) + for j in range(self.NUM_MAPCLASSES): + cls_aps[i, j] = cls_ap[j]['ap'] + for i, name in enumerate(self.MAPCLASSES): + print('{}: {}'.format(name, cls_aps.mean(0)[i])) + detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] + print('map: {}'.format(cls_aps.mean(0).mean())) + detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() + for i, name in enumerate(self.MAPCLASSES): + for j, thr in enumerate(thresholds): + if metric == 'chamfer': + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + elif metric == 'iou': + if thr == 0.5 or thr == 0.75: + detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] + + return detail + + def evaluate(self, + results, + 
metric='bbox', + map_metric='chamfer', + logger=None, + jsonfile_prefix=None, + result_names=['pts_bbox'], + show=False, + out_dir=None, + pipeline=None): + """Evaluation in nuScenes protocol. + + Args: + results (list[dict]): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + jsonfile_prefix (str | None): The prefix of json files. It includes + the file path and the prefix of filename, e.g., "a/b/prefix". + If not specified, a temp file will be created. Default: None. + show (bool): Whether to visualize. + Default: False. + out_dir (str): Path to save the visualization results. + Default: None. + pipeline (list[dict], optional): raw data loading for showing. + Default: None. + + Returns: + dict[str, float]: Results of each evaluation metric. + """ + result_metric_names = ['EPA', 'ADE', 'FDE', 'MR'] + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + all_metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + all_metric_dict[met+'_'+cls] = 0.0 + result_dict = {} + for met in result_metric_names: + for cls in motion_cls_names: + result_dict[met+'_'+cls] = 0.0 + + alpha = 0.5 + + for i in range(len(results)): + for key in all_metric_dict.keys(): + all_metric_dict[key] += results[i]['metric_results'][key] + + for cls in motion_cls_names: + result_dict['EPA_'+cls] = (all_metric_dict['hit_'+cls] - \ + alpha * all_metric_dict['fp_'+cls]) / all_metric_dict['gt_'+cls] + result_dict['ADE_'+cls] = all_metric_dict['ADE_'+cls] / all_metric_dict['cnt_ade_'+cls] + result_dict['FDE_'+cls] = all_metric_dict['FDE_'+cls] / all_metric_dict['cnt_fde_'+cls] + result_dict['MR_'+cls] = all_metric_dict['MR_'+cls] / all_metric_dict['cnt_fde_'+cls] + + print('\n') + print('-------------- Motion Prediction --------------') + for k, v in result_dict.items(): + print(f'{k}: {v}') + + # NOTE: print planning metric + print('\n') + print('-------------- Planning --------------') + metric_dict = None + num_valid = 0 + for res in results: + if res['metric_results']['fut_valid_flag']: + num_valid += 1 + else: + continue + if metric_dict is None: + metric_dict = copy.deepcopy(res['metric_results']) + else: + for k in res['metric_results'].keys(): + metric_dict[k] += res['metric_results'][k] + + for k in metric_dict: + metric_dict[k] = metric_dict[k] / num_valid + print("{}:{}".format(k, metric_dict[k])) + + result_files, tmp_dir = self.format_results(results, jsonfile_prefix) + + if isinstance(result_files, dict): + results_dict = dict() + for name in result_names: + print('Evaluating bboxes of {}'.format(name)) + ret_dict = self._evaluate_single(result_files[name], metric=metric, map_metric=map_metric) + results_dict.update(ret_dict) + elif isinstance(result_files, str): + results_dict = self._evaluate_single(result_files, metric=metric, map_metric=map_metric) + + if tmp_dir is not None: + tmp_dir.cleanup() + + if show: + self.show(results, out_dir, pipeline=pipeline) + return results_dict + +def output_to_nusc_box(detection): + """Convert the output to the box class in the nuScenes. + + Args: + detection (dict): Detection results. + + - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. + - scores_3d (torch.Tensor): Detection scores. + - labels_3d (torch.Tensor): Predicted box labels. + + Returns: + list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
+ """ + box3d = detection['boxes_3d'] + scores = detection['scores_3d'].numpy() + labels = detection['labels_3d'].numpy() + trajs = detection['trajs_3d'].numpy() + + + box_gravity_center = box3d.gravity_center.numpy() + box_dims = box3d.dims.numpy() + box_yaw = box3d.yaw.numpy() + # TODO: check whether this is necessary + # with dir_offset & dir_limit in the head + box_yaw = -box_yaw - np.pi / 2 + + box_list = [] + for i in range(len(box3d)): + quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) + velocity = (*box3d.tensor[i, 7:9], 0.0) + # velo_val = np.linalg.norm(box3d[i, 7:9]) + # velo_ori = box3d[i, 6] + # velocity = ( + # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) + box = CustomNuscenesBox( + center=box_gravity_center[i], + size=box_dims[i], + orientation=quat, + fut_trajs=trajs[i], + label=labels[i], + score=scores[i], + velocity=velocity) + box_list.append(box) + return box_list + + +def lidar_nusc_box_to_global(info, + boxes, + classes, + eval_configs, + eval_version='detection_cvpr_2019'): + """Convert the box from ego to global coordinate. + + Args: + info (dict): Info for a specific sample data, including the + calibration information. + boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. + classes (list[str]): Mapped classes in the evaluation. + eval_configs (object): Evaluation configuration object. + eval_version (str): Evaluation version. + Default: 'detection_cvpr_2019' + + Returns: + list: List of standard NuScenesBoxes in the global + coordinate. + """ + box_list = [] + for box in boxes: + # Move box to ego vehicle coord system + box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) + box.translate(np.array(info['lidar2ego_translation'])) + # filter det in ego. + cls_range_x_map = eval_configs.class_range_x + cls_range_y_map = eval_configs.class_range_y + x_distance, y_distance = box.center[0], box.center[1] + det_range_x = cls_range_x_map[classes[box.label]] + det_range_y = cls_range_y_map[classes[box.label]] + if abs(x_distance) > det_range_x or abs(y_distance) > det_range_y: + continue + # Move box to global coord system + box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) + box.translate(np.array(info['ego2global_translation'])) + box_list.append(box) + return box_list + +def output_to_vecs(detection): + box3d = detection['map_boxes_3d'].numpy() + scores = detection['map_scores_3d'].numpy() + labels = detection['map_labels_3d'].numpy() + pts = detection['map_pts_3d'].numpy() + + vec_list = [] + # import pdb;pdb.set_trace() + for i in range(box3d.shape[0]): + vec = dict( + bbox = box3d[i], # xyxy + label=labels[i], + score=scores[i], + pts=pts[i], + ) + vec_list.append(vec) + return vec_list \ No newline at end of file diff --git a/mmcv/datasets/nuscnes_eval.py b/mmcv/datasets/nuscnes_eval.py new file mode 100644 index 0000000..2b14535 --- /dev/null +++ b/mmcv/datasets/nuscnes_eval.py @@ -0,0 +1,756 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +import torch +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionConfig +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.data_classes import DetectionBox +from 
nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +from nuscenes.eval.common.loaders import load_prediction, add_center_dist, filter_eval_boxes +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +from torchvision.transforms.functional import rotate +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import json +import os +import random +import time +from typing import Tuple, Dict, Any + +import numpy as np + +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.constants import TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ + DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from mmcv.core.bbox import BboxOverlaps3D +from IPython import embed +import json +from typing import Any + +import numpy as np +from matplotlib import pyplot as plt + +from nuscenes import NuScenes +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList +from nuscenes.utils.data_classes import LidarPointCloud +from nuscenes.utils.geometry_utils import view_points + + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. 
+ if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. """ + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. 
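+
+    Example (a minimal sketch with synthetic values; the intrinsic matrix and
+    box pose below are illustrative, not taken from a real nuScenes sample)::
+
+        import numpy as np
+        from pyquaternion import Quaternion
+        from nuscenes.utils.data_classes import Box
+        from nuscenes.utils.geometry_utils import BoxVisibility
+
+        box = Box(center=[0.0, 0.0, 10.0], size=[1.8, 4.5, 1.6],
+                  orientation=Quaternion(axis=[0, 0, 1], angle=0.0))
+        intrinsic = np.array([[1266.4, 0.0, 816.3],
+                              [0.0, 1266.4, 491.5],
+                              [0.0, 0.0, 1.0]])
+        # The box center sits 10 m straight ahead of the camera, so its
+        # projection lands near the middle of a 1600x900 image.
+        assert center_in_image(box, intrinsic, (1600, 900), vis_level=BoxVisibility.ANY)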
+ """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). 
+ assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. + tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
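+    # `total` counts boxes before filtering; `anns_filter` counts the boxes
+    # whose annotation token is contained in `id` and is therefore kept.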
+ total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. 
+                new_box.translate(-np.array(cs_record['translation']))
+                new_box.rotate(Quaternion(cs_record['rotation']).inverse)
+
+                if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+                    count += 1
+                # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY):
+                #     count += 1
+
+            if count > 1:
+                with open('center_overlap.txt', 'a') as f:
+                    try:
+                        f.write(box.token + '\n')
+                    except Exception:
+                        # Prediction boxes may not carry an annotation token.
+                        pass
+                filtered_boxes.append(box)
+        anns_filter += len(filtered_boxes)
+        eval_boxes.boxes[sample_token] = filtered_boxes
+
+    if verbose:
+        print("=> Original number of boxes: %d" % total)
+        print("=> After anns based filtering: %d" % anns_filter)
+
+    return eval_boxes
+
+
+class NuScenesEval_custom(NuScenesEval):
+    """
+    Custom nuScenes detection evaluation that adds overlap, visibility and
+    frame-index based filtering on top of the standard DetectionEval.
+    """
+
+    def __init__(self,
+                 nusc: NuScenes,
+                 config: DetectionConfig,
+                 result_path: str,
+                 eval_set: str,
+                 output_dir: str = None,
+                 verbose: bool = True,
+                 overlap_test=False,
+                 eval_mask=False,
+                 data_infos=None
+                 ):
+        """
+        Initialize a NuScenesEval_custom object.
+        :param nusc: A NuScenes object.
+        :param config: A DetectionConfig object.
+        :param result_path: Path of the nuScenes JSON result file.
+        :param eval_set: The dataset split to evaluate on, e.g. train, val or test.
+        :param output_dir: Folder to save plots and results to.
+        :param verbose: Whether to print to stdout.
+        :param overlap_test: Whether to keep only boxes whose center is visible in more than one camera.
+        """
+
+        self.nusc = nusc
+        self.result_path = result_path
+        self.eval_set = eval_set
+        self.output_dir = output_dir
+        self.verbose = verbose
+        self.cfg = config
+        self.overlap_test = overlap_test
+        self.eval_mask = eval_mask
+        self.data_infos = data_infos
+        # Check result file exists.
+        assert os.path.exists(result_path), 'Error: The result file does not exist!'
+
+        # Make dirs.
+        self.plot_dir = os.path.join(self.output_dir, 'plots')
+        if not os.path.isdir(self.output_dir):
+            os.makedirs(self.output_dir)
+        if not os.path.isdir(self.plot_dir):
+            os.makedirs(self.plot_dir)
+
+        # Load data.
+        if verbose:
+            print('Initializing nuScenes detection evaluation')
+        self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox,
+                                                     verbose=verbose)
+        self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose)
+
+        assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \
+            "Samples in split doesn't match samples in predictions."
+
+        # Add center distances.
+        # self.pred_boxes = add_center_dist(nusc, self.pred_boxes)
+        # self.gt_boxes = add_center_dist(nusc, self.gt_boxes)
+
+        # Filter boxes (distance, points per box, etc.). 
+ + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. + :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. 
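+            # traffic_cone has no meaningful attribute/velocity/orientation error and
+            # barrier has no meaningful attribute/velocity error, so those TP metrics are set to NaN.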
+ for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. + parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' 
+ 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/mmcv/datasets/pipelines/__init__.py b/mmcv/datasets/pipelines/__init__.py new file mode 100644 index 0000000..04e195d --- /dev/null +++ b/mmcv/datasets/pipelines/__init__.py @@ -0,0 +1,50 @@ +from .compose import Compose +from .formating import (Collect, Collect3D, DefaultFormatBundle, DefaultFormatBundle3D, + CustomDefaultFormatBundle3D, ImageToTensor, + ToDataContainer, ToTensor, Transpose, to_tensor,VADFormatBundle3D) +from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam, + LoadMultiChannelImageFromFiles, LoadProposals, + LoadAnnotations3D, LoadImageFromFileMono3D, + LoadMultiViewImageFromFiles, LoadPointsFromFile, + LoadPointsFromMultiSweeps, NormalizePointsColor, + PointSegClassMapping, LoadAnnotations3D_E2E, CustomLoadPointsFromMultiSweeps, CustomLoadPointsFromFile) +from .test_time_aug import MultiScaleFlipAug, MultiScaleFlipAug3D +from .transforms_3d import (BackgroundPointsFilter, GlobalAlignment, + GlobalRotScaleTrans, IndoorPatchPointSample, + IndoorPointSample, ObjectNameFilter, ObjectNoise, + ObjectRangeFilter, ObjectSample, PointSample, + PointShuffle, PointsRangeFilter, + RandomDropPointsColor, RandomFlip3D, + RandomJitterPoints, VoxelBasedPointSampler, + PadMultiViewImage, NormalizeMultiviewImage, + PhotoMetricDistortionMultiViewImage, CustomCollect3D, + RandomScaleImageMultiViewImage,VADObjectRangeFilter,VADObjectNameFilter,CustomPointsRangeFilter) +from .transforms import (Albu, CutOut, Expand, MinIoURandomCrop, Normalize, + Pad, PhotoMetricDistortion, RandomCenterCropPad, + RandomCrop, RandomFlip, RandomShift, Resize, + SegRescale) +from .occflow_label import GenerateOccFlowLabels + +# __all__ = [ +# 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', +# 'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations', +# 'LoadImageFromFile', 'LoadImageFromWebcam', +# 'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug', +# 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale', +# 'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu', +# 
'InstaBoost', 'RandomCenterCropPad', 'AutoAugment', 'CutOut', 'Shear', +# 'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform', +# 'ContrastTransform', 'Translate', 'RandomShift', +# 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', +# 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D', +# 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile', +# 'DefaultFormatBundle3D', 'DataBaseSampler', +# 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample', +# 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D', +# 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', +# 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', +# 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', +# 'RandomJitterPoints', 'CustomDefaultFormatBundle3D', 'LoadAnnotations3D_E2E', +# 'GenerateOccFlowLabels', 'PadMultiViewImage', 'NormalizeMultiviewImage', +# 'PhotoMetricDistortionMultiViewImage', 'CustomCollect3D', 'RandomScaleImageMultiViewImage' +# ] diff --git a/mmcv/datasets/pipelines/compose.py b/mmcv/datasets/pipelines/compose.py new file mode 100644 index 0000000..1567530 --- /dev/null +++ b/mmcv/datasets/pipelines/compose.py @@ -0,0 +1,51 @@ +import collections + +from mmcv.utils import build_from_cfg + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Compose: + """Compose multiple transforms sequentially. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. + """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict') + + def __call__(self, data): + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += f' {t}' + format_string += '\n)' + return format_string diff --git a/mmcv/datasets/pipelines/data_augment_utils.py b/mmcv/datasets/pipelines/data_augment_utils.py new file mode 100644 index 0000000..231ab80 --- /dev/null +++ b/mmcv/datasets/pipelines/data_augment_utils.py @@ -0,0 +1,409 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numba +import numpy as np +import warnings +from numba.errors import NumbaPerformanceWarning + +from mmcv.core.bbox import box_np_ops + +warnings.filterwarnings('ignore', category=NumbaPerformanceWarning) + + +@numba.njit +def _rotation_box2d_jit_(corners, angle, rot_mat_T): + """Rotate 2D boxes. + + Args: + corners (np.ndarray): Corners of boxes. + angle (float): Rotation angle. + rot_mat_T (np.ndarray): Transposed rotation matrix. + """ + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + corners[:] = corners @ rot_mat_T + + +@numba.jit(nopython=True) +def box_collision_test(boxes, qboxes, clockwise=True): + """Box collision test. + + Args: + boxes (np.ndarray): Corners of current boxes. 
+ qboxes (np.ndarray): Boxes to be avoid colliding. + clockwise (bool): Whether the corners are in clockwise order. + Default: True. + """ + N = boxes.shape[0] + K = qboxes.shape[0] + ret = np.zeros((N, K), dtype=np.bool_) + slices = np.array([1, 2, 3, 0]) + lines_boxes = np.stack((boxes, boxes[:, slices, :]), + axis=2) # [N, 4, 2(line), 2(xy)] + lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) + # vec = np.zeros((2,), dtype=boxes.dtype) + boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) + qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) + for i in range(N): + for j in range(K): + # calculate standup first + iw = ( + min(boxes_standup[i, 2], qboxes_standup[j, 2]) - + max(boxes_standup[i, 0], qboxes_standup[j, 0])) + if iw > 0: + ih = ( + min(boxes_standup[i, 3], qboxes_standup[j, 3]) - + max(boxes_standup[i, 1], qboxes_standup[j, 1])) + if ih > 0: + for k in range(4): + for box_l in range(4): + A = lines_boxes[i, k, 0] + B = lines_boxes[i, k, 1] + C = lines_qboxes[j, box_l, 0] + D = lines_qboxes[j, box_l, 1] + acd = (D[1] - A[1]) * (C[0] - + A[0]) > (C[1] - A[1]) * ( + D[0] - A[0]) + bcd = (D[1] - B[1]) * (C[0] - + B[0]) > (C[1] - B[1]) * ( + D[0] - B[0]) + if acd != bcd: + abc = (C[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + C[0] - A[0]) + abd = (D[1] - A[1]) * (B[0] - A[0]) > ( + B[1] - A[1]) * ( + D[0] - A[0]) + if abc != abd: + ret[i, j] = True # collision. + break + if ret[i, j] is True: + break + if ret[i, j] is False: + # now check complete overlap. + # box overlap qbox: + box_overlap_qbox = True + for box_l in range(4): # point l in qboxes + for k in range(4): # corner k in boxes + vec = boxes[i, k] - boxes[i, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + boxes[i, k, 0] - qboxes[j, box_l, 0]) + cross -= vec[0] * ( + boxes[i, k, 1] - qboxes[j, box_l, 1]) + if cross >= 0: + box_overlap_qbox = False + break + if box_overlap_qbox is False: + break + + if box_overlap_qbox is False: + qbox_overlap_box = True + for box_l in range(4): # point box_l in boxes + for k in range(4): # corner k in qboxes + vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] + if clockwise: + vec = -vec + cross = vec[1] * ( + qboxes[j, k, 0] - boxes[i, box_l, 0]) + cross -= vec[0] * ( + qboxes[j, k, 1] - boxes[i, box_l, 1]) + if cross >= 0: # + qbox_overlap_box = False + break + if qbox_overlap_box is False: + break + if qbox_overlap_box: + ret[i, j] = True # collision. + else: + ret[i, j] = True # collision. + return ret + + +@numba.njit +def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): + """Add noise to every box (only on the horizontal plane). + + Args: + boxes (np.ndarray): Input boxes with shape (N, 5). + valid_mask (np.ndarray): Mask to indicate which boxes are valid + with shape (N). + loc_noises (np.ndarray): Location noises with shape (N, M, 3). + rot_noises (np.ndarray): Rotation noises with shape (N, M). + + Returns: + np.ndarray: Mask to indicate whether the noise is + added successfully (pass the collision test). 
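+
+    Example (an illustrative sketch; the box sizes and noise magnitudes below
+    are made up for demonstration)::
+
+        num_boxes, num_tries = 2, 10
+        boxes = np.array([[0.0, 0.0, 2.0, 4.0, 0.0],
+                          [10.0, 0.0, 2.0, 4.0, 0.0]])  # (x, y, dx, dy, yaw)
+        valid_mask = np.ones(num_boxes, dtype=np.bool_)
+        loc_noises = np.random.normal(scale=0.5, size=(num_boxes, num_tries, 3))
+        rot_noises = np.random.uniform(-0.3, 0.3, size=(num_boxes, num_tries))
+        # success[i] is the index of the first candidate noise that passes the
+        # collision test for box i, or -1 if every candidate collides.
+        success = noise_per_box(boxes, valid_mask, loc_noises, rot_noises)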
+ """ + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + # print(valid_mask) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_corners[:] = box_corners[i] + current_corners -= boxes[i, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], + rot_mat_T) + current_corners += boxes[i, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + # print(coll_mat) + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + break + return success_mask + + +@numba.njit +def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, + global_rot_noises): + """Add noise to every box (only on the horizontal plane). Version 2 used + when enable global rotations. + + Args: + boxes (np.ndarray): Input boxes with shape (N, 5). + valid_mask (np.ndarray): Mask to indicate which boxes are valid + with shape (N). + loc_noises (np.ndarray): Location noises with shape (N, M, 3). + rot_noises (np.ndarray): Rotation noises with shape (N, M). + + Returns: + np.ndarray: Mask to indicate whether the noise is + added successfully (pass the collision test). + """ + num_boxes = boxes.shape[0] + num_tests = loc_noises.shape[1] + box_corners = box_np_ops.box2d_to_corner_jit(boxes) + current_corners = np.zeros((4, 2), dtype=boxes.dtype) + current_box = np.zeros((1, 5), dtype=boxes.dtype) + rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) + dst_pos = np.zeros((2, ), dtype=boxes.dtype) + success_mask = -np.ones((num_boxes, ), dtype=np.int64) + corners_norm = np.zeros((4, 2), dtype=boxes.dtype) + corners_norm[1, 1] = 1.0 + corners_norm[2] = 1.0 + corners_norm[3, 0] = 1.0 + corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) + corners_norm = corners_norm.reshape(4, 2) + for i in range(num_boxes): + if valid_mask[i]: + for j in range(num_tests): + current_box[0, :] = boxes[i] + current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) + current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) + dst_grot = current_grot + global_rot_noises[i, j] + dst_pos[0] = current_radius * np.sin(dst_grot) + dst_pos[1] = current_radius * np.cos(dst_grot) + current_box[0, :2] = dst_pos + current_box[0, -1] += (dst_grot - current_grot) + + rot_sin = np.sin(current_box[0, -1]) + rot_cos = np.cos(current_box[0, -1]) + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + current_corners[:] = current_box[ + 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] + current_corners -= current_box[0, :2] + _rotation_box2d_jit_(current_corners, rot_noises[i, j], + rot_mat_T) + current_corners += current_box[0, :2] + loc_noises[i, j, :2] + coll_mat = box_collision_test( + current_corners.reshape(1, 4, 2), box_corners) + coll_mat[0, i] = False + if not coll_mat.any(): + success_mask[i] = j + box_corners[i] = current_corners + loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) + rot_noises[i, j] += (dst_grot - current_grot) + break + return success_mask + + +def _select_transform(transform, indices): + """Select transform. + + Args: + transform (np.ndarray): Transforms to select from. + indices (np.ndarray): Mask to indicate which transform to select. + + Returns: + np.ndarray: Selected transforms. 
+ """ + result = np.zeros((transform.shape[0], *transform.shape[2:]), + dtype=transform.dtype) + for i in range(transform.shape[0]): + if indices[i] != -1: + result[i] = transform[i, indices[i]] + return result + + +@numba.njit +def _rotation_matrix_3d_(rot_mat_T, angle, axis): + """Get the 3D rotation matrix. + + Args: + rot_mat_T (np.ndarray): Transposed rotation matrix. + angle (float): Rotation angle. + axis (int): Rotation axis. + """ + rot_sin = np.sin(angle) + rot_cos = np.cos(angle) + rot_mat_T[:] = np.eye(3) + if axis == 1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 2] = -rot_sin + rot_mat_T[2, 0] = rot_sin + rot_mat_T[2, 2] = rot_cos + elif axis == 2 or axis == -1: + rot_mat_T[0, 0] = rot_cos + rot_mat_T[0, 1] = -rot_sin + rot_mat_T[1, 0] = rot_sin + rot_mat_T[1, 1] = rot_cos + elif axis == 0: + rot_mat_T[1, 1] = rot_cos + rot_mat_T[1, 2] = -rot_sin + rot_mat_T[2, 1] = rot_sin + rot_mat_T[2, 2] = rot_cos + + +@numba.njit +def points_transform_(points, centers, point_masks, loc_transform, + rot_transform, valid_mask): + """Apply transforms to points and box centers. + + Args: + points (np.ndarray): Input points. + centers (np.ndarray): Input box centers. + point_masks (np.ndarray): Mask to indicate which points need + to be transformed. + loc_transform (np.ndarray): Location transform to be applied. + rot_transform (np.ndarray): Rotation transform to be applied. + valid_mask (np.ndarray): Mask to indicate which boxes are valid. + """ + num_box = centers.shape[0] + num_points = points.shape[0] + rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) + for i in range(num_box): + _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) + for i in range(num_points): + for j in range(num_box): + if valid_mask[j]: + if point_masks[i, j] == 1: + points[i, :3] -= centers[j, :3] + points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] + points[i, :3] += centers[j, :3] + points[i, :3] += loc_transform[j] + break # only apply first box's transform + + +@numba.njit +def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): + """Transform 3D boxes. + + Args: + boxes (np.ndarray): 3D boxes to be transformed. + loc_transform (np.ndarray): Location transform to be applied. + rot_transform (np.ndarray): Rotation transform to be applied. + valid_mask (np.ndarray | None): Mask to indicate which boxes are valid. + """ + num_box = boxes.shape[0] + for i in range(num_box): + if valid_mask[i]: + boxes[i, :3] += loc_transform[i] + boxes[i, 6] += rot_transform[i] + + +def noise_per_object_v3_(gt_boxes, + points=None, + valid_mask=None, + rotation_perturb=np.pi / 4, + center_noise_std=1.0, + global_random_rot_range=np.pi / 4, + num_try=100): + """Random rotate or remove each groundtruth independently. use kitti viewer + to test this function points_transform_ + + Args: + gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7). + points (np.ndarray | None): Input point cloud with shape (M, 4). + Default: None. + valid_mask (np.ndarray | None): Mask to indicate which boxes are valid. + Default: None. + rotation_perturb (float): Rotation perturbation. Default: pi / 4. + center_noise_std (float): Center noise standard deviation. + Default: 1.0. + global_random_rot_range (float): Global random rotation range. + Default: pi/4. + num_try (int): Number of try. Default: 100. 
+ """ + num_boxes = gt_boxes.shape[0] + if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): + rotation_perturb = [-rotation_perturb, rotation_perturb] + if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): + global_random_rot_range = [ + -global_random_rot_range, global_random_rot_range + ] + enable_grot = np.abs(global_random_rot_range[0] - + global_random_rot_range[1]) >= 1e-3 + + if not isinstance(center_noise_std, (list, tuple, np.ndarray)): + center_noise_std = [ + center_noise_std, center_noise_std, center_noise_std + ] + if valid_mask is None: + valid_mask = np.ones((num_boxes, ), dtype=np.bool_) + center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) + + loc_noises = np.random.normal( + scale=center_noise_std, size=[num_boxes, num_try, 3]) + rot_noises = np.random.uniform( + rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try]) + gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) + grot_lowers = global_random_rot_range[0] - gt_grots + grot_uppers = global_random_rot_range[1] - gt_grots + global_rot_noises = np.random.uniform( + grot_lowers[..., np.newaxis], + grot_uppers[..., np.newaxis], + size=[num_boxes, num_try]) + + origin = (0.5, 0.5, 0) + gt_box_corners = box_np_ops.center_to_corner_box3d( + gt_boxes[:, :3], + gt_boxes[:, 3:6], + gt_boxes[:, 6], + origin=origin, + axis=2) + + # TODO: rewrite this noise box function? + if not enable_grot: + selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises) + else: + selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]], + valid_mask, loc_noises, rot_noises, + global_rot_noises) + + loc_transforms = _select_transform(loc_noises, selected_noise) + rot_transforms = _select_transform(rot_noises, selected_noise) + surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) + if points is not None: + # TODO: replace this points_in_convex function by my tools? + point_masks = box_np_ops.points_in_convex_polygon_3d_jit( + points[:, :3], surfaces) + points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms, + rot_transforms, valid_mask) + + box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) diff --git a/mmcv/datasets/pipelines/formating.py b/mmcv/datasets/pipelines/formating.py new file mode 100644 index 0000000..a7b3e61 --- /dev/null +++ b/mmcv/datasets/pipelines/formating.py @@ -0,0 +1,700 @@ +from collections.abc import Sequence + +import numpy as np +import torch +from mmcv.parallel import DataContainer as DC + +from mmcv.core.bbox.structures.base_box3d import BaseInstance3DBoxes +from mmcv.core.points import BasePoints +from mmcv.utils import is_str +from ..builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. 
+ """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PIPELINES.register_module() +class ToTensor: + """Convert some results to :obj:`torch.Tensor` by given keys. + + Args: + keys (Sequence[str]): Keys that need to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert data in results to :obj:`torch.Tensor`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted + to :obj:`torch.Tensor`. + """ + for key in self.keys: + results[key] = to_tensor(results[key]) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class ImageToTensor: + """Convert image to :obj:`torch.Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and transposed to (C, H, W) order. + """ + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + results[key] = to_tensor(img.transpose(2, 0, 1)) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class Transpose: + """Transpose some results by given keys. + + Args: + keys (Sequence[str]): Keys of results to be transposed. + order (Sequence[int]): Order of transpose. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def __call__(self, results): + """Call function to transpose the channel order of data in results. + + Args: + results (dict): Result dict contains the data to transpose. + + Returns: + dict: The result dict contains the data transposed to \ + ``self.order``. + """ + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, order={self.order})' + + +@PIPELINES.register_module() +class ToDataContainer: + """Convert results to :obj:`mmcv.DataContainer` by given fields. + + Args: + fields (Sequence[dict]): Each field is a dict like + ``dict(key='xxx', **kwargs)``. The ``key`` in result will + be converted to :obj:`mmcv.DataContainer` with ``**kwargs``. + Default: ``(dict(key='img', stack=True), dict(key='gt_bboxes'), + dict(key='gt_labels'))``. + """ + + def __init__(self, + fields=(dict(key='img', stack=True), dict(key='gt_bboxes'), + dict(key='gt_labels'))): + self.fields = fields + + def __call__(self, results): + """Call function to convert data in results to + :obj:`mmcv.DataContainer`. 
+ + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted to \ + :obj:`mmcv.DataContainer`. + """ + + for field in self.fields: + field = field.copy() + key = field.pop('key') + results[key] = DC(results[key], **field) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(fields={self.fields})' + + +@PIPELINES.register_module() +class DefaultFormatBundle: + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if 'img' in results: + img = results['img'] + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in ['proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels']: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. + """ + img = results['img'] + results.setdefault('pad_shape', img.shape) + results.setdefault('scale_factor', 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + 'img_norm_cfg', + dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False)) + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect: + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - "img_shape": shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. 
+ + - "scale_factor": a float indicating the preprocessing scale + + - "flip": a boolean indicating if image flip transform was used + + - "filename": path to the image file + + - "ori_shape": original shape of the image as a tuple (h, w, c) + + - "pad_shape": image shape after padding + + - "img_norm_cfg": a dict of normalization information: + + - mean - per channel mean subtraction + - std - per channel std divisor + - to_rgb - bool indicating if bgr was converted to rgb + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')`` + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:mmcv.DataContainer. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + + - keys in``self.keys`` + - ``img_metas`` + """ + + data = {} + img_meta = {} + for key in self.meta_keys: + img_meta[key] = results[key] + data['img_metas'] = DC(img_meta, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + +@PIPELINES.register_module() +class WrapFieldsToLists: + """Wrap fields of the data dictionary into lists for evaluation. + + This class can be used as a last step of a test or validation + pipeline for single image evaluation or inference. + + Example: + >>> test_pipeline = [ + >>> dict(type='LoadImageFromFile'), + >>> dict(type='Normalize', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + to_rgb=True), + >>> dict(type='Pad', size_divisor=32), + >>> dict(type='ImageToTensor', keys=['img']), + >>> dict(type='Collect', keys=['img']), + >>> dict(type='WrapFieldsToLists') + >>> ] + """ + + def __call__(self, results): + """Call function to wrap fields into lists. + + Args: + results (dict): Result dict contains the data to wrap. + + Returns: + dict: The result dict where value of ``self.keys`` are wrapped \ + into list. + """ + + # Wrap dict fields into lists + for key, val in results.items(): + results[key] = [val] + return results + + def __repr__(self): + return f'{self.__class__.__name__}()' + + +PIPELINES._module_dict.pop('DefaultFormatBundle') + +@PIPELINES.register_module() +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. 
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __init__(self, ): + return + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + if 'img' in results: + if isinstance(results['img'], list): + # process multiple imgs in single frame + imgs = [img.transpose(2, 0, 1) for img in results['img']] + imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) + results['img'] = DC(to_tensor(imgs), stack=True) + else: + img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + for key in [ + 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', + 'gt_labels_3d', 'attr_labels', 'pts_instance_mask', + 'pts_semantic_mask', 'centers2d', 'depths' + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = DC([to_tensor(res) for res in results[key]]) + else: + results[key] = DC(to_tensor(results[key])) + if 'gt_bboxes_3d' in results: + if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): + results['gt_bboxes_3d'] = DC( + results['gt_bboxes_3d'], cpu_only=True) + else: + results['gt_bboxes_3d'] = DC( + to_tensor(results['gt_bboxes_3d'])) + + if 'gt_masks' in results: + results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) + if 'gt_semantic_seg' in results: + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) + + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect3D(object): + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. 
+ - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + data = {} + img_metas = {} + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data['img_metas'] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + +@PIPELINES.register_module() +class DefaultFormatBundle3D(DefaultFormatBundle): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. 
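As a hedged usage sketch, Collect3D is typically the last step of a pipeline; the snippet below only uses arguments defined by the classes above, and the surrounding step and class names are placeholders rather than this repo's actual configs.

train_pipeline_tail = [
    dict(type='DefaultFormatBundle3D', class_names=['car', 'pedestrian']),  # illustrative classes
    dict(type='Collect3D',
         keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'],
         meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
                    'pad_shape', 'scale_factor', 'img_norm_cfg')),
]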
+ + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __init__(self, class_names, with_gt=True, with_label=True): + super(DefaultFormatBundle3D, self).__init__() + self.class_names = class_names + self.with_gt = with_gt + self.with_label = with_label + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + # Format 3D data + if 'points' in results: + assert isinstance(results['points'], BasePoints) + results['points'] = DC(results['points'].tensor) + + for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: + if key not in results: + continue + results[key] = DC(to_tensor(results[key]), stack=False) + + if self.with_gt: + # Clean GT bboxes in the final + if 'gt_bboxes_3d_mask' in results: + gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] + results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ + gt_bboxes_3d_mask] + if 'gt_names_3d' in results: + results['gt_names_3d'] = results['gt_names_3d'][ + gt_bboxes_3d_mask] + if 'centers2d' in results: + results['centers2d'] = results['centers2d'][ + gt_bboxes_3d_mask] + if 'depths' in results: + results['depths'] = results['depths'][gt_bboxes_3d_mask] + if 'gt_bboxes_mask' in results: + gt_bboxes_mask = results['gt_bboxes_mask'] + if 'gt_bboxes' in results: + results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] + results['gt_names'] = results['gt_names'][gt_bboxes_mask] + if self.with_label: + if 'gt_names' in results and len(results['gt_names']) == 0: + results['gt_labels'] = np.array([], dtype=np.int64) + results['attr_labels'] = np.array([], dtype=np.int64) + elif 'gt_names' in results and isinstance( + results['gt_names'][0], list): + # gt_labels might be a list of list in multi-view setting + results['gt_labels'] = [ + np.array([self.class_names.index(n) for n in res], + dtype=np.int64) for res in results['gt_names'] + ] + elif 'gt_names' in results: + results['gt_labels'] = np.array([ + self.class_names.index(n) for n in results['gt_names'] + ], + dtype=np.int64) + # we still assume one pipeline for one frame LiDAR + # thus, the 3D name is list[string] + if 'gt_names_3d' in results: + results['gt_labels_3d'] = np.array([ + self.class_names.index(n) + for n in results['gt_names_3d'] + ], + dtype=np.int64) + results = super(DefaultFormatBundle3D, self).__call__(results) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(class_names={self.class_names}, ' + repr_str += f'with_gt={self.with_gt}, with_label={self.with_label})' + return repr_str + +@PIPELINES.register_module() +class CustomDefaultFormatBundle3D(DefaultFormatBundle3D): + """Default formatting bundle. + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. 
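The gt_names to gt_labels_3d conversion performed above is just an index lookup against `class_names`; a tiny sketch with made-up class names:

import numpy as np

class_names = ['car', 'truck', 'pedestrian']          # illustrative
gt_names_3d = np.array(['car', 'pedestrian', 'car'])
gt_labels_3d = np.array([class_names.index(n) for n in gt_names_3d], dtype=np.int64)
print(gt_labels_3d)  # [0 2 0]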
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + Args: + results (dict): Result dict contains the data to convert. + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + # Format 3D data + results = super(CustomDefaultFormatBundle3D, self).__call__(results) + results['gt_map_masks'] = DC( + to_tensor(results['gt_map_masks']), stack=True) + + return results + +@PIPELINES.register_module() +class VADFormatBundle3D(DefaultFormatBundle3D): + """Default formatting bundle. + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + def __init__(self, class_names, with_gt=True, with_label=True, with_ego=True): + super(VADFormatBundle3D, self).__init__(class_names, with_gt, with_label) + self.with_ego = with_ego + + + def __call__(self, results): + """Call function to transform and format common fields in results. + Args: + results (dict): Result dict contains the data to convert. + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + # Format 3D data + results = super(VADFormatBundle3D, self).__call__(results) + # results['gt_map_masks'] = DC(to_tensor(results['gt_map_masks']), stack=True) + if self.with_ego: + if 'ego_his_trajs' in results: + results['ego_his_trajs'] = DC(to_tensor(results['ego_his_trajs'][None, ...]), stack=True) + if 'ego_fut_trajs' in results: + results['ego_fut_trajs'] = DC(to_tensor(results['ego_fut_trajs'][None, ...]), stack=True) + if 'ego_fut_masks' in results: + results['ego_fut_masks'] = DC(to_tensor(results['ego_fut_masks'][None, None, ...]), stack=True) + if 'ego_fut_cmd' in results: + results['ego_fut_cmd'] = DC(to_tensor(results['ego_fut_cmd'][None, None, ...]), stack=True) + if 'ego_lcf_feat' in results: + results['ego_lcf_feat'] = DC(to_tensor(results['ego_lcf_feat'][None, None, ...]), stack=True) + if 'gt_attr_labels' in results: + results['gt_attr_labels'] = DC(to_tensor(results['gt_attr_labels']), cpu_only=False) + + return results + diff --git a/mmcv/datasets/pipelines/loading.py b/mmcv/datasets/pipelines/loading.py new file mode 100644 index 0000000..dbf494e --- /dev/null +++ b/mmcv/datasets/pipelines/loading.py @@ -0,0 +1,1709 @@ +import os +import os.path as osp +import torch +import mmcv +import numpy as np +import pycocotools.mask as maskUtils +from einops import rearrange +from mmcv.core.points import BasePoints, get_points_type +from mmcv.fileio.file_client import FileClient +from mmcv.image import imfrombytes, imread +from mmcv.utils import check_file_exist +from mmcv.core.mask.structures import BitmapMasks, PolygonMasks +# from mmcv.datasets.pipelines.loading import LoadAnnotations, LoadImageFromFile +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile: + """Load an image from file. 
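A small sketch of what the `[None, ...]` / `[None, None, ...]` indexing in VADFormatBundle3D above does before the values are wrapped in DataContainer: it prepends singleton batch dimensions. The trajectory shape used here is an illustrative assumption.

import numpy as np

ego_fut_trajs = np.zeros((6, 2))              # assumed (num_future_steps, xy)
print(ego_fut_trajs[None, ...].shape)          # (1, 6, 2)
print(ego_fut_trajs[None, None, ...].shape)    # (1, 1, 6, 2)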
+ + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename"). Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + to_float32=False, + color_type='color', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + + if results['img_prefix'] is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + + img_bytes = self.file_client.get(filename) + img = imfrombytes(img_bytes, flag=self.color_type) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['img_fields'] = ['img'] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class LoadImageFromWebcam(LoadImageFromFile): + """Load an image from webcam. + + Similar with :obj:`LoadImageFromFile`, but the image read from webcam is in + ``results['img']``. + """ + + def __call__(self, results): + """Call functions to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + img = results['img'] + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = None + results['ori_filename'] = None + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['img_fields'] = ['img'] + return results + + +@PIPELINES.register_module() +class LoadMultiChannelImageFromFiles: + """Load multi-channel images from a list of separate channel files. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename", which is expected to be a list of filenames). + Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. 
+ color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + to_float32=False, + color_type='unchanged', + file_client_args=dict(backend='disk')): + self.to_float32 = to_float32 + self.color_type = color_type + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load multiple images and get images meta + information. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded images and meta information. + """ + + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + + if results['img_prefix'] is not None: + filename = [ + osp.join(results['img_prefix'], fname) + for fname in results['img_info']['filename'] + ] + else: + filename = results['img_info']['filename'] + + img = [] + for name in filename: + img_bytes = self.file_client.get(name) + img.append(imfrombytes(img_bytes, flag=self.color_type)) + img = np.stack(img, axis=-1) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32}, ' + f"color_type='{self.color_type}', " + f'file_client_args={self.file_client_args})') + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations: + """Load multiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__(self, + with_bbox=True, + with_label=True, + with_mask=False, + with_seg=False, + poly2mask=True, + file_client_args=dict(backend='disk')): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. 
+ """ + + ann_info = results['ann_info'] + results['gt_bboxes'] = ann_info['bboxes'].copy() + + gt_bboxes_ignore = ann_info.get('bboxes_ignore', None) + if gt_bboxes_ignore is not None: + results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy() + results['bbox_fields'].append('gt_bboxes_ignore') + results['bbox_fields'].append('gt_bboxes') + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results['gt_labels'] = results['ann_info']['labels'].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann['counts'], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. + If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results['img_info']['height'], results['img_info']['width'] + gt_masks = results['ann_info']['masks'] + if self.poly2mask: + gt_masks = BitmapMasks( + [self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks( + [self.process_polygons(polygons) for polygons in gt_masks], h, + w) + results['gt_masks'] = gt_masks + results['mask_fields'].append('gt_masks') + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. + """ + + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + + filename = osp.join(results['seg_prefix'], + results['ann_info']['seg_map']) + img_bytes = self.file_client.get(filename) + results['gt_semantic_seg'] = imfrombytes( + img_bytes, flag='unchanged').squeeze() + results['seg_fields'].append('gt_semantic_seg') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. 
+ + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(with_bbox={self.with_bbox}, ' + repr_str += f'with_label={self.with_label}, ' + repr_str += f'with_mask={self.with_mask}, ' + repr_str += f'with_seg={self.with_seg}, ' + repr_str += f'poly2mask={self.poly2mask}, ' + repr_str += f'poly2mask={self.file_client_args})' + return repr_str + + +@PIPELINES.register_module() +class LoadProposals: + """Load proposal pipeline. + + Required key is "proposals". Updated keys are "proposals", "bbox_fields". + + Args: + num_max_proposals (int, optional): Maximum number of proposals to load. + If not specified, all proposals will be loaded. + """ + + def __init__(self, num_max_proposals=None): + self.num_max_proposals = num_max_proposals + + def __call__(self, results): + """Call function to load proposals from file. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded proposal annotations. + """ + + proposals = results['proposals'] + if proposals.shape[1] not in (4, 5): + raise AssertionError( + 'proposals should have shapes (n, 4) or (n, 5), ' + f'but found {proposals.shape}') + proposals = proposals[:, :4] + + if self.num_max_proposals is not None: + proposals = proposals[:self.num_max_proposals] + + if len(proposals) == 0: + proposals = np.array([[0, 0, 0, 0]], dtype=np.float32) + results['proposals'] = proposals + results['bbox_fields'].append('proposals') + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(num_max_proposals={self.num_max_proposals})' + + +@PIPELINES.register_module() +class FilterAnnotations: + """Filter invalid annotations. + + Args: + min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth + boxes. + """ + + def __init__(self, min_gt_bbox_wh): + # TODO: add more filter options + self.min_gt_bbox_wh = min_gt_bbox_wh + + def __call__(self, results): + assert 'gt_bboxes' in results + gt_bboxes = results['gt_bboxes'] + w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + keep = (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1]) + if not keep.any(): + return None + else: + keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg') + for key in keys: + if key in results: + results[key] = results[key][keep] + return results + + +@PIPELINES.register_module() +class LoadMultiViewImageFromFiles(object): + """Load multi channel images from a list of separate channel files. + + Expects results['img_filename'] to be a list of filenames. + + Args: + to_float32 (bool): Whether to convert the img to float32. + Defaults to False. + color_type (str): Color type of the file. Defaults to 'unchanged'. + """ + + def __init__(self, to_float32=False, color_type='unchanged'): + self.to_float32 = to_float32 + self.color_type = color_type + + def __call__(self, results): + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. 
\ + Added keys and values are described below. + + - filename (str): Multi-view image filenames. + - img (np.ndarray): Multi-view image arrays. + - img_shape (tuple[int]): Shape of multi-view image arrays. + - ori_shape (tuple[int]): Shape of original image arrays. + - pad_shape (tuple[int]): Shape of padded image arrays. + - scale_factor (float): Scale factor. + - img_norm_cfg (dict): Normalization configuration of images. + """ + filename = results['img_filename'] + # img is of shape (h, w, c, num_views) + img = np.stack( + [imread(name, self.color_type) for name in filename], axis=-1) + if self.to_float32: + img = img.astype(np.float32) + results['filename'] = filename + # unravel to list, see `DefaultFormatBundle` in formating.py + # which will transpose each image separately and then stack into array + results['img'] = [img[..., i] for i in range(img.shape[-1])] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(to_float32={self.to_float32}, ' + repr_str += f"color_type='{self.color_type}')" + return repr_str + + +@PIPELINES.register_module() +class LoadImageFromFileMono3D(LoadImageFromFile): + """Load an image from file in monocular 3D object detection. Compared to 2D + detection, additional camera parameters need to be loaded. + + Args: + kwargs (dict): Arguments are the same as those in \ + :class:`LoadImageFromFile`. + """ + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmcv.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + super().__call__(results) + results['cam2img'] = results['img_info']['cam_intrinsic'] + return results + + +@PIPELINES.register_module() +class LoadPointsFromMultiSweeps(object): + """Load points from multiple sweeps. + + This is usually used for nuScenes dataset to utilize previous sweeps. + + Args: + sweeps_num (int): Number of sweeps. Defaults to 10. + load_dim (int): Dimension number of the loaded points. Defaults to 5. + use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. + file_client_args (dict): Config dict of file clients, refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. Defaults to dict(backend='disk'). + pad_empty_sweeps (bool): Whether to repeat keyframe when + sweeps is empty. Defaults to False. + remove_close (bool): Whether to remove close points. + Defaults to False. + test_mode (bool): If test_model=True used for testing, it will not + randomly sample sweeps but select the nearest N frames. + Defaults to False. 
+ """ + + def __init__(self, + sweeps_num=10, + load_dim=5, + use_dim=[0, 1, 2, 4], + file_client_args=dict(backend='disk'), + pad_empty_sweeps=False, + remove_close=False, + test_mode=False): + self.load_dim = load_dim + self.sweeps_num = sweeps_num + self.use_dim = use_dim + self.file_client_args = file_client_args.copy() + self.file_client = None + self.pad_empty_sweeps = pad_empty_sweeps + self.remove_close = remove_close + self.test_mode = test_mode + + def _load_points(self, pts_filename): + """Private function to load point clouds data. + + Args: + pts_filename (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + try: + pts_bytes = self.file_client.get(pts_filename) + points = np.frombuffer(pts_bytes, dtype=np.float32) + except ConnectionError: + check_file_exist(pts_filename) + if pts_filename.endswith('.npy'): + points = np.load(pts_filename) + else: + points = np.fromfile(pts_filename, dtype=np.float32) + return points + + def _remove_close(self, points, radius=1.0): + """Removes point too close within a certain radius from origin. + + Args: + points (np.ndarray | :obj:`BasePoints`): Sweep points. + radius (float): Radius below which points are removed. + Defaults to 1.0. + + Returns: + np.ndarray: Points after removing. + """ + if isinstance(points, np.ndarray): + points_numpy = points + elif isinstance(points, BasePoints): + points_numpy = points.tensor.numpy() + else: + raise NotImplementedError + x_filt = np.abs(points_numpy[:, 0]) < radius + y_filt = np.abs(points_numpy[:, 1]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + return points[not_close] + + def __call__(self, results): + """Call function to load multi-sweep point clouds from files. + + Args: + results (dict): Result dict containing multi-sweep point cloud \ + filenames. + + Returns: + dict: The result dict containing the multi-sweep points data. \ + Added key and value are described below. + + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \ + cloud arrays. 
+ """ + points = results['points'] + points.tensor[:, 4] = 0 + sweep_points_list = [points] + ts = results['timestamp'] + if self.pad_empty_sweeps and len(results['sweeps']) == 0: + for i in range(self.sweeps_num): + if self.remove_close: + sweep_points_list.append(self._remove_close(points)) + else: + sweep_points_list.append(points) + else: + if len(results['sweeps']) <= self.sweeps_num: + choices = np.arange(len(results['sweeps'])) + elif self.test_mode: + choices = np.arange(self.sweeps_num) + else: + choices = np.random.choice( + len(results['sweeps']), self.sweeps_num, replace=False) + for idx in choices: + sweep = results['sweeps'][idx] + points_sweep = self._load_points(sweep['data_path']) + points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) + if self.remove_close: + points_sweep = self._remove_close(points_sweep) + sweep_ts = sweep['timestamp'] / 1e6 + points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ + 'sensor2lidar_rotation'].T + points_sweep[:, :3] += sweep['sensor2lidar_translation'] + points_sweep[:, 4] = ts - sweep_ts + points_sweep = points.new_point(points_sweep) + sweep_points_list.append(points_sweep) + + points = points.cat(sweep_points_list) + points = points[:, self.use_dim] + results['points'] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})' + + +@PIPELINES.register_module() +class PointSegClassMapping(object): + """Map original semantic class to valid category ids. + + Map valid classes as 0~len(valid_cat_ids)-1 and + others as len(valid_cat_ids). + + Args: + valid_cat_ids (tuple[int]): A tuple of valid category. + max_cat_id (int): The max possible cat_id in input segmentation mask. + Defaults to 40. + """ + + def __init__(self, valid_cat_ids, max_cat_id=40): + assert max_cat_id >= np.max(valid_cat_ids), \ + 'max_cat_id should be greater than maximum id in valid_cat_ids' + + self.valid_cat_ids = valid_cat_ids + self.max_cat_id = int(max_cat_id) + + # build cat_id to class index mapping + neg_cls = len(valid_cat_ids) + self.cat_id2class = np.ones( + self.max_cat_id + 1, dtype=np.int) * neg_cls + for cls_idx, cat_id in enumerate(valid_cat_ids): + self.cat_id2class[cat_id] = cls_idx + + def __call__(self, results): + """Call function to map original semantic class to valid category ids. + + Args: + results (dict): Result dict containing point semantic masks. + + Returns: + dict: The result dict containing the mapped category ids. \ + Updated key and value are described below. + + - pts_semantic_mask (np.ndarray): Mapped semantic masks. + """ + assert 'pts_semantic_mask' in results + pts_semantic_mask = results['pts_semantic_mask'] + + converted_pts_sem_mask = self.cat_id2class[pts_semantic_mask] + + results['pts_semantic_mask'] = converted_pts_sem_mask + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(valid_cat_ids={self.valid_cat_ids}, ' + repr_str += f'max_cat_id={self.max_cat_id})' + return repr_str + + +@PIPELINES.register_module() +class NormalizePointsColor(object): + """Normalize color of points. + + Args: + color_mean (list[float]): Mean color of the point cloud. + """ + + def __init__(self, color_mean): + self.color_mean = color_mean + + def __call__(self, results): + """Call function to normalize color of points. + + Args: + results (dict): Result dict containing point clouds data. 
+ + Returns: + dict: The result dict containing the normalized points. \ + Updated key and value are described below. + + - points (:obj:`BasePoints`): Points after color normalization. + """ + points = results['points'] + assert points.attribute_dims is not None and \ + 'color' in points.attribute_dims.keys(), \ + 'Expect points have color attribute' + if self.color_mean is not None: + points.color = points.color - \ + points.color.new_tensor(self.color_mean) + points.color = points.color / 255.0 + results['points'] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(color_mean={self.color_mean})' + return repr_str + + +@PIPELINES.register_module() +class LoadPointsFromFile(object): + """Load Points From File. + + Load sunrgbd and scannet points from file. + + Args: + coord_type (str): The type of coordinates of points cloud. + Available options includes: + - 'LIDAR': Points in LiDAR coordinates. + - 'DEPTH': Points in depth coordinates, usually for indoor dataset. + - 'CAMERA': Points in camera coordinates. + load_dim (int): The dimension of the loaded points. + Defaults to 6. + use_dim (list[int]): Which dimensions of the points to be used. + Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 + or use_dim=[0, 1, 2, 3] to use the intensity dimension. + shift_height (bool): Whether to use shifted height. Defaults to False. + use_color (bool): Whether to use color features. Defaults to False. + file_client_args (dict): Config dict of file clients, refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. Defaults to dict(backend='disk'). + """ + + def __init__(self, + coord_type, + load_dim=6, + use_dim=[0, 1, 2], + shift_height=False, + use_color=False, + file_client_args=dict(backend='disk')): + self.shift_height = shift_height + self.use_color = use_color + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + assert max(use_dim) < load_dim, \ + f'Expect all used dimensions < {load_dim}, got {use_dim}' + assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH'] + + self.coord_type = coord_type + self.load_dim = load_dim + self.use_dim = use_dim + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_points(self, pts_filename): + """Private function to load point clouds data. + + Args: + pts_filename (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + try: + pts_bytes = self.file_client.get(pts_filename) + points = np.frombuffer(pts_bytes, dtype=np.float32) + except ConnectionError: + check_file_exist(pts_filename) + if pts_filename.endswith('.npy'): + points = np.load(pts_filename) + else: + points = np.fromfile(pts_filename, dtype=np.float32) + + return points + + def __call__(self, results): + """Call function to load points data from file. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the point clouds data. \ + Added key and value are described below. + + - points (:obj:`BasePoints`): Point clouds data. 
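The raw decoding convention above (a flat float32 buffer reshaped to (N, load_dim) and sliced to use_dim) in a standalone sketch; `load_dim=5` and the channel meaning are illustrative assumptions, and the arange buffer stands in for `np.fromfile(..., dtype=np.float32)`.

import numpy as np

load_dim, use_dim = 5, [0, 1, 2, 3]            # assumed: x, y, z, intensity, (ring/time)
raw = np.arange(20, dtype=np.float32)          # placeholder for the decoded .bin buffer
points = raw.reshape(-1, load_dim)[:, use_dim]
print(points.shape)                            # (4, 4)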
+ """ + pts_filename = results['pts_filename'] + points = self._load_points(pts_filename) + points = points.reshape(-1, self.load_dim) + points = points[:, self.use_dim] + attribute_dims = None + + if self.shift_height: + floor_height = np.percentile(points[:, 2], 0.99) + height = points[:, 2] - floor_height + points = np.concatenate( + [points[:, :3], + np.expand_dims(height, 1), points[:, 3:]], 1) + attribute_dims = dict(height=3) + + if self.use_color: + assert len(self.use_dim) >= 6 + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict(color=[ + points.shape[1] - 3, + points.shape[1] - 2, + points.shape[1] - 1, + ])) + + points_class = get_points_type(self.coord_type) + points = points_class( + points, points_dim=points.shape[-1], attribute_dims=attribute_dims) + results['points'] = points + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + '(' + repr_str += f'shift_height={self.shift_height}, ' + repr_str += f'use_color={self.use_color}, ' + repr_str += f'file_client_args={self.file_client_args}, ' + repr_str += f'load_dim={self.load_dim}, ' + repr_str += f'use_dim={self.use_dim})' + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations3D(LoadAnnotations): + """Load Annotations3D. + + Load instance mask and semantic mask of points and + encapsulate the items into related fields. + + Args: + with_bbox_3d (bool, optional): Whether to load 3D boxes. + Defaults to True. + with_label_3d (bool, optional): Whether to load 3D labels. + Defaults to True. + with_attr_label (bool, optional): Whether to load attribute label. + Defaults to False. + with_mask_3d (bool, optional): Whether to load 3D instance masks. + for points. Defaults to False. + with_seg_3d (bool, optional): Whether to load 3D semantic masks. + for points. Defaults to False. + with_bbox (bool, optional): Whether to load 2D boxes. + Defaults to False. + with_label (bool, optional): Whether to load 2D labels. + Defaults to False. + with_mask (bool, optional): Whether to load 2D instance masks. + Defaults to False. + with_seg (bool, optional): Whether to load 2D semantic masks. + Defaults to False. + with_bbox_depth (bool, optional): Whether to load 2.5D boxes. + Defaults to False. + poly2mask (bool, optional): Whether to convert polygon annotations + to bitmasks. Defaults to True. + seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks. + Defaults to int64 + file_client_args (dict): Config dict of file clients, refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. + """ + + def __init__(self, + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False, + with_mask_3d=False, + with_seg_3d=False, + with_bbox=False, + with_label=False, + with_mask=False, + with_seg=False, + with_bbox_depth=False, + poly2mask=True, + seg_3d_dtype='int', + file_client_args=dict(backend='disk')): + super().__init__( + with_bbox, + with_label, + with_mask, + with_seg, + poly2mask, + file_client_args=file_client_args) + self.with_bbox_3d = with_bbox_3d + self.with_bbox_depth = with_bbox_depth + self.with_label_3d = with_label_3d + self.with_attr_label = with_attr_label + self.with_mask_3d = with_mask_3d + self.with_seg_3d = with_seg_3d + self.seg_3d_dtype = seg_3d_dtype + + def _load_bboxes_3d(self, results): + """Private function to load 3D bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. 
+ + Returns: + dict: The dict containing loaded 3D bounding box annotations. + """ + results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d'] + results['bbox3d_fields'].append('gt_bboxes_3d') + return results + + def _load_bboxes_depth(self, results): + """Private function to load 2.5D bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 2.5D bounding box annotations. + """ + results['centers2d'] = results['ann_info']['centers2d'] + results['depths'] = results['ann_info']['depths'] + return results + + def _load_labels_3d(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing loaded label annotations. + """ + results['gt_labels_3d'] = results['ann_info']['gt_labels_3d'] + return results + + def _load_attr_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing loaded label annotations. + """ + results['attr_labels'] = results['ann_info']['attr_labels'] + return results + + def _load_masks_3d(self, results): + """Private function to load 3D mask annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D mask annotations. + """ + pts_instance_mask_path = results['ann_info']['pts_instance_mask_path'] + + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + try: + mask_bytes = self.file_client.get(pts_instance_mask_path) + pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int) + except ConnectionError: + check_file_exist(pts_instance_mask_path) + pts_instance_mask = np.fromfile( + pts_instance_mask_path, dtype=np.long) + + results['pts_instance_mask'] = pts_instance_mask + results['pts_mask_fields'].append('pts_instance_mask') + return results + + def _load_semantic_seg_3d(self, results): + """Private function to load 3D semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing the semantic segmentation annotations. + """ + pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path'] + + if self.file_client is None: + self.file_client = FileClient(**self.file_client_args) + try: + mask_bytes = self.file_client.get(pts_semantic_mask_path) + # add .copy() to fix read-only bug + pts_semantic_mask = np.frombuffer( + mask_bytes, dtype=self.seg_3d_dtype).copy() + except ConnectionError: + check_file_exist(pts_semantic_mask_path) + pts_semantic_mask = np.fromfile( + pts_semantic_mask_path, dtype=np.long) + + results['pts_semantic_mask'] = pts_semantic_mask + results['pts_seg_fields'].append('pts_semantic_mask') + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D bounding box, label, mask and + semantic segmentation annotations. 
+ """ + results = super().__call__(results) + if self.with_bbox_3d: + results = self._load_bboxes_3d(results) + if results is None: + return None + if self.with_bbox_depth: + results = self._load_bboxes_depth(results) + if results is None: + return None + if self.with_label_3d: + results = self._load_labels_3d(results) + if self.with_attr_label: + results = self._load_attr_labels(results) + if self.with_mask_3d: + results = self._load_masks_3d(results) + if self.with_seg_3d: + results = self._load_semantic_seg_3d(results) + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + indent_str = ' ' + repr_str = self.__class__.__name__ + '(\n' + repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, ' + repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, ' + repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, ' + repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, ' + repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, ' + repr_str += f'{indent_str}with_bbox={self.with_bbox}, ' + repr_str += f'{indent_str}with_label={self.with_label}, ' + repr_str += f'{indent_str}with_mask={self.with_mask}, ' + repr_str += f'{indent_str}with_seg={self.with_seg}, ' + repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, ' + repr_str += f'{indent_str}poly2mask={self.poly2mask})' + return repr_str + +@PIPELINES.register_module() +class LoadMultiViewImageFromFilesInCeph(object): + """Load multi channel images from a list of separate channel files. + + Expects results['img_filename'] to be a list of filenames. + + Args: + to_float32 (bool): Whether to convert the img to float32. + Defaults to False. + color_type (str): Color type of the file. Defaults to 'unchanged'. + """ + + def __init__(self, to_float32=False, color_type='unchanged', file_client_args=dict(backend='disk'), img_root=''): + self.to_float32 = to_float32 + self.color_type = color_type + self.file_client_args = file_client_args.copy() + self.file_client = FileClient(**self.file_client_args) + self.img_root = img_root + + def __call__(self, results): + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. \ + Added keys and values are described below. + + - filename (list of str): Multi-view image filenames. + - img (np.ndarray): Multi-view image arrays. + - img_shape (tuple[int]): Shape of multi-view image arrays. + - ori_shape (tuple[int]): Shape of original image arrays. + - pad_shape (tuple[int]): Shape of padded image arrays. + - scale_factor (float): Scale factor. + - img_norm_cfg (dict): Normalization configuration of images. 
+ """ + images_multiView = [] + filename = results['img_filename'] + for img_path in filename: + # img_path = os.path.join(self.img_root, img_path) + if self.file_client_args['backend'] == 'petrel': + img_bytes = self.file_client.get(img_path) + img = imfrombytes(img_bytes) + elif self.file_client_args['backend'] == 'disk': + img = imread(img_path, self.color_type) + images_multiView.append(img) + # img is of shape (h, w, c, num_views) + img = np.stack( + #[mmcv.imread(name, self.color_type) for name in filename], axis=-1) + images_multiView, axis=-1) + if self.to_float32: + img = img.astype(np.float32) + results['filename'] = filename + # unravel to list, see `DefaultFormatBundle` in formating.py + # which will transpose each image separately and then stack into array + results['img'] = [img[..., i] for i in range(img.shape[-1])] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(to_float32={self.to_float32}, ' + repr_str += f"color_type='{self.color_type}')" + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations3D_E2E(LoadAnnotations3D): + """Load Annotations3D. + + Load instance mask and semantic mask of points and + encapsulate the items into related fields. + + Args: + with_bbox_3d (bool, optional): Whether to load 3D boxes. + Defaults to True. + with_label_3d (bool, optional): Whether to load 3D labels. + Defaults to True. + with_attr_label (bool, optional): Whether to load attribute label. + Defaults to False. + with_mask_3d (bool, optional): Whether to load 3D instance masks. + for points. Defaults to False. + with_seg_3d (bool, optional): Whether to load 3D semantic masks. + for points. Defaults to False. + with_bbox (bool, optional): Whether to load 2D boxes. + Defaults to False. + with_label (bool, optional): Whether to load 2D labels. + Defaults to False. + with_mask (bool, optional): Whether to load 2D instance masks. + Defaults to False. + with_seg (bool, optional): Whether to load 2D semantic masks. + Defaults to False. + with_bbox_depth (bool, optional): Whether to load 2.5D boxes. + Defaults to False. + poly2mask (bool, optional): Whether to convert polygon annotations + to bitmasks. Defaults to True. + seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks. + Defaults to int64 + file_client_args (dict): Config dict of file clients, refer to + https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py + for more details. + """ + def __init__(self, + with_future_anns=False, + with_ins_inds_3d=False, + with_vis_token=True, + ins_inds_add_1=False, # NOTE: make ins_inds start from 1, not 0 + **kwargs): + super().__init__(**kwargs) + self.with_future_anns = with_future_anns + self.with_ins_inds_3d = with_ins_inds_3d + self.with_vis_token = with_vis_token + self.ins_inds_add_1 = ins_inds_add_1 + + def _load_future_anns(self, results): + """Private function to load 3D bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmcv3d.CustomDataset`. + + Returns: + dict: The dict containing loaded 3D bounding box annotations. 
+ """ + + gt_bboxes_3d = [] + gt_labels_3d = [] + gt_inds_3d = [] + # gt_valid_flags = [] + gt_vis_tokens = [] + + for ann_info in results['occ_future_ann_infos']: + if ann_info is not None: + gt_bboxes_3d.append(ann_info['gt_bboxes_3d']) + gt_labels_3d.append(ann_info['gt_labels_3d']) + + ann_gt_inds = ann_info['gt_inds'] + if self.ins_inds_add_1: + ann_gt_inds += 1 + # NOTE: sdc query is changed from -10 -> -9 + gt_inds_3d.append(ann_gt_inds) + + # gt_valid_flags.append(ann_info['gt_valid_flag']) + if self.with_vis_token: + gt_vis_tokens.append(ann_info['gt_vis_tokens']) + else: + # invalid frame + gt_bboxes_3d.append(None) + gt_labels_3d.append(None) + gt_inds_3d.append(None) + # gt_valid_flags.append(None) + if self.with_vis_token: + gt_vis_tokens.append(None) + + results['future_gt_bboxes_3d'] = gt_bboxes_3d + # results['future_bbox3d_fields'].append('gt_bboxes_3d') # Field is used for augmentations, not needed here + results['future_gt_labels_3d'] = gt_labels_3d + results['future_gt_inds'] = gt_inds_3d + # results['future_gt_valid_flag'] = gt_valid_flags + if self.with_vis_token: + results['future_gt_vis_tokens'] = gt_vis_tokens + + return results + + def _load_ins_inds_3d(self, results): + ann_gt_inds = results['ann_info']['gt_inds'].copy() # TODO: note here + + # NOTE: Avoid gt_inds generated twice + results['ann_info'].pop('gt_inds') + + if self.ins_inds_add_1: + ann_gt_inds += 1 + results['gt_inds'] = ann_gt_inds + return results + + def __call__(self, results): + results = super().__call__(results) + + if self.with_future_anns: + results = self._load_future_anns(results) + if self.with_ins_inds_3d: + results = self._load_ins_inds_3d(results) + + # Generate ann for plan + if 'occ_future_ann_infos_for_plan' in results.keys(): + results = self._load_future_anns_plan(results) + + return results + + def __repr__(self): + repr_str = super().__repr__() + indent_str = ' ' + repr_str += f'{indent_str}with_future_anns={self.with_future_anns}, ' + repr_str += f'{indent_str}with_ins_inds_3d={self.with_ins_inds_3d}, ' + + return repr_str + + +def load_augmented_point_cloud(path, virtual=False, reduce_beams=32): + # NOTE: following Tianwei's implementation, it is hard coded for nuScenes + points = np.fromfile(path, dtype=np.float32).reshape(-1, 5) + # NOTE: path definition different from Tianwei's implementation. + tokens = path.split("/") + vp_dir = "_VIRTUAL" if reduce_beams == 32 else f"_VIRTUAL_{reduce_beams}BEAMS" + seg_path = os.path.join( + *tokens[:-3], + "virtual_points", + tokens[-3], + tokens[-2] + vp_dir, + tokens[-1] + ".pkl.npy", + ) + assert os.path.exists(seg_path) + data_dict = np.load(seg_path, allow_pickle=True).item() + + virtual_points1 = data_dict["real_points"] + # NOTE: add zero reflectance to virtual points instead of removing them from real points + virtual_points2 = np.concatenate( + [ + data_dict["virtual_points"][:, :3], + np.zeros([data_dict["virtual_points"].shape[0], 1]), + data_dict["virtual_points"][:, 3:], + ], + axis=-1, + ) + + points = np.concatenate( + [ + points, + np.ones([points.shape[0], virtual_points1.shape[1] - points.shape[1] + 1]), + ], + axis=1, + ) + virtual_points1 = np.concatenate( + [virtual_points1, np.zeros([virtual_points1.shape[0], 1])], axis=1 + ) + # note: this part is different from Tianwei's implementation, we don't have duplicate foreground real points. 
+ if len(data_dict["real_points_indice"]) > 0: + points[data_dict["real_points_indice"]] = virtual_points1 + if virtual: + virtual_points2 = np.concatenate( + [virtual_points2, -1 * np.ones([virtual_points2.shape[0], 1])], axis=1 + ) + points = np.concatenate([points, virtual_points2], axis=0).astype(np.float32) + return points + + +def reduce_LiDAR_beams(pts, reduce_beams_to=32): + # print(pts.size()) + if isinstance(pts, np.ndarray): + pts = torch.from_numpy(pts) + radius = torch.sqrt(pts[:, 0].pow(2) + pts[:, 1].pow(2) + pts[:, 2].pow(2)) + sine_theta = pts[:, 2] / radius + # [-pi/2, pi/2] + theta = torch.asin(sine_theta) + phi = torch.atan2(pts[:, 1], pts[:, 0]) + + top_ang = 0.1862 + down_ang = -0.5353 + + beam_range = torch.zeros(32) + beam_range[0] = top_ang + beam_range[31] = down_ang + + for i in range(1, 31): + beam_range[i] = beam_range[i - 1] - 0.023275 + # beam_range = [1, 0.18, 0.15, 0.13, 0.11, 0.085, 0.065, 0.03, 0.01, -0.01, -0.03, -0.055, -0.08, -0.105, -0.13, -0.155, -0.18, -0.205, -0.228, -0.251, -0.275, + # -0.295, -0.32, -0.34, -0.36, -0.38, -0.40, -0.425, -0.45, -0.47, -0.49, -0.52, -0.54] + + num_pts, _ = pts.size() + mask = torch.zeros(num_pts) + if reduce_beams_to == 16: + for id in [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]: + beam_mask = (theta < (beam_range[id - 1] - 0.012)) * ( + theta > (beam_range[id] - 0.012) + ) + mask = mask + beam_mask + mask = mask.bool() + elif reduce_beams_to == 4: + for id in [7, 9, 11, 13]: + beam_mask = (theta < (beam_range[id - 1] - 0.012)) * ( + theta > (beam_range[id] - 0.012) + ) + mask = mask + beam_mask + mask = mask.bool() + # [?] pick the 14th beam + elif reduce_beams_to == 1: + chosen_beam_id = 9 + mask = (theta < (beam_range[chosen_beam_id - 1] - 0.012)) * ( + theta > (beam_range[chosen_beam_id] - 0.012) + ) + else: + raise NotImplementedError + # points = copy.copy(pts) + points = pts[mask] + # print(points.size()) + return points.numpy() + +@PIPELINES.register_module() +class CustomLoadPointsFromMultiSweeps: + """Load points from multiple sweeps. + + This is usually used for nuScenes dataset to utilize previous sweeps. + + Args: + sweeps_num (int): Number of sweeps. Defaults to 10. + load_dim (int): Dimension number of the loaded points. Defaults to 5. + use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4]. + pad_empty_sweeps (bool): Whether to repeat keyframe when + sweeps is empty. Defaults to False. + remove_close (bool): Whether to remove close points. + Defaults to False. + test_mode (bool): If test_model=True used for testing, it will not + randomly sample sweeps but select the nearest N frames. + Defaults to False. + """ + + def __init__( + self, + sweeps_num=10, + load_dim=5, + use_dim=[0, 1, 2, 4], + pad_empty_sweeps=False, + remove_close=False, + test_mode=False, + load_augmented=None, + reduce_beams=None, + ): + self.load_dim = load_dim + self.sweeps_num = sweeps_num + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + self.use_dim = use_dim + self.pad_empty_sweeps = pad_empty_sweeps + self.remove_close = remove_close + self.test_mode = test_mode + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. 
+ """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + return points + + def _remove_close(self, points, radius=1.0): + """Removes point too close within a certain radius from origin. + + Args: + points (np.ndarray | :obj:`BasePoints`): Sweep points. + radius (float): Radius below which points are removed. + Defaults to 1.0. + + Returns: + np.ndarray: Points after removing. + """ + if isinstance(points, np.ndarray): + points_numpy = points + elif isinstance(points, BasePoints): + points_numpy = points.tensor.numpy() + else: + raise NotImplementedError + x_filt = np.abs(points_numpy[:, 0]) < radius + y_filt = np.abs(points_numpy[:, 1]) < radius + not_close = np.logical_not(np.logical_and(x_filt, y_filt)) + return points[not_close] + + def __call__(self, results): + """Call function to load multi-sweep point clouds from files. + + Args: + results (dict): Result dict containing multi-sweep point cloud \ + filenames. + + Returns: + dict: The result dict containing the multi-sweep points data. \ + Added key and value are described below. + + - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point \ + cloud arrays. + """ + points = results["points"] + points.tensor[:, 4] = 0 + sweep_points_list = [points] + ts = results["timestamp"] / 1e6 + if self.pad_empty_sweeps and len(results["sweeps"]) == 0: + for i in range(self.sweeps_num): + if self.remove_close: + sweep_points_list.append(self._remove_close(points)) + else: + sweep_points_list.append(points) + else: + if len(results["sweeps"]) <= self.sweeps_num: + choices = np.arange(len(results["sweeps"])) + elif self.test_mode: + choices = np.arange(self.sweeps_num) + else: + # NOTE: seems possible to load frame -11? + if not self.load_augmented: + choices = np.random.choice( + len(results["sweeps"]), self.sweeps_num, replace=False + ) + else: + # don't allow to sample the earliest frame, match with Tianwei's implementation. + choices = np.random.choice( + len(results["sweeps"]) - 1, self.sweeps_num, replace=False + ) + for idx in choices: + sweep = results["sweeps"][idx] + points_sweep = self._load_points(sweep["data_path"]) + points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) + + # TODO: make it more general + if self.reduce_beams and self.reduce_beams < 32: + points_sweep = reduce_LiDAR_beams(points_sweep, self.reduce_beams) + + if self.remove_close: + points_sweep = self._remove_close(points_sweep) + sweep_ts = sweep["timestamp"] / 1e6 + points_sweep[:, :3] = ( + points_sweep[:, :3] @ sweep["sensor2lidar_rotation"].T + ) + points_sweep[:, :3] += sweep["sensor2lidar_translation"] + points_sweep[:, 4] = ts - sweep_ts + points_sweep = points.new_point(points_sweep) + sweep_points_list.append(points_sweep) + + points = points.cat(sweep_points_list) + points = points[:, self.use_dim] + results["points"] = points + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + return f"{self.__class__.__name__}(sweeps_num={self.sweeps_num})" + + + +@PIPELINES.register_module() +class CustomLoadPointsFromFile: + """Load Points From File. + + Load sunrgbd and scannet points from file. + + Args: + coord_type (str): The type of coordinates of points cloud. 
+ Available options includes: + - 'LIDAR': Points in LiDAR coordinates. + - 'DEPTH': Points in depth coordinates, usually for indoor dataset. + - 'CAMERA': Points in camera coordinates. + load_dim (int): The dimension of the loaded points. + Defaults to 6. + use_dim (list[int]): Which dimensions of the points to be used. + Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 + or use_dim=[0, 1, 2, 3] to use the intensity dimension. + shift_height (bool): Whether to use shifted height. Defaults to False. + use_color (bool): Whether to use color features. Defaults to False. + """ + + def __init__( + self, + coord_type, + load_dim=6, + use_dim=[0, 1, 2], + shift_height=False, + use_color=False, + load_augmented=None, + reduce_beams=None, + ): + self.shift_height = shift_height + self.use_color = use_color + if isinstance(use_dim, int): + use_dim = list(range(use_dim)) + assert ( + max(use_dim) < load_dim + ), f"Expect all used dimensions < {load_dim}, got {use_dim}" + assert coord_type in ["CAMERA", "LIDAR", "DEPTH"] + + self.coord_type = coord_type + self.load_dim = load_dim + self.use_dim = use_dim + self.load_augmented = load_augmented + self.reduce_beams = reduce_beams + + def _load_points(self, lidar_path): + """Private function to load point clouds data. + + Args: + lidar_path (str): Filename of point clouds data. + + Returns: + np.ndarray: An array containing point clouds data. + """ + mmcv.check_file_exist(lidar_path) + if self.load_augmented: + assert self.load_augmented in ["pointpainting", "mvp"] + virtual = self.load_augmented == "mvp" + points = load_augmented_point_cloud( + lidar_path, virtual=virtual, reduce_beams=self.reduce_beams + ) + elif lidar_path.endswith(".npy"): + points = np.load(lidar_path) + else: + points = np.fromfile(lidar_path, dtype=np.float32) + + return points + + def __call__(self, results): + """Call function to load points data from file. + + Args: + results (dict): Result dict containing point clouds data. + + Returns: + dict: The result dict containing the point clouds data. \ + Added key and value are described below. + + - points (:obj:`BasePoints`): Point clouds data. 
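+
+            A minimal pipeline entry using this transform might look like the
+            following (the dims shown are illustrative, not taken from a
+            config in this repo)::
+
+                dict(
+                    type='CustomLoadPointsFromFile',
+                    coord_type='LIDAR',
+                    load_dim=5,
+                    use_dim=5)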
+ """ + lidar_path = results["pts_filename"] + points = self._load_points(lidar_path) + points = points.reshape(-1, self.load_dim) + # TODO: make it more general + if self.reduce_beams and self.reduce_beams < 32: + points = reduce_LiDAR_beams(points, self.reduce_beams) + points = points[:, self.use_dim] + attribute_dims = None + + if self.shift_height: + floor_height = np.percentile(points[:, 2], 0.99) + height = points[:, 2] - floor_height + points = np.concatenate( + [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1 + ) + attribute_dims = dict(height=3) + + if self.use_color: + assert len(self.use_dim) >= 6 + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict( + color=[ + points.shape[1] - 3, + points.shape[1] - 2, + points.shape[1] - 1, + ] + ) + ) + + points_class = get_points_type(self.coord_type) + points = points_class( + points, points_dim=points.shape[-1], attribute_dims=attribute_dims + ) + results["points"] = points + + return results diff --git a/mmcv/datasets/pipelines/occflow_label.py b/mmcv/datasets/pipelines/occflow_label.py new file mode 100644 index 0000000..5ed8fe4 --- /dev/null +++ b/mmcv/datasets/pipelines/occflow_label.py @@ -0,0 +1,286 @@ +import torch +import numpy as np +import cv2 + +from mmcv.models.dense_heads.occ_head_plugin import calculate_birds_eye_view_parameters + +from mmcv.datasets.builder import PIPELINES +import os + +@PIPELINES.register_module() +class GenerateOccFlowLabels(object): + def __init__(self, grid_conf, ignore_index=255, only_vehicle=True, filter_invisible=True, deal_instance_255=False,all_classes = None,vehicle_classes = None,plan_classes = None): + self.grid_conf = grid_conf + self.bev_resolution, self.bev_start_position, self.bev_dimension = calculate_birds_eye_view_parameters( + grid_conf['xbound'], grid_conf['ybound'], grid_conf['zbound'], + ) + # convert numpy + self.bev_resolution = self.bev_resolution.numpy() + self.bev_start_position = self.bev_start_position.numpy() + self.bev_dimension = self.bev_dimension.numpy() + self.spatial_extent = (grid_conf['xbound'][1], grid_conf['ybound'][1]) + self.ignore_index = ignore_index + self.only_vehicle = only_vehicle + self.filter_invisible = filter_invisible + self.deal_instance_255 = deal_instance_255 + assert self.deal_instance_255 is False + + + if all_classes is None: + all_classes = ['car', 'truck', 'construction_vehicle', 'bus', 'trailer', + 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'] + if vehicle_classes is None: + vehicle_classes = ['car', 'bus', 'construction_vehicle', + 'bicycle', 'motorcycle', 'truck', 'trailer'] + if plan_classes is None: + plan_classes = vehicle_classes + ['pedestrian'] + + self.vehicle_cls_ids = np.array([all_classes.index( + cls_name) for cls_name in vehicle_classes]) + + self.plan_cls_ids = np.array([all_classes.index( + cls_name) for cls_name in plan_classes]) + + if only_vehicle: + self.filter_cls_ids = self.vehicle_cls_ids + else: + self.filter_cls_ids = self.plan_cls_ids + + def reframe_boxes(self, boxes, t_init, t_curr): + l2e_r_mat_curr = t_curr['l2e_r'] + l2e_t_curr = t_curr['l2e_t'] + e2g_r_mat_curr = t_curr['e2g_r'] + e2g_t_curr = t_curr['e2g_t'] + + l2e_r_mat_init = t_init['l2e_r'] + l2e_t_init = t_init['l2e_t'] + e2g_r_mat_init = t_init['e2g_r'] + e2g_t_init = t_init['e2g_t'] + + # to bbox under curr ego frame # TODO: Uncomment + boxes.rotate(l2e_r_mat_curr.T) + boxes.translate(l2e_t_curr) + + # to bbox under world frame + boxes.rotate(e2g_r_mat_curr.T) + boxes.translate(e2g_t_curr) 
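+
+        # The boxes are now expressed in the global (world) frame:
+        # frame-i lidar -> frame-i ego (l2e_*_curr) -> global (e2g_*_curr).
+        # The steps below apply the inverse transforms of the reference frame
+        # (t_init) to bring the boxes into the reference lidar frame.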
+ + # to bbox under initial ego frame, first inverse translate, then inverse rotate + boxes.translate(- e2g_t_init) + m1 = np.linalg.inv(e2g_r_mat_init) + boxes.rotate(m1.T) + + # to bbox under curr ego frame, first inverse translate, then inverse rotate + boxes.translate(- l2e_t_init) + m2 = np.linalg.inv(l2e_r_mat_init) + boxes.rotate(m2.T) + + return boxes + + def __call__(self, results): + """ + # Given lidar frame bboxes for curr frame and each future frame, + # generate segmentation, instance, centerness, offset, and fwd flow map + """ + # Avoid ignoring obj with index = self.ignore_index + SPECIAL_INDEX = -20 + + all_gt_bboxes_3d = results['future_gt_bboxes_3d'] + all_gt_labels_3d = results['future_gt_labels_3d'] + all_gt_inds = results['future_gt_inds'] + if 'future_gt_vis_tokens' in results.keys(): + all_vis_tokens = results['future_gt_vis_tokens'] + else: + all_vis_tokens = None + num_frame = len(all_gt_bboxes_3d) + + # motion related transforms, of seq lengths + l2e_r_mats = results['occ_l2e_r_mats'] + l2e_t_vecs = results['occ_l2e_t_vecs'] + e2g_r_mats = results['occ_e2g_r_mats'] + e2g_t_vecs = results['occ_e2g_t_vecs'] + + # reference frame transform + t_ref = dict(l2e_r=l2e_r_mats[0], l2e_t=l2e_t_vecs[0], e2g_r=e2g_r_mats[0], e2g_t=e2g_t_vecs[0]) + + segmentations = [] + instances = [] + gt_future_boxes = [] + gt_future_labels = [] + + # num_frame is 5 + for i in range(num_frame): + # bbox, label, index of curr frame + gt_bboxes_3d, gt_labels_3d = all_gt_bboxes_3d[i], all_gt_labels_3d[i] + ins_inds = all_gt_inds[i] + if all_vis_tokens is not None: + vis_tokens = all_vis_tokens[i] + else: + vis_tokens = None + + if gt_bboxes_3d is None: + # for invalid samples, no loss calculated + segmentation = np.ones( + (self.bev_dimension[1], self.bev_dimension[0])) * self.ignore_index + instance = np.ones( + (self.bev_dimension[1], self.bev_dimension[0])) * self.ignore_index + else: + # reframe bboxes to reference frame + t_curr = dict(l2e_r=l2e_r_mats[i], l2e_t=l2e_t_vecs[i], e2g_r=e2g_r_mats[i], e2g_t=e2g_t_vecs[i]) + ref_bboxes_3d = self.reframe_boxes(gt_bboxes_3d, t_ref, t_curr) + gt_future_boxes.append(ref_bboxes_3d) + gt_future_labels.append(gt_labels_3d) + + # for valid samples + segmentation = np.zeros( + (self.bev_dimension[1], self.bev_dimension[0])) + instance = np.zeros( + (self.bev_dimension[1], self.bev_dimension[0])) + + if self.only_vehicle: + vehicle_mask = np.isin(gt_labels_3d, self.filter_cls_ids) + ref_bboxes_3d = ref_bboxes_3d[vehicle_mask] + gt_labels_3d = gt_labels_3d[vehicle_mask] + ins_inds = ins_inds[vehicle_mask] + if vis_tokens is not None: + vis_tokens = vis_tokens[vehicle_mask] + + if self.filter_invisible: + assert vis_tokens is not None + visible_mask = (vis_tokens != 1) # obj are filtered out with visibility(1) between 0 and 40% + ref_bboxes_3d = ref_bboxes_3d[visible_mask] + gt_labels_3d = gt_labels_3d[visible_mask] + ins_inds = ins_inds[visible_mask] + + # valid sample and has objects + if len(ref_bboxes_3d.tensor) > 0: + bbox_corners = ref_bboxes_3d.corners[:, [ + 0, 3, 7, 4], :2].numpy() + bbox_corners = np.round( + (bbox_corners - self.bev_start_position[:2] + self.bev_resolution[:2] / 2.0) / self.bev_resolution[:2]).astype(np.int32) + + for index, gt_ind in enumerate(ins_inds): + if gt_ind == self.ignore_index: + gt_ind = SPECIAL_INDEX # 255 -> -20 + poly_region = bbox_corners[index] + + cv2.fillPoly(segmentation, [poly_region], 1.0) + cv2.fillPoly(instance, [poly_region], int(gt_ind)) + + segmentations.append(segmentation) + 
instances.append(instance) + + # segmentation = 1 where objects are located + segmentations = torch.from_numpy( + np.stack(segmentations, axis=0)).long() + instances = torch.from_numpy(np.stack(instances, axis=0)).long() + + # generate heatmap & offset from segmentation & instance + instance_centerness, instance_offset, instance_flow, instance_backward_flow = self.center_offset_flow( + instances, + all_gt_inds, + ignore_index=255, + ) + + invalid_mask = (segmentations[:, 0, 0] == self.ignore_index) + instance_centerness[invalid_mask] = self.ignore_index + + results['gt_occ_has_invalid_frame'] = results.pop('occ_has_invalid_frame') + results['gt_occ_img_is_valid'] = results.pop('occ_img_is_valid') + results.update({ + 'gt_segmentation': segmentations, + 'gt_instance': instances, + 'gt_centerness': instance_centerness, + 'gt_offset': instance_offset, + 'gt_flow': instance_flow, + 'gt_backward_flow': instance_backward_flow, + 'gt_future_boxes': gt_future_boxes, + 'gt_future_labels': gt_future_labels + }) + return results + + def center_offset_flow(self, instance_img, all_gt_inds, ignore_index=255, sigma=3.0): + seq_len, h, w = instance_img.shape + # heatmap + center_label = torch.zeros(seq_len, 1, h, w) + # offset from parts to centers + offset_label = ignore_index * torch.ones(seq_len, 2, h, w) + # future flow + future_displacement_label = ignore_index * torch.ones(seq_len, 2, h, w) + + # backward flow + backward_flow = ignore_index * torch.ones(seq_len, 2, h, w) + + # x is vertical displacement, y is horizontal displacement + x, y = torch.meshgrid(torch.arange(h, dtype=torch.float), + torch.arange(w, dtype=torch.float)) + + gt_inds_all = [] + for ins_inds_per_frame in all_gt_inds: + if ins_inds_per_frame is None: + continue + for ins_ind in ins_inds_per_frame: + gt_inds_all.append(ins_ind) + gt_inds_unique = np.unique(np.array(gt_inds_all)) + + # iterate over all instances across this sequence + for instance_id in gt_inds_unique: + instance_id = int(instance_id) + prev_xc = None + prev_yc = None + prev_mask = None + for t in range(seq_len): + instance_mask = (instance_img[t] == instance_id) + if instance_mask.sum() == 0: + # this instance is not in this frame + prev_xc = None + prev_yc = None + prev_mask = None + continue + + # the Bird-Eye-View center of the instance + xc = x[instance_mask].mean() + yc = y[instance_mask].mean() + + off_x = xc - x + off_y = yc - y + g = torch.exp(-(off_x ** 2 + off_y ** 2) / sigma ** 2) + center_label[t, 0] = torch.maximum(center_label[t, 0], g) + offset_label[t, 0, instance_mask] = off_x[instance_mask] + offset_label[t, 1, instance_mask] = off_y[instance_mask] + + if prev_xc is not None and instance_mask.sum() > 0: + delta_x = xc - prev_xc + delta_y = yc - prev_yc + future_displacement_label[t-1, 0, prev_mask] = delta_x + future_displacement_label[t-1, 1, prev_mask] = delta_y + backward_flow[t-1, 0, instance_mask] = -1 * delta_x + backward_flow[t-1, 1, instance_mask] = -1 * delta_y + + prev_xc = xc + prev_yc = yc + prev_mask = instance_mask + + return center_label, offset_label, future_displacement_label, backward_flow + + + def visualize_instances(self, instances, vis_root=''): + if vis_root is not None and vis_root != '': + os.makedirs(vis_root, exist_ok=True) + + for i, ins in enumerate(instances): + ins_c = ins.astype(np.uint8) + ins_c = cv2.applyColorMap(ins_c, cv2.COLORMAP_JET) + save_path = os.path.join(vis_root, '{}.png'.format(i)) + cv2.imwrite(save_path, ins_c) + + vid_path = os.path.join(vis_root, 'vid_ins.avi') + height, width = 
instances[0].shape + size = (height, width) + v_out = cv2.VideoWriter(vid_path, cv2.VideoWriter_fourcc(*'DIVX'), 4, size) + for i in range(len(instances)): + ins_c = instances[i].astype(np.uint8) + ins_c = cv2.applyColorMap(ins_c, cv2.COLORMAP_JET) + v_out.write(ins_c) + v_out.release() + return diff --git a/mmcv/datasets/pipelines/test_time_aug.py b/mmcv/datasets/pipelines/test_time_aug.py new file mode 100644 index 0000000..4c21d4e --- /dev/null +++ b/mmcv/datasets/pipelines/test_time_aug.py @@ -0,0 +1,233 @@ +import warnings + +from mmcv.utils import is_list_of +from copy import deepcopy +from ..builder import PIPELINES +from .compose import Compose + + +@PIPELINES.register_module() +class MultiScaleFlipAug: + """Test-time augmentation with multiple scales and flipping. + + An example configuration is as followed: + + .. code-block:: + + img_scale=[(1333, 400), (1333, 800)], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ] + + After MultiScaleFLipAug with above configuration, the results are wrapped + into lists of the same length as followed: + + .. code-block:: + + dict( + img=[...], + img_shape=[...], + scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] + flip=[False, True, False, True] + ... + ) + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple] | None): Images scales for resizing. + scale_factor (float | list[float] | None): Scale factors for resizing. + flip (bool): Whether apply flip augmentation. Default: False. + flip_direction (str | list[str]): Flip augmentation directions, + options are "horizontal", "vertical" and "diagonal". If + flip_direction is a list, multiple flip augmentations will be + applied. It has no effect when flip == False. Default: + "horizontal". + """ + + def __init__(self, + transforms, + img_scale=None, + scale_factor=None, + flip=False, + flip_direction='horizontal'): + self.transforms = Compose(transforms) + assert (img_scale is None) ^ (scale_factor is None), ( + 'Must have but only one variable can be setted') + if img_scale is not None: + self.img_scale = img_scale if isinstance(img_scale, + list) else [img_scale] + self.scale_key = 'scale' + assert is_list_of(self.img_scale, tuple) + else: + self.img_scale = scale_factor if isinstance( + scale_factor, list) else [scale_factor] + self.scale_key = 'scale_factor' + + self.flip = flip + self.flip_direction = flip_direction if isinstance( + flip_direction, list) else [flip_direction] + assert is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip + and not any([t['type'] == 'RandomFlip' for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to apply test time augment transforms on results. + + Args: + results (dict): Result dict contains the data to transform. + + Returns: + dict[str: list]: The augmented data, where each value is wrapped + into a list. 
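+
+            For example, with ``img_scale=[(1333, 800)]``, ``flip=True`` and
+            the default ``'horizontal'`` direction, each list holds two
+            entries: the original variant and the horizontally flipped one.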
+ """ + + aug_data = [] + flip_args = [(False, None)] + if self.flip: + flip_args += [(True, direction) + for direction in self.flip_direction] + for scale in self.img_scale: + for flip, direction in flip_args: + _results = results.copy() + _results[self.scale_key] = scale + _results['flip'] = flip + _results['flip_direction'] = direction + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' + repr_str += f'flip_direction={self.flip_direction})' + return repr_str + +@PIPELINES.register_module() +class MultiScaleFlipAug3D(object): + """Test-time augmentation with multiple scales and flipping. + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple]: Images scales for resizing. + pts_scale_ratio (float | list[float]): Points scale ratios for + resizing. + flip (bool): Whether apply flip augmentation. Defaults to False. + flip_direction (str | list[str]): Flip augmentation directions + for images, options are "horizontal" and "vertical". + If flip_direction is list, multiple flip augmentations will + be applied. It has no effect when ``flip == False``. + Defaults to "horizontal". + pcd_horizontal_flip (bool): Whether apply horizontal flip augmentation + to point cloud. Defaults to True. Note that it works only when + 'flip' is turned on. + pcd_vertical_flip (bool): Whether apply vertical flip augmentation + to point cloud. Defaults to True. Note that it works only when + 'flip' is turned on. + """ + + def __init__(self, + transforms, + img_scale, + pts_scale_ratio, + flip=False, + flip_direction='horizontal', + pcd_horizontal_flip=False, + pcd_vertical_flip=False): + self.transforms = Compose(transforms) + self.img_scale = img_scale if isinstance(img_scale, + list) else [img_scale] + self.pts_scale_ratio = pts_scale_ratio \ + if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)] + + assert is_list_of(self.img_scale, tuple) + assert is_list_of(self.pts_scale_ratio, float) + + self.flip = flip + self.pcd_horizontal_flip = pcd_horizontal_flip + self.pcd_vertical_flip = pcd_vertical_flip + + self.flip_direction = flip_direction if isinstance( + flip_direction, list) else [flip_direction] + assert is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip and not any([(t['type'] == 'RandomFlip3D' + or t['type'] == 'RandomFlip') + for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to augment common fields in results. + + Args: + results (dict): Result dict contains the data to augment. + + Returns: + dict: The result dict contains the data that is augmented with \ + different scales and flips. 
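+
+            Note that when ``flip`` is enabled only the flipped variant is
+            kept (``flip_aug = [True]``) in order to reduce the number of
+            test-time scenes; see the comment in the implementation below.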
+ """ + aug_data = [] + + # modified from `flip_aug = [False, True] if self.flip else [False]` + # to reduce unnecessary scenes when using double flip augmentation + # during test time + flip_aug = [True] if self.flip else [False] + pcd_horizontal_flip_aug = [False, True] \ + if self.flip and self.pcd_horizontal_flip else [False] + pcd_vertical_flip_aug = [False, True] \ + if self.flip and self.pcd_vertical_flip else [False] + for scale in self.img_scale: + for pts_scale_ratio in self.pts_scale_ratio: + for flip in flip_aug: + for pcd_horizontal_flip in pcd_horizontal_flip_aug: + for pcd_vertical_flip in pcd_vertical_flip_aug: + for direction in self.flip_direction: + # results.copy will cause bug + # since it is shallow copy + _results = deepcopy(results) + _results['scale'] = scale + _results['flip'] = flip + _results['pcd_scale_factor'] = \ + pts_scale_ratio + _results['flip_direction'] = direction + _results['pcd_horizontal_flip'] = \ + pcd_horizontal_flip + _results['pcd_vertical_flip'] = \ + pcd_vertical_flip + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' + repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, ' + repr_str += f'flip_direction={self.flip_direction})' + return repr_str + diff --git a/mmcv/datasets/pipelines/transforms.py b/mmcv/datasets/pipelines/transforms.py new file mode 100644 index 0000000..e7776cd --- /dev/null +++ b/mmcv/datasets/pipelines/transforms.py @@ -0,0 +1,1906 @@ +import copy +import inspect + +import numpy as np +from numpy import random + +from mmcv.core.mask.structures import PolygonMasks +from mmcv.core.evaluation.bbox_overlaps import bbox_overlaps +from mmcv.utils import is_list_of, is_str +from mmcv.image import imrescale, imresize, imflip, impad, impad_to_multiple, imnormalize, bgr2hsv, hsv2bgr +from ..builder import PIPELINES + +try: + from imagecorruptions import corrupt +except ImportError: + corrupt = None + +try: + import albumentations + from albumentations import Compose +except ImportError: + albumentations = None + Compose = None + + +@PIPELINES.register_module() +class Resize: + """Resize images & bbox & mask. + + This transform resizes the input image to some scale. Bboxes and masks are + then resized with the same scale factor. If the input dict contains the key + "scale", then the scale in the input dict is used, otherwise the specified + scale in the init method is used. If the input dict contains the key + "scale_factor" (if MultiScaleFlipAug does not give img_scale but + scale_factor), the actual scale will be computed by image shape and + scale_factor. + + `img_scale` can either be a tuple (single-scale) or a list of tuple + (multi-scale). There are 3 multiscale modes: + + - ``ratio_range is not None``: randomly sample a ratio from the ratio \ + range and multiply it with the image scale. + - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ + sample a scale from the multiscale range. + - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ + sample a scale from multiple scales. + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. 
+ multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + backend (str): Image resize backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + override (bool, optional): Whether to override `scale` and + `scale_factor` so as to call resize twice. Default False. If True, + after the first resizing, the existed `scale` and `scale_factor` + will be ignored so the second resizing can be allowed. + This option is a work-around for multiple times of resize in DETR. + Defaults to False. + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True, + bbox_clip_border=True, + backend='cv2', + override=False): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given a scale and a range of image ratio + assert len(self.img_scale) == 1 + else: + # mode 2: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.backend = backend + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + # TODO: refactor the override option in Resize + self.override = override + self.bbox_clip_border = bbox_clip_border + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ + where ``img_scale`` is the selected image scale and \ + ``scale_idx`` is the selected index in the given candidates. + """ + + assert is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and upper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where \ + ``img_scale`` is sampled scale and None is just a placeholder \ + to be consistent with :func:`random_select`. + """ + + assert is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. + ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. 
+ + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where \ + ``scale`` is sampled ratio multiplied with ``img_scale`` and \ + None is just a placeholder to be consistent with \ + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into \ + ``results``, which would be used by subsequent pipelines. + """ + + if self.ratio_range is not None: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + for key in results.get('img_fields', ['img']): + if self.keep_ratio: + img, scale_factor = imrescale( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results[key].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = imresize( + results[key], + results['scale'], + return_scale=True, + backend=self.backend) + results[key] = img + + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img_shape'] = img.shape + # in case that there is no padding + results['pad_shape'] = img.shape + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_bboxes(self, results): + """Resize bounding boxes with ``results['scale_factor']``.""" + for key in results.get('bbox_fields', []): + bboxes = results[key] * results['scale_factor'] + if self.bbox_clip_border: + img_shape = results['img_shape'] + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + results[key] = bboxes + + def _resize_masks(self, results): + """Resize masks with ``results['scale']``""" + for key in results.get('mask_fields', []): + if results[key] is None: + continue + if self.keep_ratio: + results[key] = results[key].rescale(results['scale']) + else: + results[key] = results[key].resize(results['img_shape'][:2]) + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = imrescale( + results[key], + results['scale'], + interpolation='nearest', + backend=self.backend) + else: + gt_seg = imresize( + results[key], + 
results['scale'], + interpolation='nearest', + backend=self.backend) + results['gt_semantic_seg'] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \ + 'keep_ratio' keys are added into result dict. + """ + + if 'scale' not in results: + if 'scale_factor' in results: + img_shape = results['img'].shape[:2] + scale_factor = results['scale_factor'] + assert isinstance(scale_factor, float) + results['scale'] = tuple( + [int(x * scale_factor) for x in img_shape][::-1]) + else: + self._random_scale(results) + else: + if not self.override: + assert 'scale_factor' not in results, ( + 'scale and scale_factor cannot be both set.') + else: + results.pop('scale') + if 'scale_factor' in results: + results.pop('scale_factor') + self._random_scale(results) + + self._resize_img(results) + self._resize_bboxes(results) + self._resize_masks(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(img_scale={self.img_scale}, ' + repr_str += f'multiscale_mode={self.multiscale_mode}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'keep_ratio={self.keep_ratio}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlip: + """Flip the image & bbox & mask. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + When random flip is enabled, ``flip_ratio``/``direction`` can either be a + float/string or tuple of float/string. There are 3 flip modes: + + - ``flip_ratio`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``flip_ratio`` . + E.g., ``flip_ratio=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + - ``flip_ratio`` is float, ``direction`` is list of string: the image wil + be ``direction[i]``ly flipped with probability of + ``flip_ratio/len(direction)``. + E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + - ``flip_ratio`` is list of float, ``direction`` is list of string: + given ``len(flip_ratio) == len(direction)``, the image wil + be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``. + E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with probability + of 0.3, vertically with probability of 0.5 + + Args: + flip_ratio (float | list[float], optional): The flipping probability. + Default: None. + direction(str | list[str], optional): The flipping direction. Options + are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'. + If input is a list, the length must equal ``flip_ratio``. Each + element in ``flip_ratio`` indicates the flip probability of + corresponding direction. 
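+
+    A typical single-direction setup (the values are shown purely as an
+    illustration):
+
+    .. code-block::
+
+        dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal')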
+ """ + + def __init__(self, flip_ratio=None, direction='horizontal'): + if isinstance(flip_ratio, list): + assert is_list_of(flip_ratio, float) + assert 0 <= sum(flip_ratio) <= 1 + elif isinstance(flip_ratio, float): + assert 0 <= flip_ratio <= 1 + elif flip_ratio is None: + pass + else: + raise ValueError('flip_ratios must be None, float, ' + 'or list of float') + self.flip_ratio = flip_ratio + + valid_directions = ['horizontal', 'vertical', 'diagonal'] + if isinstance(direction, str): + assert direction in valid_directions + elif isinstance(direction, list): + assert is_list_of(direction, str) + assert set(direction).issubset(set(valid_directions)) + else: + raise ValueError('direction must be either str or list of str') + self.direction = direction + + if isinstance(flip_ratio, list): + assert len(self.flip_ratio) == len(self.direction) + + def bbox_flip(self, bboxes, img_shape, direction): + """Flip bboxes horizontally. + + Args: + bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k) + img_shape (tuple[int]): Image shape (height, width) + direction (str): Flip direction. Options are 'horizontal', + 'vertical'. + + Returns: + numpy.ndarray: Flipped bounding boxes. + """ + + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.copy() + if direction == 'horizontal': + w = img_shape[1] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + elif direction == 'vertical': + h = img_shape[0] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + elif direction == 'diagonal': + w = img_shape[1] + h = img_shape[0] + flipped[..., 0::4] = w - bboxes[..., 2::4] + flipped[..., 1::4] = h - bboxes[..., 3::4] + flipped[..., 2::4] = w - bboxes[..., 0::4] + flipped[..., 3::4] = h - bboxes[..., 1::4] + else: + raise ValueError(f"Invalid flipping direction '{direction}'") + return flipped + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added \ + into result dict. 
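+
+            If 'flip' is already present in ``results`` (e.g. set by a
+            test-time augmentation wrapper such as ``MultiScaleFlipAug``),
+            that flag is used as-is and no new direction is sampled.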
+ """ + + if 'flip' not in results: + if isinstance(self.direction, list): + # None means non-flip + direction_list = self.direction + [None] + else: + # None means non-flip + direction_list = [self.direction, None] + + if isinstance(self.flip_ratio, list): + non_flip_ratio = 1 - sum(self.flip_ratio) + flip_ratio_list = self.flip_ratio + [non_flip_ratio] + else: + non_flip_ratio = 1 - self.flip_ratio + # exclude non-flip + single_ratio = self.flip_ratio / (len(direction_list) - 1) + flip_ratio_list = [single_ratio] * (len(direction_list) - + 1) + [non_flip_ratio] + + cur_dir = np.random.choice(direction_list, p=flip_ratio_list) + + results['flip'] = cur_dir is not None + if 'flip_direction' not in results: + results['flip_direction'] = cur_dir + if results['flip']: + # flip image + for key in results.get('img_fields', ['img']): + results[key] = imflip( + results[key], direction=results['flip_direction']) + # flip bboxes + for key in results.get('bbox_fields', []): + results[key] = self.bbox_flip(results[key], + results['img_shape'], + results['flip_direction']) + # flip masks + for key in results.get('mask_fields', []): + results[key] = results[key].flip(results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + results[key] = imflip( + results[key], direction=results['flip_direction']) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})' + + +@PIPELINES.register_module() +class RandomShift: + """Shift the image and box given shift pixels and probability. + + Args: + shift_ratio (float): Probability of shifts. Default 0.5. + max_shift_px (int): The max pixels for shifting. Default 32. + filter_thr_px (int): The width and height threshold for filtering. + The bbox and the rest of the targets below the width and + height threshold will be filtered. Default 1. + """ + + def __init__(self, shift_ratio=0.5, max_shift_px=32, filter_thr_px=1): + assert 0 <= shift_ratio <= 1 + assert max_shift_px >= 0 + self.shift_ratio = shift_ratio + self.max_shift_px = max_shift_px + self.filter_thr_px = int(filter_thr_px) + # The key correspondence from bboxes to labels. + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + + def __call__(self, results): + """Call function to random shift images, bounding boxes. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Shift results. + """ + if random.random() < self.shift_ratio: + img_shape = results['img'].shape[:2] + + random_shift_x = random.randint(-self.max_shift_px, + self.max_shift_px) + random_shift_y = random.randint(-self.max_shift_px, + self.max_shift_px) + new_x = max(0, random_shift_x) + orig_x = max(0, -random_shift_x) + new_y = max(0, random_shift_y) + orig_y = max(0, -random_shift_y) + + # TODO: support mask and semantic segmentation maps. + for key in results.get('bbox_fields', []): + bboxes = results[key].copy() + bboxes[..., 0::2] += random_shift_x + bboxes[..., 1::2] += random_shift_y + + # clip border + bboxes[..., 0::2] = np.clip(bboxes[..., 0::2], 0, img_shape[1]) + bboxes[..., 1::2] = np.clip(bboxes[..., 1::2], 0, img_shape[0]) + + # remove invalid bboxes + bbox_w = bboxes[..., 2] - bboxes[..., 0] + bbox_h = bboxes[..., 3] - bboxes[..., 1] + valid_inds = (bbox_w > self.filter_thr_px) & ( + bbox_h > self.filter_thr_px) + # If the shift does not contain any gt-bbox area, skip this + # image. 
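+                # (the image fields are only shifted after every bbox field
+                # has passed this check, so this early return leaves the
+                # sample's image untouched)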
+ if key == 'gt_bboxes' and not valid_inds.any(): + return results + bboxes = bboxes[valid_inds] + results[key] = bboxes + + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + + for key in results.get('img_fields', ['img']): + img = results[key] + new_img = np.zeros_like(img) + img_h, img_w = img.shape[:2] + new_h = img_h - np.abs(random_shift_y) + new_w = img_w - np.abs(random_shift_x) + new_img[new_y:new_y + new_h, new_x:new_x + new_w] \ + = img[orig_y:orig_y + new_h, orig_x:orig_x + new_w] + results[key] = new_img + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(max_shift_px={self.max_shift_px}, ' + return repr_str + + +@PIPELINES.register_module() +class Pad: + """Pad the image & mask. + + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. + """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + for key in results.get('img_fields', ['img']): + if self.size is not None: + padded_img = impad( + results[key], shape=self.size, pad_val=self.pad_val) + elif self.size_divisor is not None: + padded_img = impad_to_multiple( + results[key], self.size_divisor, pad_val=self.pad_val) + results[key] = padded_img + results['pad_shape'] = padded_img.shape + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def _pad_masks(self, results): + """Pad masks according to ``results['pad_shape']``.""" + pad_shape = results['pad_shape'][:2] + for key in results.get('mask_fields', []): + results[key] = results[key].pad(pad_shape, pad_val=self.pad_val) + + def _pad_seg(self, results): + """Pad semantic segmentation map according to + ``results['pad_shape']``.""" + for key in results.get('seg_fields', []): + results[key] = impad( + results[key], shape=results['pad_shape'][:2]) + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + self._pad_img(results) + self._pad_masks(results) + self._pad_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class Normalize: + """Normalize the image. + + Added key is "img_norm_cfg". + + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. 
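+
+    A common configuration (ImageNet statistics, shown purely as an example):
+
+    .. code-block::
+
+        dict(
+            type='Normalize',
+            mean=[123.675, 116.28, 103.53],
+            std=[58.395, 57.12, 57.375],
+            to_rgb=True)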
+ """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def __call__(self, results): + """Call function to normalize images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + for key in results.get('img_fields', ['img']): + results[key] = imnormalize(results[key], self.mean, self.std, + self.to_rgb) + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class RandomCrop: + """Random crop the image & bboxes & masks. + + The absolute `crop_size` is sampled based on `crop_type` and `image_size`, + then the cropped results are generated. + + Args: + crop_size (tuple): The relative ratio or absolute pixels of + height and width. + crop_type (str, optional): one of "relative_range", "relative", + "absolute", "absolute_range". "relative" randomly crops + (h * crop_size[0], w * crop_size[1]) part from an input of size + (h, w). "relative_range" uniformly samples relative crop size from + range [crop_size[0], 1] and [crop_size[1], 1] for height and width + respectively. "absolute" crops from an input with absolute size + (crop_size[0], crop_size[1]). "absolute_range" uniformly samples + crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w + in range [crop_size[0], min(w, crop_size[1])]. Default "absolute". + allow_negative_crop (bool, optional): Whether to allow a crop that does + not contain any bbox area. Default False. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + + Note: + - If the image is smaller than the absolute crop size, return the + original image. + - The keys for bboxes, labels and masks must be aligned. That is, + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and + `gt_bboxes_ignore` corresponds to `gt_labels_ignore` and + `gt_masks_ignore`. + - If the crop does not contain any gt-bbox region and + `allow_negative_crop` is set to False, skip this image. + """ + + def __init__(self, + crop_size, + crop_type='absolute', + allow_negative_crop=False, + bbox_clip_border=True): + if crop_type not in [ + 'relative_range', 'relative', 'absolute', 'absolute_range' + ]: + raise ValueError(f'Invalid crop_type {crop_type}.') + if crop_type in ['absolute', 'absolute_range']: + assert crop_size[0] > 0 and crop_size[1] > 0 + assert isinstance(crop_size[0], int) and isinstance( + crop_size[1], int) + else: + assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1 + self.crop_size = crop_size + self.crop_type = crop_type + self.allow_negative_crop = allow_negative_crop + self.bbox_clip_border = bbox_clip_border + # The key correspondence from bboxes to labels and masks. + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + + def _crop_data(self, results, crop_size, allow_negative_crop): + """Function to randomly crop images, bounding boxes, masks, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + crop_size (tuple): Expected absolute size after cropping, (h, w). 
+ allow_negative_crop (bool): Whether to allow a crop that does not + contain any bbox area. Default to False. + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. + """ + assert crop_size[0] > 0 and crop_size[1] > 0 + for key in results.get('img_fields', ['img']): + img = results[key] + margin_h = max(img.shape[0] - crop_size[0], 0) + margin_w = max(img.shape[1] - crop_size[1], 0) + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] + + # crop the image + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + img_shape = img.shape + results[key] = img + results['img_shape'] = img_shape + + # crop bboxes accordingly and clip to the image boundary + for key in results.get('bbox_fields', []): + # e.g. gt_bboxes and gt_bboxes_ignore + bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], + dtype=np.float32) + bboxes = results[key] - bbox_offset + if self.bbox_clip_border: + bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) + bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) + valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( + bboxes[:, 3] > bboxes[:, 1]) + # If the crop does not contain any gt-bbox area and + # allow_negative_crop is False, skip this image. + if (key == 'gt_bboxes' and not valid_inds.any() + and not allow_negative_crop): + return None + results[key] = bboxes[valid_inds, :] + # label fields. e.g. gt_labels and gt_labels_ignore + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][valid_inds] + + # mask fields, e.g. gt_masks and gt_masks_ignore + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][ + valid_inds.nonzero()[0]].crop( + np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) + + # crop semantic seg + for key in results.get('seg_fields', []): + results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] + + return results + + def _get_crop_size(self, image_size): + """Randomly generates the absolute crop size based on `crop_type` and + `image_size`. + + Args: + image_size (tuple): (h, w). + + Returns: + crop_size (tuple): (crop_h, crop_w) in absolute pixels. + """ + h, w = image_size + if self.crop_type == 'absolute': + return (min(self.crop_size[0], h), min(self.crop_size[1], w)) + elif self.crop_type == 'absolute_range': + assert self.crop_size[0] <= self.crop_size[1] + crop_h = np.random.randint( + min(h, self.crop_size[0]), + min(h, self.crop_size[1]) + 1) + crop_w = np.random.randint( + min(w, self.crop_size[0]), + min(w, self.crop_size[1]) + 1) + return crop_h, crop_w + elif self.crop_type == 'relative': + crop_h, crop_w = self.crop_size + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + elif self.crop_type == 'relative_range': + crop_size = np.asarray(self.crop_size, dtype=np.float32) + crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size) + return int(h * crop_h + 0.5), int(w * crop_w + 0.5) + + def __call__(self, results): + """Call function to randomly crop images, bounding boxes, masks, + semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. 
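+
+            Note that ``None`` is returned when the crop contains no gt-bbox
+            area and ``allow_negative_crop`` is False, so downstream code
+            should be prepared to skip such samples.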
+ """ + image_size = results['img'].shape[:2] + crop_size = self._get_crop_size(image_size) + results = self._crop_data(results, crop_size, self.allow_negative_crop) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'crop_type={self.crop_type}, ' + repr_str += f'allow_negative_crop={self.allow_negative_crop}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class SegRescale: + """Rescale semantic segmentation maps. + + Args: + scale_factor (float): The scale factor of the final output. + backend (str): Image rescale backend, choices are 'cv2' and 'pillow'. + These two backends generates slightly different results. Defaults + to 'cv2'. + """ + + def __init__(self, scale_factor=1, backend='cv2'): + self.scale_factor = scale_factor + self.backend = backend + + def __call__(self, results): + """Call function to scale the semantic segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with semantic segmentation map scaled. + """ + + for key in results.get('seg_fields', []): + if self.scale_factor != 1: + results[key] = imrescale( + results[key], + self.scale_factor, + interpolation='nearest', + backend=self.backend) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(scale_factor={self.scale_factor})' + + +@PIPELINES.register_module() +class PhotoMetricDistortion: + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. 
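+
+            The image must already be of dtype ``np.float32`` (e.g. by setting
+            ``to_float32=True`` in the image loading transform); this is
+            asserted below before any distortion is applied.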
+ """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + +@PIPELINES.register_module() +class Expand: + """Random expand the image & bboxes. + + Randomly place the original image on a canvas of 'ratio' x original image + size filled with mean values. The ratio is in the range of ratio_range. + + Args: + mean (tuple): mean value of dataset. + to_rgb (bool): if need to convert the order of mean to align with RGB. + ratio_range (tuple): range of expand ratio. + prob (float): probability of applying this transformation + """ + + def __init__(self, + mean=(0, 0, 0), + to_rgb=True, + ratio_range=(1, 4), + seg_ignore_label=None, + prob=0.5): + self.to_rgb = to_rgb + self.ratio_range = ratio_range + if to_rgb: + self.mean = mean[::-1] + else: + self.mean = mean + self.min_ratio, self.max_ratio = ratio_range + self.seg_ignore_label = seg_ignore_label + self.prob = prob + + def __call__(self, results): + """Call function to expand images, bounding boxes. + + Args: + results (dict): Result dict from loading pipeline. 
+ + Returns: + dict: Result dict with images, bounding boxes expanded + """ + + if random.uniform(0, 1) > self.prob: + return results + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + + h, w, c = img.shape + ratio = random.uniform(self.min_ratio, self.max_ratio) + # speedup expand when meets large image + if np.all(self.mean == self.mean[0]): + expand_img = np.empty((int(h * ratio), int(w * ratio), c), + img.dtype) + expand_img.fill(self.mean[0]) + else: + expand_img = np.full((int(h * ratio), int(w * ratio), c), + self.mean, + dtype=img.dtype) + left = int(random.uniform(0, w * ratio - w)) + top = int(random.uniform(0, h * ratio - h)) + expand_img[top:top + h, left:left + w] = img + + results['img'] = expand_img + # expand bboxes + for key in results.get('bbox_fields', []): + results[key] = results[key] + np.tile( + (left, top), 2).astype(results[key].dtype) + + # expand masks + for key in results.get('mask_fields', []): + results[key] = results[key].expand( + int(h * ratio), int(w * ratio), top, left) + + # expand segs + for key in results.get('seg_fields', []): + gt_seg = results[key] + expand_gt_seg = np.full((int(h * ratio), int(w * ratio)), + self.seg_ignore_label, + dtype=gt_seg.dtype) + expand_gt_seg[top:top + h, left:left + w] = gt_seg + results[key] = expand_gt_seg + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, ' + repr_str += f'ratio_range={self.ratio_range}, ' + repr_str += f'seg_ignore_label={self.seg_ignore_label})' + return repr_str + + +@PIPELINES.register_module() +class MinIoURandomCrop: + """Random crop the image & bboxes, the cropped patches have minimum IoU + requirement with original image & bboxes, the IoU threshold is randomly + selected from min_ious. + + Args: + min_ious (tuple): minimum IoU threshold for all intersections with + bounding boxes + min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w, + where a >= min_crop_size). + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. + + Note: + The keys for bboxes, labels and masks should be paired. That is, \ + `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \ + `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`. + """ + + def __init__(self, + min_ious=(0.1, 0.3, 0.5, 0.7, 0.9), + min_crop_size=0.3, + bbox_clip_border=True): + # 1: return ori img + self.min_ious = min_ious + self.sample_mode = (1, *min_ious, 0) + self.min_crop_size = min_crop_size + self.bbox_clip_border = bbox_clip_border + self.bbox2label = { + 'gt_bboxes': 'gt_labels', + 'gt_bboxes_ignore': 'gt_labels_ignore' + } + self.bbox2mask = { + 'gt_bboxes': 'gt_masks', + 'gt_bboxes_ignore': 'gt_masks_ignore' + } + + def __call__(self, results): + """Call function to crop images and bounding boxes with minimum IoU + constraint. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images and bounding boxes cropped, \ + 'img_shape' key is updated. 
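+
+            The crop mode is sampled from ``(1, *min_ious, 0)``: mode ``1``
+            returns the input unchanged, while any other mode tries up to 50
+            candidate patches whose minimum IoU with the gt boxes satisfies
+            the sampled threshold.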
+ """ + + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + img = results['img'] + assert 'bbox_fields' in results + boxes = [results[key] for key in results['bbox_fields']] + boxes = np.concatenate(boxes, 0) + h, w, c = img.shape + while True: + mode = random.choice(self.sample_mode) + self.mode = mode + if mode == 1: + return results + + min_iou = mode + for i in range(50): + new_w = random.uniform(self.min_crop_size * w, w) + new_h = random.uniform(self.min_crop_size * h, h) + + # h / w in [0.5, 2] + if new_h / new_w < 0.5 or new_h / new_w > 2: + continue + + left = random.uniform(w - new_w) + top = random.uniform(h - new_h) + + patch = np.array( + (int(left), int(top), int(left + new_w), int(top + new_h))) + # Line or point crop is not allowed + if patch[2] == patch[0] or patch[3] == patch[1]: + continue + overlaps = bbox_overlaps( + patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1) + if len(overlaps) > 0 and overlaps.min() < min_iou: + continue + + # center of boxes should inside the crop img + # only adjust boxes and instance masks when the gt is not empty + if len(overlaps) > 0: + # adjust boxes + def is_center_of_bboxes_in_patch(boxes, patch): + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = ((center[:, 0] > patch[0]) * + (center[:, 1] > patch[1]) * + (center[:, 0] < patch[2]) * + (center[:, 1] < patch[3])) + return mask + + mask = is_center_of_bboxes_in_patch(boxes, patch) + if not mask.any(): + continue + for key in results.get('bbox_fields', []): + boxes = results[key].copy() + mask = is_center_of_bboxes_in_patch(boxes, patch) + boxes = boxes[mask] + if self.bbox_clip_border: + boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:]) + boxes[:, :2] = boxes[:, :2].clip(min=patch[:2]) + boxes -= np.tile(patch[:2], 2) + + results[key] = boxes + # labels + label_key = self.bbox2label.get(key) + if label_key in results: + results[label_key] = results[label_key][mask] + + # mask fields + mask_key = self.bbox2mask.get(key) + if mask_key in results: + results[mask_key] = results[mask_key][ + mask.nonzero()[0]].crop(patch) + # adjust the img no matter whether the gt is empty before crop + img = img[patch[1]:patch[3], patch[0]:patch[2]] + results['img'] = img + results['img_shape'] = img.shape + + # seg fields + for key in results.get('seg_fields', []): + results[key] = results[key][patch[1]:patch[3], + patch[0]:patch[2]] + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_ious={self.min_ious}, ' + repr_str += f'min_crop_size={self.min_crop_size}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class Corrupt: + """Corruption augmentation. + + Corruption transforms implemented based on + `imagecorruptions `_. + + Args: + corruption (str): Corruption name. + severity (int, optional): The severity of corruption. Default: 1. + """ + + def __init__(self, corruption, severity=1): + self.corruption = corruption + self.severity = severity + + def __call__(self, results): + """Call function to corrupt image. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images corrupted. 
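+
+        Example of typical usage in a pipeline config (illustrative values;
+        the optional ``imagecorruptions`` package must be installed)::
+
+            dict(type='Corrupt', corruption='gaussian_noise', severity=1)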
+ """ + + if corrupt is None: + raise RuntimeError('imagecorruptions is not installed') + if 'img_fields' in results: + assert results['img_fields'] == ['img'], \ + 'Only single img_fields is allowed' + results['img'] = corrupt( + results['img'].astype(np.uint8), + corruption_name=self.corruption, + severity=self.severity) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(corruption={self.corruption}, ' + repr_str += f'severity={self.severity})' + return repr_str + + +@PIPELINES.register_module() +class Albu: + """Albumentation augmentation. + + Adds custom transformations from Albumentations library. + Please, visit `https://albumentations.readthedocs.io` + to get more information. + + An example of ``transforms`` is as followed: + + .. code-block:: + + [ + dict( + type='ShiftScaleRotate', + shift_limit=0.0625, + scale_limit=0.0, + rotate_limit=0, + interpolation=1, + p=0.5), + dict( + type='RandomBrightnessContrast', + brightness_limit=[0.1, 0.3], + contrast_limit=[0.1, 0.3], + p=0.2), + dict(type='ChannelShuffle', p=0.1), + dict( + type='OneOf', + transforms=[ + dict(type='Blur', blur_limit=3, p=1.0), + dict(type='MedianBlur', blur_limit=3, p=1.0) + ], + p=0.1), + ] + + Args: + transforms (list[dict]): A list of albu transformations + bbox_params (dict): Bbox_params for albumentation `Compose` + keymap (dict): Contains {'input key':'albumentation-style key'} + skip_img_without_anno (bool): Whether to skip the image if no ann left + after aug + """ + + def __init__(self, + transforms, + bbox_params=None, + keymap=None, + update_pad_shape=False, + skip_img_without_anno=False): + if Compose is None: + raise RuntimeError('albumentations is not installed') + + # Args will be modified later, copying it will be safer + transforms = copy.deepcopy(transforms) + if bbox_params is not None: + bbox_params = copy.deepcopy(bbox_params) + if keymap is not None: + keymap = copy.deepcopy(keymap) + self.transforms = transforms + self.filter_lost_elements = False + self.update_pad_shape = update_pad_shape + self.skip_img_without_anno = skip_img_without_anno + + # A simple workaround to remove masks without boxes + if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params + and 'filter_lost_elements' in bbox_params): + self.filter_lost_elements = True + self.origin_label_fields = bbox_params['label_fields'] + bbox_params['label_fields'] = ['idx_mapper'] + del bbox_params['filter_lost_elements'] + + self.bbox_params = ( + self.albu_builder(bbox_params) if bbox_params else None) + self.aug = Compose([self.albu_builder(t) for t in self.transforms], + bbox_params=self.bbox_params) + + if not keymap: + self.keymap_to_albu = { + 'img': 'image', + 'gt_masks': 'masks', + 'gt_bboxes': 'bboxes' + } + else: + self.keymap_to_albu = keymap + self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()} + + def albu_builder(self, cfg): + """Import a module from albumentations. + + It inherits some of :func:`build_from_cfg` logic. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + + Returns: + obj: The constructed object. 
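+
+        Example (illustrative only; assumes ``albumentations`` is
+        installed; this method is called internally from ``Albu.__init__``)::
+
+            self.albu_builder(dict(type='Blur', blur_limit=3, p=1.0))
+            # roughly equivalent to albumentations.Blur(blur_limit=3, p=1.0)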
+ """ + + assert isinstance(cfg, dict) and 'type' in cfg + args = cfg.copy() + + obj_type = args.pop('type') + if is_str(obj_type): + if albumentations is None: + raise RuntimeError('albumentations is not installed') + obj_cls = getattr(albumentations, obj_type) + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + if 'transforms' in args: + args['transforms'] = [ + self.albu_builder(transform) + for transform in args['transforms'] + ] + + return obj_cls(**args) + + @staticmethod + def mapper(d, keymap): + """Dictionary mapper. Renames keys according to keymap provided. + + Args: + d (dict): old dict + keymap (dict): {'old_key':'new_key'} + Returns: + dict: new dict. + """ + + updated_dict = {} + for k, v in zip(d.keys(), d.values()): + new_k = keymap.get(k, k) + updated_dict[new_k] = d[k] + return updated_dict + + def __call__(self, results): + # dict to albumentations format + results = self.mapper(results, self.keymap_to_albu) + # TODO: add bbox_fields + if 'bboxes' in results: + # to list of boxes + if isinstance(results['bboxes'], np.ndarray): + results['bboxes'] = [x for x in results['bboxes']] + # add pseudo-field for filtration + if self.filter_lost_elements: + results['idx_mapper'] = np.arange(len(results['bboxes'])) + + # TODO: Support mask structure in albu + if 'masks' in results: + if isinstance(results['masks'], PolygonMasks): + raise NotImplementedError( + 'Albu only supports BitMap masks now') + ori_masks = results['masks'] + if albumentations.__version__ < '0.5': + results['masks'] = results['masks'].masks + else: + results['masks'] = [mask for mask in results['masks'].masks] + + results = self.aug(**results) + + if 'bboxes' in results: + if isinstance(results['bboxes'], list): + results['bboxes'] = np.array( + results['bboxes'], dtype=np.float32) + results['bboxes'] = results['bboxes'].reshape(-1, 4) + + # filter label_fields + if self.filter_lost_elements: + + for label in self.origin_label_fields: + results[label] = np.array( + [results[label][i] for i in results['idx_mapper']]) + if 'masks' in results: + results['masks'] = np.array( + [results['masks'][i] for i in results['idx_mapper']]) + results['masks'] = ori_masks.__class__( + results['masks'], results['image'].shape[0], + results['image'].shape[1]) + + if (not len(results['idx_mapper']) + and self.skip_img_without_anno): + return None + + if 'gt_labels' in results: + if isinstance(results['gt_labels'], list): + results['gt_labels'] = np.array(results['gt_labels']) + results['gt_labels'] = results['gt_labels'].astype(np.int64) + + # back to the original format + results = self.mapper(results, self.keymap_back) + + # update final shape + if self.update_pad_shape: + results['pad_shape'] = results['img'].shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + f'(transforms={self.transforms})' + return repr_str + + +@PIPELINES.register_module() +class RandomCenterCropPad: + """Random center crop and random around padding for CornerNet. + + This operation generates randomly cropped image from the original image and + pads it simultaneously. Different from :class:`RandomCrop`, the output + shape may not equal to ``crop_size`` strictly. We choose a random value + from ``ratios`` and the output shape could be larger or smaller than + ``crop_size``. The padding operation is also different from :class:`Pad`, + here we use around padding instead of right-bottom padding. 
+ + The relation between output image (padding image) and original image: + + .. code:: text + + output image + + +----------------------------+ + | padded area | + +------|----------------------------|----------+ + | | cropped area | | + | | +---------------+ | | + | | | . center | | | original image + | | | range | | | + | | +---------------+ | | + +------|----------------------------|----------+ + | padded area | + +----------------------------+ + + There are 5 main areas in the figure: + + - output image: output image of this operation, also called padding + image in following instruction. + - original image: input image of this operation. + - padded area: non-intersect area of output image and original image. + - cropped area: the overlap of output image and original image. + - center range: a smaller area where random center chosen from. + center range is computed by ``border`` and original image's shape + to avoid our random center is too close to original image's border. + + Also this operation act differently in train and test mode, the summary + pipeline is listed below. + + Train pipeline: + + 1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image + will be ``random_ratio * crop_size``. + 2. Choose a ``random_center`` in center range. + 3. Generate padding image with center matches the ``random_center``. + 4. Initialize the padding image with pixel value equals to ``mean``. + 5. Copy the cropped area to padding image. + 6. Refine annotations. + + Test pipeline: + + 1. Compute output shape according to ``test_pad_mode``. + 2. Generate padding image with center matches the original image + center. + 3. Initialize the padding image with pixel value equals to ``mean``. + 4. Copy the ``cropped area`` to padding image. + + Args: + crop_size (tuple | None): expected size after crop, final size will + computed according to ratio. Requires (h, w) in train mode, and + None in test mode. + ratios (tuple): random select a ratio from tuple and crop image to + (crop_size[0] * ratio) * (crop_size[1] * ratio). + Only available in train mode. + border (int): max distance from center select area to image border. + Only available in train mode. + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB. + test_mode (bool): whether involve random variables in transform. + In train mode, crop_size is fixed, center coords and ratio is + random selected from predefined lists. In test mode, crop_size + is image's original shape, center coords and ratio is fixed. + test_pad_mode (tuple): padding method and padding shape value, only + available in test mode. Default is using 'logical_or' with + 127 as padding shape value. + + - 'logical_or': final_shape = input_shape | padding_shape_value + - 'size_divisor': final_shape = int( + ceil(input_shape / padding_shape_value) * padding_shape_value) + test_pad_add_pix (int): Extra padding pixel in test mode. Default 0. + bbox_clip_border (bool, optional): Whether clip the objects outside + the border of the image. Defaults to True. 
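+
+    Example of typical train/test usage in a pipeline config (illustrative
+    values only; ``mean``/``std``/``to_rgb`` should match the ``Normalize``
+    settings used elsewhere in the pipeline)::
+
+        train_pipeline = [
+            dict(
+                type='RandomCenterCropPad',
+                crop_size=(511, 511),
+                ratios=(0.6, 0.8, 1.0, 1.2),
+                mean=[0, 0, 0],
+                std=[1, 1, 1],
+                to_rgb=True,
+                test_mode=False,
+                test_pad_mode=None),
+        ]
+        test_pipeline = [
+            dict(
+                type='RandomCenterCropPad',
+                crop_size=None,
+                ratios=None,
+                border=None,
+                mean=[0, 0, 0],
+                std=[1, 1, 1],
+                to_rgb=True,
+                test_mode=True,
+                test_pad_mode=('logical_or', 127)),
+        ]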
+ """ + + def __init__(self, + crop_size=None, + ratios=(0.9, 1.0, 1.1), + border=128, + mean=None, + std=None, + to_rgb=None, + test_mode=False, + test_pad_mode=('logical_or', 127), + test_pad_add_pix=0, + bbox_clip_border=True): + if test_mode: + assert crop_size is None, 'crop_size must be None in test mode' + assert ratios is None, 'ratios must be None in test mode' + assert border is None, 'border must be None in test mode' + assert isinstance(test_pad_mode, (list, tuple)) + assert test_pad_mode[0] in ['logical_or', 'size_divisor'] + else: + assert isinstance(crop_size, (list, tuple)) + assert crop_size[0] > 0 and crop_size[1] > 0, ( + 'crop_size must > 0 in train mode') + assert isinstance(ratios, (list, tuple)) + assert test_pad_mode is None, ( + 'test_pad_mode must be None in train mode') + + self.crop_size = crop_size + self.ratios = ratios + self.border = border + # We do not set default value to mean, std and to_rgb because these + # hyper-parameters are easy to forget but could affect the performance. + # Please use the same setting as Normalize for performance assurance. + assert mean is not None and std is not None and to_rgb is not None + self.to_rgb = to_rgb + self.input_mean = mean + self.input_std = std + if to_rgb: + self.mean = mean[::-1] + self.std = std[::-1] + else: + self.mean = mean + self.std = std + self.test_mode = test_mode + self.test_pad_mode = test_pad_mode + self.test_pad_add_pix = test_pad_add_pix + self.bbox_clip_border = bbox_clip_border + + def _get_border(self, border, size): + """Get final border for the target size. + + This function generates a ``final_border`` according to image's shape. + The area between ``final_border`` and ``size - final_border`` is the + ``center range``. We randomly choose center from the ``center range`` + to avoid our random center is too close to original image's border. + Also ``center range`` should be larger than 0. + + Args: + border (int): The initial border, default is 128. + size (int): The width or height of original image. + Returns: + int: The final border. + """ + k = 2 * border / size + i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k))) + return border // i + + def _filter_boxes(self, patch, boxes): + """Check whether the center of each box is in the patch. + + Args: + patch (list[int]): The cropped area, [left, top, right, bottom]. + boxes (numpy array, (N x 4)): Ground truth boxes. + + Returns: + mask (numpy array, (N,)): Each box is inside or outside the patch. + """ + center = (boxes[:, :2] + boxes[:, 2:]) / 2 + mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * ( + center[:, 0] < patch[2]) * ( + center[:, 1] < patch[3]) + return mask + + def _crop_image_and_paste(self, image, center, size): + """Crop image with a given center and size, then paste the cropped + image to a blank image with two centers align. + + This function is equivalent to generating a blank image with ``size`` + as its shape. Then cover it on the original image with two centers ( + the center of blank image and the random center of original image) + aligned. The overlap area is paste from the original image and the + outside area is filled with ``mean pixel``. + + Args: + image (np array, H x W x C): Original image. + center (list[int]): Target crop center coord. + size (list[int]): Target crop size. [target_h, target_w] + + Returns: + cropped_img (np array, target_h x target_w x C): Cropped image. 
+ border (np array, 4): The distance of four border of + ``cropped_img`` to the original image area, [top, bottom, + left, right] + patch (list[int]): The cropped area, [left, top, right, bottom]. + """ + center_y, center_x = center + target_h, target_w = size + img_h, img_w, img_c = image.shape + + x0 = max(0, center_x - target_w // 2) + x1 = min(center_x + target_w // 2, img_w) + y0 = max(0, center_y - target_h // 2) + y1 = min(center_y + target_h // 2, img_h) + patch = np.array((int(x0), int(y0), int(x1), int(y1))) + + left, right = center_x - x0, x1 - center_x + top, bottom = center_y - y0, y1 - center_y + + cropped_center_y, cropped_center_x = target_h // 2, target_w // 2 + cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype) + for i in range(img_c): + cropped_img[:, :, i] += self.mean[i] + y_slice = slice(cropped_center_y - top, cropped_center_y + bottom) + x_slice = slice(cropped_center_x - left, cropped_center_x + right) + cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :] + + border = np.array([ + cropped_center_y - top, cropped_center_y + bottom, + cropped_center_x - left, cropped_center_x + right + ], + dtype=np.float32) + + return cropped_img, border, patch + + def _train_aug(self, results): + """Random crop and around padding the original image. + + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. + """ + img = results['img'] + h, w, c = img.shape + boxes = results['gt_bboxes'] + while True: + scale = random.choice(self.ratios) + new_h = int(self.crop_size[0] * scale) + new_w = int(self.crop_size[1] * scale) + h_border = self._get_border(self.border, h) + w_border = self._get_border(self.border, w) + + for i in range(50): + center_x = random.randint(low=w_border, high=w - w_border) + center_y = random.randint(low=h_border, high=h - h_border) + + cropped_img, border, patch = self._crop_image_and_paste( + img, [center_y, center_x], [new_h, new_w]) + + mask = self._filter_boxes(patch, boxes) + # if image do not have valid bbox, any crop patch is valid. + if not mask.any() and len(boxes) > 0: + continue + + results['img'] = cropped_img + results['img_shape'] = cropped_img.shape + results['pad_shape'] = cropped_img.shape + + x0, y0, x1, y1 = patch + + left_w, top_h = center_x - x0, center_y - y0 + cropped_center_x, cropped_center_y = new_w // 2, new_h // 2 + + # crop bboxes accordingly and clip to the image boundary + for key in results.get('bbox_fields', []): + mask = self._filter_boxes(patch, results[key]) + bboxes = results[key][mask] + bboxes[:, 0:4:2] += cropped_center_x - left_w - x0 + bboxes[:, 1:4:2] += cropped_center_y - top_h - y0 + if self.bbox_clip_border: + bboxes[:, 0:4:2] = np.clip(bboxes[:, 0:4:2], 0, new_w) + bboxes[:, 1:4:2] = np.clip(bboxes[:, 1:4:2], 0, new_h) + keep = (bboxes[:, 2] > bboxes[:, 0]) & ( + bboxes[:, 3] > bboxes[:, 1]) + bboxes = bboxes[keep] + results[key] = bboxes + if key in ['gt_bboxes']: + if 'gt_labels' in results: + labels = results['gt_labels'][mask] + labels = labels[keep] + results['gt_labels'] = labels + if 'gt_masks' in results: + raise NotImplementedError( + 'RandomCenterCropPad only supports bbox.') + + # crop semantic seg + for key in results.get('seg_fields', []): + raise NotImplementedError( + 'RandomCenterCropPad only supports bbox.') + return results + + def _test_aug(self, results): + """Around padding the original image without cropping. + + The padding mode and value are from ``test_pad_mode``. 
+ + Args: + results (dict): Image infomations in the augment pipeline. + + Returns: + results (dict): The updated dict. + """ + img = results['img'] + h, w, c = img.shape + results['img_shape'] = img.shape + if self.test_pad_mode[0] in ['logical_or']: + # self.test_pad_add_pix is only used for centernet + target_h = (h | self.test_pad_mode[1]) + self.test_pad_add_pix + target_w = (w | self.test_pad_mode[1]) + self.test_pad_add_pix + elif self.test_pad_mode[0] in ['size_divisor']: + divisor = self.test_pad_mode[1] + target_h = int(np.ceil(h / divisor)) * divisor + target_w = int(np.ceil(w / divisor)) * divisor + else: + raise NotImplementedError( + 'RandomCenterCropPad only support two testing pad mode:' + 'logical-or and size_divisor.') + + cropped_img, border, _ = self._crop_image_and_paste( + img, [h // 2, w // 2], [target_h, target_w]) + results['img'] = cropped_img + results['pad_shape'] = cropped_img.shape + results['border'] = border + return results + + def __call__(self, results): + img = results['img'] + assert img.dtype == np.float32, ( + 'RandomCenterCropPad needs the input image of dtype np.float32,' + ' please set "to_float32=True" in "LoadImageFromFile" pipeline') + h, w, c = img.shape + assert c == len(self.mean) + if self.test_mode: + return self._test_aug(results) + else: + return self._train_aug(results) + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(crop_size={self.crop_size}, ' + repr_str += f'ratios={self.ratios}, ' + repr_str += f'border={self.border}, ' + repr_str += f'mean={self.input_mean}, ' + repr_str += f'std={self.input_std}, ' + repr_str += f'to_rgb={self.to_rgb}, ' + repr_str += f'test_mode={self.test_mode}, ' + repr_str += f'test_pad_mode={self.test_pad_mode}, ' + repr_str += f'bbox_clip_border={self.bbox_clip_border})' + return repr_str + + +@PIPELINES.register_module() +class CutOut: + """CutOut operation. + + Randomly drop some regions of image used in + `Cutout `_. + + Args: + n_holes (int | tuple[int, int]): Number of regions to be dropped. + If it is given as a list, number of holes will be randomly + selected from the closed interval [`n_holes[0]`, `n_holes[1]`]. + cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate + shape of dropped regions. It can be `tuple[int, int]` to use a + fixed cutout shape, or `list[tuple[int, int]]` to randomly choose + shape from the list. + cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The + candidate ratio of dropped regions. It can be `tuple[float, float]` + to use a fixed ratio or `list[tuple[float, float]]` to randomly + choose ratio from the list. Please note that `cutout_shape` + and `cutout_ratio` cannot be both given at the same time. + fill_in (tuple[float, float, float] | tuple[int, int, int]): The value + of pixel to fill in the dropped regions. Default: (0, 0, 0). + """ + + def __init__(self, + n_holes, + cutout_shape=None, + cutout_ratio=None, + fill_in=(0, 0, 0)): + + assert (cutout_shape is None) ^ (cutout_ratio is None), \ + 'Either cutout_shape or cutout_ratio should be specified.' 
+ assert (isinstance(cutout_shape, (list, tuple)) + or isinstance(cutout_ratio, (list, tuple))) + if isinstance(n_holes, tuple): + assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1] + else: + n_holes = (n_holes, n_holes) + self.n_holes = n_holes + self.fill_in = fill_in + self.with_ratio = cutout_ratio is not None + self.candidates = cutout_ratio if self.with_ratio else cutout_shape + if not isinstance(self.candidates, list): + self.candidates = [self.candidates] + + def __call__(self, results): + """Call function to drop some regions of image.""" + h, w, c = results['img'].shape + n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1) + for _ in range(n_holes): + x1 = np.random.randint(0, w) + y1 = np.random.randint(0, h) + index = np.random.randint(0, len(self.candidates)) + if not self.with_ratio: + cutout_w, cutout_h = self.candidates[index] + else: + cutout_w = int(self.candidates[index][0] * w) + cutout_h = int(self.candidates[index][1] * h) + + x2 = np.clip(x1 + cutout_w, 0, w) + y2 = np.clip(y1 + cutout_h, 0, h) + results['img'][y1:y2, x1:x2, :] = self.fill_in + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(n_holes={self.n_holes}, ' + repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio + else f'cutout_shape={self.candidates}, ') + repr_str += f'fill_in={self.fill_in})' + return repr_str diff --git a/mmcv/datasets/pipelines/transforms_3d.py b/mmcv/datasets/pipelines/transforms_3d.py new file mode 100644 index 0000000..aa7ab25 --- /dev/null +++ b/mmcv/datasets/pipelines/transforms_3d.py @@ -0,0 +1,2042 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from numpy import random +import warnings +from mmcv import is_tuple_of +from mmcv.utils import build_from_cfg +from mmcv.parallel import DataContainer as DC + +from mmcv.core.voxel.voxel_generator import VoxelGenerator +from mmcv.core.bbox.structures.cam_box3d import CameraInstance3DBoxes +from mmcv.core.bbox.structures.depth_box3d import DepthInstance3DBoxes +from mmcv.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes +from mmcv.core.bbox import box_np_ops +from mmcv.datasets.builder import PIPELINES +from mmcv.datasets.pipelines.transforms import RandomFlip +from mmcv.image import impad, impad_to_multiple, imnormalize, imresize, bgr2hsv, hsv2bgr +from ..builder import OBJECTSAMPLERS +from .data_augment_utils import noise_per_object_v3_ + + +@PIPELINES.register_module() +class RandomDropPointsColor(object): + r"""Randomly set the color of points to all zeros. + + Once this transform is executed, all the points' color will be dropped. + Refer to `PAConv `_ for more details. + + Args: + drop_ratio (float): The probability of dropping point colors. + Defaults to 0.2. + """ + + def __init__(self, drop_ratio=0.2): + assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \ + f'invalid drop_ratio value {drop_ratio}' + self.drop_ratio = drop_ratio + + def __call__(self, input_dict): + """Call function to drop point colors. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after color dropping, \ + 'points' key is updated in the result dict. 
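+
+        Example of typical usage in a point cloud segmentation pipeline
+        config (illustrative; the default ``drop_ratio`` is shown)::
+
+            dict(type='RandomDropPointsColor', drop_ratio=0.2)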
+ """ + points = input_dict['points'] + assert points.attribute_dims is not None and \ + 'color' in points.attribute_dims, \ + 'Expect points have color attribute' + + # this if-expression is a bit strange + # `RandomDropPointsColor` is used in training 3D segmentor PAConv + # we discovered in our experiments that, using + # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to + # better results than using `if np.random.rand() < self.drop_ratio` + # so we keep this hack in our codebase + if np.random.rand() > 1.0 - self.drop_ratio: + points.color = points.color * 0.0 + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(drop_ratio={self.drop_ratio})' + return repr_str + + +@PIPELINES.register_module() +class RandomFlip3D(RandomFlip): + """Flip the points & bbox. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + Args: + sync_2d (bool, optional): Whether to apply flip according to the 2D + images. If True, it will apply the same flip as that to 2D images. + If False, it will decide whether to flip randomly and independently + to that of 2D images. Defaults to True. + flip_ratio_bev_horizontal (float, optional): The flipping probability + in horizontal direction. Defaults to 0.0. + flip_ratio_bev_vertical (float, optional): The flipping probability + in vertical direction. Defaults to 0.0. + """ + + def __init__(self, + sync_2d=True, + flip_ratio_bev_horizontal=0.0, + flip_ratio_bev_vertical=0.0, + **kwargs): + super(RandomFlip3D, self).__init__( + flip_ratio=flip_ratio_bev_horizontal, **kwargs) + self.sync_2d = sync_2d + self.flip_ratio_bev_vertical = flip_ratio_bev_vertical + if flip_ratio_bev_horizontal is not None: + assert isinstance( + flip_ratio_bev_horizontal, + (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 + if flip_ratio_bev_vertical is not None: + assert isinstance( + flip_ratio_bev_vertical, + (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 + + def random_flip_data_3d(self, input_dict, direction='horizontal'): + """Flip 3D data randomly. + + Args: + input_dict (dict): Result dict from loading pipeline. + direction (str): Flip direction. Default: horizontal. + + Returns: + dict: Flipped results, 'points', 'bbox3d_fields' keys are \ + updated in the result dict. 
+ """ + assert direction in ['horizontal', 'vertical'] + if len(input_dict['bbox3d_fields']) == 0: # test mode + input_dict['bbox3d_fields'].append('empty_box3d') + input_dict['empty_box3d'] = input_dict['box_type_3d']( + np.array([], dtype=np.float32)) + assert len(input_dict['bbox3d_fields']) == 1 + for key in input_dict['bbox3d_fields']: + if 'points' in input_dict: + input_dict['points'] = input_dict[key].flip( + direction, points=input_dict['points']) + else: + input_dict[key].flip(direction) + if 'centers2d' in input_dict: + assert self.sync_2d is True and direction == 'horizontal', \ + 'Only support sync_2d=True and horizontal flip with images' + w = input_dict['ori_shape'][1] + input_dict['centers2d'][..., 0] = \ + w - input_dict['centers2d'][..., 0] + # need to modify the horizontal position of camera center + # along u-axis in the image (flip like centers2d) + # ['cam2img'][0][2] = c_u + # see more details and examples at + # https://github.com/open-mmlab/mmcvection3d/pull/744 + input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] + + def __call__(self, input_dict): + """Call function to flip points, values in the ``bbox3d_fields`` and \ + also flip 2D image and its annotations. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction', \ + 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added \ + into result dict. + """ + # filp 2D image and its annotations + super(RandomFlip3D, self).__call__(input_dict) + + if self.sync_2d: + input_dict['pcd_horizontal_flip'] = input_dict['flip'] + input_dict['pcd_vertical_flip'] = False + else: + if 'pcd_horizontal_flip' not in input_dict: + flip_horizontal = True if np.random.rand( + ) < self.flip_ratio else False + input_dict['pcd_horizontal_flip'] = flip_horizontal + if 'pcd_vertical_flip' not in input_dict: + flip_vertical = True if np.random.rand( + ) < self.flip_ratio_bev_vertical else False + input_dict['pcd_vertical_flip'] = flip_vertical + + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + if input_dict['pcd_horizontal_flip']: + self.random_flip_data_3d(input_dict, 'horizontal') + input_dict['transformation_3d_flow'].extend(['HF']) + if input_dict['pcd_vertical_flip']: + self.random_flip_data_3d(input_dict, 'vertical') + input_dict['transformation_3d_flow'].extend(['VF']) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(sync_2d={self.sync_2d},' + repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' + return repr_str + + +@PIPELINES.register_module() +class RandomJitterPoints(object): + """Randomly jitter point coordinates. + + Different from the global translation in ``GlobalRotScaleTrans``, here we \ + apply different noises to each point in a scene. + + Args: + jitter_std (list[float]): The standard deviation of jittering noise. + This applies random noise to all points in a 3D scene, which is \ + sampled from a gaussian distribution whose standard deviation is \ + set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01] + clip_range (list[float] | None): Clip the randomly generated jitter \ + noise into this range. If None is given, don't perform clipping. + Defaults to [-0.05, 0.05] + + Note: + This transform should only be used in point cloud segmentation tasks \ + because we don't transform ground-truth bboxes accordingly. 
+ For similar transform in detection task, please refer to `ObjectNoise`. + """ + + def __init__(self, + jitter_std=[0.01, 0.01, 0.01], + clip_range=[-0.05, 0.05]): + seq_types = (list, tuple, np.ndarray) + if not isinstance(jitter_std, seq_types): + assert isinstance(jitter_std, (int, float)), \ + f'unsupported jitter_std type {type(jitter_std)}' + jitter_std = [jitter_std, jitter_std, jitter_std] + self.jitter_std = jitter_std + + if clip_range is not None: + if not isinstance(clip_range, seq_types): + assert isinstance(clip_range, (int, float)), \ + f'unsupported clip_range type {type(clip_range)}' + clip_range = [-clip_range, clip_range] + self.clip_range = clip_range + + def __call__(self, input_dict): + """Call function to jitter all the points in the scene. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after adding noise to each point, \ + 'points' key is updated in the result dict. + """ + points = input_dict['points'] + jitter_std = np.array(self.jitter_std, dtype=np.float32) + jitter_noise = \ + np.random.randn(points.shape[0], 3) * jitter_std[None, :] + if self.clip_range is not None: + jitter_noise = np.clip(jitter_noise, self.clip_range[0], + self.clip_range[1]) + + points.translate(jitter_noise) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(jitter_std={self.jitter_std},' + repr_str += f' clip_range={self.clip_range})' + return repr_str + + +@PIPELINES.register_module() +class ObjectSample(object): + """Sample GT objects to the data. + + Args: + db_sampler (dict): Config dict of the database sampler. + sample_2d (bool): Whether to also paste 2D image patch to the images + This should be true when applying multi-modality cut-and-paste. + Defaults to False. + """ + + def __init__(self, db_sampler, sample_2d=False): + self.sampler_cfg = db_sampler + self.sample_2d = sample_2d + if 'type' not in db_sampler.keys(): + db_sampler['type'] = 'DataBaseSampler' + self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) + + @staticmethod + def remove_points_in_boxes(points, boxes): + """Remove the points in the sampled bounding boxes. + + Args: + points (:obj:`BasePoints`): Input point cloud array. + boxes (np.ndarray): Sampled ground truth boxes. + + Returns: + np.ndarray: Points with those in the boxes removed. + """ + masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) + points = points[np.logical_not(masks.any(-1))] + return points + + def __call__(self, input_dict): + """Call function to sample ground truth objects to the data. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after object sampling augmentation, \ + 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated \ + in the result dict. 
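+
+        Example of a ``db_sampler`` config passed to this transform
+        (illustrative placeholders only; the field names mirror the
+        attributes reported by ``__repr__`` below, while paths, classes and
+        the ``prepare`` contents depend on the dataset and the sampler
+        implementation)::
+
+            dict(
+                type='ObjectSample',
+                db_sampler=dict(
+                    type='DataBaseSampler',
+                    data_root='data/your_dataset/',
+                    info_path='data/your_dataset/dbinfos_train.pkl',
+                    rate=1.0,
+                    prepare=dict(filter_by_min_points=dict(Car=5)),
+                    classes=['Car'],
+                    sample_groups=dict(Car=15)))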
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + + # change to float for blending operation + points = input_dict['points'] + if self.sample_2d: + img = input_dict['img'] + gt_bboxes_2d = input_dict['gt_bboxes'] + # Assume for now 3D & 2D bboxes are the same + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d.tensor.numpy(), + gt_labels_3d, + gt_bboxes_2d=gt_bboxes_2d, + img=img) + else: + sampled_dict = self.db_sampler.sample_all( + gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None) + + if sampled_dict is not None: + sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] + sampled_points = sampled_dict['points'] + sampled_gt_labels = sampled_dict['gt_labels_3d'] + + gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], + axis=0) + gt_bboxes_3d = gt_bboxes_3d.new_box( + np.concatenate( + [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) + + points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) + # check the points dimension + points = points.cat([sampled_points, points]) + + if self.sample_2d: + sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] + gt_bboxes_2d = np.concatenate( + [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) + + input_dict['gt_bboxes'] = gt_bboxes_2d + input_dict['img'] = sampled_dict['img'] + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.long) + input_dict['points'] = points + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f' sample_2d={self.sample_2d},' + repr_str += f' data_root={self.sampler_cfg.data_root},' + repr_str += f' info_path={self.sampler_cfg.info_path},' + repr_str += f' rate={self.sampler_cfg.rate},' + repr_str += f' prepare={self.sampler_cfg.prepare},' + repr_str += f' classes={self.sampler_cfg.classes},' + repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' + return repr_str + + +@PIPELINES.register_module() +class ObjectNoise(object): + """Apply noise to each GT objects in the scene. + + Args: + translation_std (list[float], optional): Standard deviation of the + distribution where translation noise are sampled from. + Defaults to [0.25, 0.25, 0.25]. + global_rot_range (list[float], optional): Global rotation to the scene. + Defaults to [0.0, 0.0]. + rot_range (list[float], optional): Object rotation range. + Defaults to [-0.15707963267, 0.15707963267]. + num_try (int, optional): Number of times to try if the noise applied is + invalid. Defaults to 100. + """ + + def __init__(self, + translation_std=[0.25, 0.25, 0.25], + global_rot_range=[0.0, 0.0], + rot_range=[-0.15707963267, 0.15707963267], + num_try=100): + self.translation_std = translation_std + self.global_rot_range = global_rot_range + self.rot_range = rot_range + self.num_try = num_try + + def __call__(self, input_dict): + """Call function to apply noise to each ground truth in the scene. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after adding noise to each object, \ + 'points', 'gt_bboxes_3d' keys are updated in the result dict. 
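+
+        Example of typical usage in a pipeline config (the values shown are
+        simply the defaults of this class)::
+
+            dict(
+                type='ObjectNoise',
+                num_try=100,
+                translation_std=[0.25, 0.25, 0.25],
+                global_rot_range=[0.0, 0.0],
+                rot_range=[-0.15707963267, 0.15707963267])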
+ """ + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + points = input_dict['points'] + + # TODO: check this inplace function + numpy_box = gt_bboxes_3d.tensor.numpy() + numpy_points = points.tensor.numpy() + + noise_per_object_v3_( + numpy_box, + numpy_points, + rotation_perturb=self.rot_range, + center_noise_std=self.translation_std, + global_random_rot_range=self.global_rot_range, + num_try=self.num_try) + + input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) + input_dict['points'] = points.new_point(numpy_points) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_try={self.num_try},' + repr_str += f' translation_std={self.translation_std},' + repr_str += f' global_rot_range={self.global_rot_range},' + repr_str += f' rot_range={self.rot_range})' + return repr_str + + +@PIPELINES.register_module() +class GlobalAlignment(object): + """Apply global alignment to 3D scene points by rotation and translation. + + Args: + rotation_axis (int): Rotation axis for points and bboxes rotation. + + Note: + We do not record the applied rotation and translation as in \ + GlobalRotScaleTrans. Because usually, we do not need to reverse \ + the alignment step. + For example, ScanNet 3D detection task uses aligned ground-truth \ + bounding boxes for evaluation. + """ + + def __init__(self, rotation_axis): + self.rotation_axis = rotation_axis + + def _trans_points(self, input_dict, trans_factor): + """Private function to translate points. + + Args: + input_dict (dict): Result dict from loading pipeline. + trans_factor (np.ndarray): Translation vector to be applied. + + Returns: + dict: Results after translation, 'points' is updated in the dict. + """ + input_dict['points'].translate(trans_factor) + + def _rot_points(self, input_dict, rot_mat): + """Private function to rotate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + rot_mat (np.ndarray): Rotation matrix to be applied. + + Returns: + dict: Results after rotation, 'points' is updated in the dict. + """ + # input should be rot_mat_T so I transpose it here + input_dict['points'].rotate(rot_mat.T) + + def _check_rot_mat(self, rot_mat): + """Check if rotation matrix is valid for self.rotation_axis. + + Args: + rot_mat (np.ndarray): Rotation matrix to be applied. + """ + is_valid = np.allclose(np.linalg.det(rot_mat), 1.0) + valid_array = np.zeros(3) + valid_array[self.rotation_axis] = 1.0 + is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all() + is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all() + assert is_valid, f'invalid rotation matrix {rot_mat}' + + def __call__(self, input_dict): + """Call function to shuffle points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after global alignment, 'points' and keys in \ + input_dict['bbox3d_fields'] are updated in the result dict. 
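+
+        Example of the expected annotation field (illustrative; an identity
+        matrix is a valid 4x4 axis-align transform)::
+
+            input_dict['ann_info']['axis_align_matrix'] = np.eye(4)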
+ """ + assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ + 'axis_align_matrix is not provided in GlobalAlignment' + + axis_align_matrix = input_dict['ann_info']['axis_align_matrix'] + assert axis_align_matrix.shape == (4, 4), \ + f'invalid shape {axis_align_matrix.shape} for axis_align_matrix' + rot_mat = axis_align_matrix[:3, :3] + trans_vec = axis_align_matrix[:3, -1] + + self._check_rot_mat(rot_mat) + self._rot_points(input_dict, rot_mat) + self._trans_points(input_dict, trans_vec) + + return input_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(rotation_axis={self.rotation_axis})' + return repr_str + + +@PIPELINES.register_module() +class GlobalRotScaleTrans(object): + """Apply global rotation, scaling and translation to a 3D scene. + + Args: + rot_range (list[float]): Range of rotation angle. + Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). + scale_ratio_range (list[float]): Range of scale ratio. + Defaults to [0.95, 1.05]. + translation_std (list[float]): The standard deviation of translation + noise. This applies random translation to a scene by a noise, which + is sampled from a gaussian distribution whose standard deviation + is set by ``translation_std``. Defaults to [0, 0, 0] + shift_height (bool): Whether to shift height. + (the fourth dimension of indoor points) when scaling. + Defaults to False. + """ + + def __init__(self, + rot_range=[-0.78539816, 0.78539816], + scale_ratio_range=[0.95, 1.05], + translation_std=[0, 0, 0], + shift_height=False): + seq_types = (list, tuple, np.ndarray) + if not isinstance(rot_range, seq_types): + assert isinstance(rot_range, (int, float)), \ + f'unsupported rot_range type {type(rot_range)}' + rot_range = [-rot_range, rot_range] + self.rot_range = rot_range + + assert isinstance(scale_ratio_range, seq_types), \ + f'unsupported scale_ratio_range type {type(scale_ratio_range)}' + self.scale_ratio_range = scale_ratio_range + + if not isinstance(translation_std, seq_types): + assert isinstance(translation_std, (int, float)), \ + f'unsupported translation_std type {type(translation_std)}' + translation_std = [ + translation_std, translation_std, translation_std + ] + assert all([std >= 0 for std in translation_std]), \ + 'translation_std should be positive' + self.translation_std = translation_std + self.shift_height = shift_height + + def _trans_bbox_points(self, input_dict): + """Private function to translate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after translation, 'points', 'pcd_trans' \ + and keys in input_dict['bbox3d_fields'] are updated \ + in the result dict. + """ + translation_std = np.array(self.translation_std, dtype=np.float32) + trans_factor = np.random.normal(scale=translation_std, size=3).T + + input_dict['points'].translate(trans_factor) + input_dict['pcd_trans'] = trans_factor + for key in input_dict['bbox3d_fields']: + input_dict[key].translate(trans_factor) + + def _rot_bbox_points(self, input_dict): + """Private function to rotate bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after rotation, 'points', 'pcd_rotation' \ + and keys in input_dict['bbox3d_fields'] are updated \ + in the result dict. 
+ """ + rotation = self.rot_range + noise_rotation = np.random.uniform(rotation[0], rotation[1]) + + # if no bbox in input_dict, only rotate points + if len(input_dict['bbox3d_fields']) == 0: + rot_mat_T = input_dict['points'].rotate(noise_rotation) + input_dict['pcd_rotation'] = rot_mat_T + return + + # rotate points with bboxes + for key in input_dict['bbox3d_fields']: + if len(input_dict[key].tensor) != 0: + points, rot_mat_T = input_dict[key].rotate( + noise_rotation, input_dict['points']) + input_dict['points'] = points + input_dict['pcd_rotation'] = rot_mat_T + + def _scale_bbox_points(self, input_dict): + """Private function to scale bounding boxes and points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'points'and keys in \ + input_dict['bbox3d_fields'] are updated in the result dict. + """ + scale = input_dict['pcd_scale_factor'] + points = input_dict['points'] + points.scale(scale) + if self.shift_height: + assert 'height' in points.attribute_dims.keys(), \ + 'setting shift_height=True but points have no height attribute' + points.tensor[:, points.attribute_dims['height']] *= scale + input_dict['points'] = points + + for key in input_dict['bbox3d_fields']: + input_dict[key].scale(scale) + + def _random_scale(self, input_dict): + """Private function to randomly set the scale factor. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'pcd_scale_factor' are updated \ + in the result dict. + """ + scale_factor = np.random.uniform(self.scale_ratio_range[0], + self.scale_ratio_range[1]) + input_dict['pcd_scale_factor'] = scale_factor + + def __call__(self, input_dict): + """Private function to rotate, scale and translate bounding boxes and \ + points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after scaling, 'points', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans' and keys in \ + input_dict['bbox3d_fields'] are updated in the result dict. + """ + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + self._rot_bbox_points(input_dict) + + if 'pcd_scale_factor' not in input_dict: + self._random_scale(input_dict) + self._scale_bbox_points(input_dict) + + self._trans_bbox_points(input_dict) + + input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(rot_range={self.rot_range},' + repr_str += f' scale_ratio_range={self.scale_ratio_range},' + repr_str += f' translation_std={self.translation_std},' + repr_str += f' shift_height={self.shift_height})' + return repr_str + + +@PIPELINES.register_module() +class PointShuffle(object): + """Shuffle input points.""" + + def __call__(self, input_dict): + """Call function to shuffle points. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. 
+ """ + idx = input_dict['points'].shuffle() + idx = idx.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[idx] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[idx] + + return input_dict + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class ObjectRangeFilter(object): + """Filter objects by the range. + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. + """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class PointsRangeFilter(object): + """Filter points by the range. + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. 
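+
+        Example of typical usage in a pipeline config (illustrative range;
+        the six values are [x_min, y_min, z_min, x_max, y_max, z_max])::
+
+            dict(
+                type='PointsRangeFilter',
+                point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0])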
+ """ + points = input_dict['points'] + points_mask = points.in_range_3d(self.pcd_range) + clean_points = points[points_mask] + input_dict['points'] = clean_points + points_mask = points_mask.numpy() + + pts_instance_mask = input_dict.get('pts_instance_mask', None) + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] + + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class ObjectNameFilter(object): + """Filter GT objects by their names. + + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. + """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + + +@PIPELINES.register_module() +class PointSample(object): + """Point sample. + + Sampling data to a certain number. + + Args: + num_points (int): Number of points to be sampled. + sample_range (float, optional): The range where to sample points. + If not None, the points with depth larger than `sample_range` are + prior to be sampled. Defaults to None. + replace (bool, optional): Whether the sampling is with or without + replacement. Defaults to False. + """ + + def __init__(self, num_points, sample_range=None, replace=False): + self.num_points = num_points + self.sample_range = sample_range + self.replace = replace + + def _points_random_sampling(self, + points, + num_samples, + sample_range=None, + replace=False, + return_choices=False): + """Points random sampling. + + Sample points to a certain number. + + Args: + points (np.ndarray | :obj:`BasePoints`): 3D Points. + num_samples (int): Number of samples to be sampled. + sample_range (float, optional): Indicating the range where the + points will be sampled. Defaults to None. + replace (bool, optional): Sampling with or without replacement. + Defaults to None. + return_choices (bool, optional): Whether return choice. + Defaults to False. + Returns: + tuple[np.ndarray] | np.ndarray: + - points (np.ndarray | :obj:`BasePoints`): 3D Points. + - choices (np.ndarray, optional): The generated random samples. 
+ """ + if not replace: + replace = (points.shape[0] < num_samples) + point_range = range(len(points)) + if sample_range is not None and not replace: + # Only sampling the near points when len(points) >= num_samples + depth = np.linalg.norm(points.tensor, axis=1) + far_inds = np.where(depth > sample_range)[0] + near_inds = np.where(depth <= sample_range)[0] + # in case there are too many far points + if len(far_inds) > num_samples: + far_inds = np.random.choice( + far_inds, num_samples, replace=False) + point_range = near_inds + num_samples -= len(far_inds) + choices = np.random.choice(point_range, num_samples, replace=replace) + if sample_range is not None and not replace: + choices = np.concatenate((far_inds, choices)) + # Shuffle points after sampling + np.random.shuffle(choices) + if return_choices: + return points[choices], choices + else: + return points[choices] + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = results['points'] + # Points in Camera coord can provide the depth information. + # TODO: Need to suport distance-based sampling for other coord system. + if self.sample_range is not None: + from mmcv.core.points import CameraPoints + assert isinstance(points, CameraPoints), \ + 'Sampling based on distance is only appliable for CAMERA coord' + points, choices = self._points_random_sampling( + points, + self.num_points, + self.sample_range, + self.replace, + return_choices=True) + results['points'] = points + + pts_instance_mask = results.get('pts_instance_mask', None) + pts_semantic_mask = results.get('pts_semantic_mask', None) + + if pts_instance_mask is not None: + pts_instance_mask = pts_instance_mask[choices] + results['pts_instance_mask'] = pts_instance_mask + + if pts_semantic_mask is not None: + pts_semantic_mask = pts_semantic_mask[choices] + results['pts_semantic_mask'] = pts_semantic_mask + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_points={self.num_points},' + repr_str += f' sample_range={self.sample_range},' + repr_str += f' replace={self.replace})' + + return repr_str + + +@PIPELINES.register_module() +class IndoorPointSample(PointSample): + """Indoor point sample. + + Sampling data to a certain number. + NOTE: IndoorPointSample is deprecated in favor of PointSample + + Args: + num_points (int): Number of points to be sampled. + """ + + def __init__(self, *args, **kwargs): + warnings.warn( + 'IndoorPointSample is deprecated in favor of PointSample') + super(IndoorPointSample, self).__init__(*args, **kwargs) + + +@PIPELINES.register_module() +class IndoorPatchPointSample(object): + r"""Indoor point sample within a patch. Modified from `PointNet++ `_. + + Sampling data to a certain number for semantic segmentation. + + Args: + num_points (int): Number of points to be sampled. + block_size (float, optional): Size of a block to sample points from. + Defaults to 1.5. + sample_rate (float, optional): Stride used in sliding patch generation. + This parameter is unused in `IndoorPatchPointSample` and thus has + been deprecated. We plan to remove it in the future. + Defaults to None. + ignore_index (int, optional): Label index that won't be used for the + segmentation task. 
This is set in PointSegClassMapping as neg_cls. + If not None, will be used as a patch selection criterion. + Defaults to None. + use_normalized_coord (bool, optional): Whether to use normalized xyz as + additional features. Defaults to False. + num_try (int, optional): Number of times to try if the patch selected + is invalid. Defaults to 10. + enlarge_size (float | None, optional): Enlarge the sampled patch to + [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as + an augmentation. If None, set it as 0. Defaults to 0.2. + min_unique_num (int | None, optional): Minimum number of unique points + the sampled patch should contain. If None, use PointNet++'s method + to judge uniqueness. Defaults to None. + eps (float, optional): A value added to patch boundary to guarantee + points coverage. Defaults to 1e-2. + + Note: + This transform should only be used in the training process of point + cloud segmentation tasks. For the sliding patch generation and + inference process in testing, please refer to the `slide_inference` + function of `EncoderDecoder3D` class. + """ + + def __init__(self, + num_points, + block_size=1.5, + sample_rate=None, + ignore_index=None, + use_normalized_coord=False, + num_try=10, + enlarge_size=0.2, + min_unique_num=None, + eps=1e-2): + self.num_points = num_points + self.block_size = block_size + self.ignore_index = ignore_index + self.use_normalized_coord = use_normalized_coord + self.num_try = num_try + self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0 + self.min_unique_num = min_unique_num + self.eps = eps + + if sample_rate is not None: + warnings.warn( + "'sample_rate' has been deprecated and will be removed in " + 'the future. Please remove them from your code.') + + def _input_generation(self, coords, patch_center, coord_max, attributes, + attribute_dims, point_type): + """Generating model input. + + Generate input by subtracting patch center and adding additional \ + features. Currently support colors and normalized xyz as features. + + Args: + coords (np.ndarray): Sampled 3D Points. + patch_center (np.ndarray): Center coordinate of the selected patch. + coord_max (np.ndarray): Max coordinate of all 3D Points. + attributes (np.ndarray): features of input points. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. + point_type (type): class of input points inherited from BasePoints. + + Returns: + :obj:`BasePoints`: The generated input data. + """ + # subtract patch center, the z dimension is not centered + centered_coords = coords.copy() + centered_coords[:, 0] -= patch_center[0] + centered_coords[:, 1] -= patch_center[1] + + if self.use_normalized_coord: + normalized_coord = coords / coord_max + attributes = np.concatenate([attributes, normalized_coord], axis=1) + if attribute_dims is None: + attribute_dims = dict() + attribute_dims.update( + dict(normalized_coord=[ + attributes.shape[1], attributes.shape[1] + + 1, attributes.shape[1] + 2 + ])) + + points = np.concatenate([centered_coords, attributes], axis=1) + points = point_type( + points, points_dim=points.shape[1], attribute_dims=attribute_dims) + + return points + + def _patch_points_sampling(self, points, sem_mask): + """Patch points sampling. + + First sample a valid patch. + Then sample points within that patch to a certain number. + + Args: + points (:obj:`BasePoints`): 3D Points. + sem_mask (np.ndarray): semantic segmentation mask for input points. 
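# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# A compact view of the feature construction done by _input_generation above: the patch's
# x/y coordinates are re-centred on the patch centre (z is left untouched) and, optionally,
# the coordinates normalised by the scene extent are appended as three extra per-point
# features. The point counts and attribute layout below are assumptions.
import numpy as np

coords = np.random.rand(2048, 3) * np.array([10.0, 8.0, 3.0])   # fake room-scale points
colors = np.random.rand(2048, 3)                                 # fake per-point RGB attributes
patch_center = np.array([5.0, 4.0, 0.0])
coord_max = coords.max(axis=0)

centered = coords.copy()
centered[:, :2] -= patch_center[:2]                              # centre x/y only
features = np.concatenate([centered, colors, coords / coord_max], axis=1)
print(features.shape)                                            # (2048, 9)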
+ + Returns: + tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: + + - points (:obj:`BasePoints`): 3D Points. + - choices (np.ndarray): The generated random samples. + """ + coords = points.coord.numpy() + attributes = points.tensor[:, 3:].numpy() + attribute_dims = points.attribute_dims + point_type = type(points) + + coord_max = np.amax(coords, axis=0) + coord_min = np.amin(coords, axis=0) + + for _ in range(self.num_try): + # random sample a point as patch center + cur_center = coords[np.random.choice(coords.shape[0])] + + # boundary of a patch, which would be enlarged by + # `self.enlarge_size` as an augmentation + cur_max = cur_center + np.array( + [self.block_size / 2.0, self.block_size / 2.0, 0.0]) + cur_min = cur_center - np.array( + [self.block_size / 2.0, self.block_size / 2.0, 0.0]) + cur_max[2] = coord_max[2] + cur_min[2] = coord_min[2] + cur_choice = np.sum( + (coords >= (cur_min - self.enlarge_size)) * + (coords <= (cur_max + self.enlarge_size)), + axis=1) == 3 + + if not cur_choice.any(): # no points in this patch + continue + + cur_coords = coords[cur_choice, :] + cur_sem_mask = sem_mask[cur_choice] + point_idxs = np.where(cur_choice)[0] + mask = np.sum( + (cur_coords >= (cur_min - self.eps)) * (cur_coords <= + (cur_max + self.eps)), + axis=1) == 3 + + # two criteria for patch sampling, adopted from PointNet++ + # 1. selected patch should contain enough unique points + if self.min_unique_num is None: + # use PointNet++'s method as default + # [31, 31, 62] are just some big values used to transform + # coords from 3d array to 1d and then check their uniqueness + # this is used in all the ScanNet code following PointNet++ + vidx = np.ceil( + (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) * + np.array([31.0, 31.0, 62.0])) + vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + + vidx[:, 2]) + flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02 + else: + # if `min_unique_num` is provided, directly compare with it + flag1 = mask.sum() >= self.min_unique_num + + # 2. selected patch should contain enough annotated points + if self.ignore_index is None: + flag2 = True + else: + flag2 = np.sum(cur_sem_mask != self.ignore_index) / \ + len(cur_sem_mask) >= 0.7 + + if flag1 and flag2: + break + + # sample idx to `self.num_points` + if point_idxs.size >= self.num_points: + # no duplicate in sub-sampling + choices = np.random.choice( + point_idxs, self.num_points, replace=False) + else: + # do not use random choice here to avoid some points not counted + dup = np.random.choice(point_idxs.size, + self.num_points - point_idxs.size) + idx_dup = np.concatenate( + [np.arange(point_idxs.size), + np.array(dup)], 0) + choices = point_idxs[idx_dup] + + # construct model input + points = self._input_generation(coords[choices], cur_center, coord_max, + attributes[choices], attribute_dims, + point_type) + + return points, choices + + def __call__(self, results): + """Call function to sample points to in indoor scenes. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. 
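# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The PointNet++-style "enough unique points" test used in _patch_points_sampling above,
# in isolation: coordinates inside the candidate patch are quantised onto a 31 x 31 x 62
# grid, hashed to one integer per voxel, and the patch is accepted when at least ~2% of the
# grid cells are occupied. The synthetic inputs here are assumptions for illustration.
import numpy as np

def patch_is_unique_enough(cur_coords, cur_min, cur_max, ratio=0.02):
    grid = np.array([31.0, 31.0, 62.0])
    vidx = np.ceil((cur_coords - cur_min) / (cur_max - cur_min) * grid)
    hashed = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + vidx[:, 2])
    return len(hashed) / (31.0 * 31.0 * 62.0) >= ratio

coords = np.random.rand(4096, 3)                  # fake points filling a unit patch
print(patch_is_unique_enough(coords, coords.min(0), coords.max(0)))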
+ """ + points = results['points'] + + assert 'pts_semantic_mask' in results.keys(), \ + 'semantic mask should be provided in training and evaluation' + pts_semantic_mask = results['pts_semantic_mask'] + + points, choices = self._patch_points_sampling(points, + pts_semantic_mask) + + results['points'] = points + results['pts_semantic_mask'] = pts_semantic_mask[choices] + pts_instance_mask = results.get('pts_instance_mask', None) + if pts_instance_mask is not None: + results['pts_instance_mask'] = pts_instance_mask[choices] + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(num_points={self.num_points},' + repr_str += f' block_size={self.block_size},' + repr_str += f' ignore_index={self.ignore_index},' + repr_str += f' use_normalized_coord={self.use_normalized_coord},' + repr_str += f' num_try={self.num_try},' + repr_str += f' enlarge_size={self.enlarge_size},' + repr_str += f' min_unique_num={self.min_unique_num},' + repr_str += f' eps={self.eps})' + return repr_str + + +@PIPELINES.register_module() +class BackgroundPointsFilter(object): + """Filter background points near the bounding box. + + Args: + bbox_enlarge_range (tuple[float], float): Bbox enlarge range. + """ + + def __init__(self, bbox_enlarge_range): + assert (is_tuple_of(bbox_enlarge_range, float) + and len(bbox_enlarge_range) == 3) \ + or isinstance(bbox_enlarge_range, float), \ + f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}' + + if isinstance(bbox_enlarge_range, float): + bbox_enlarge_range = [bbox_enlarge_range] * 3 + self.bbox_enlarge_range = np.array( + bbox_enlarge_range, dtype=np.float32)[np.newaxis, :] + + def __call__(self, input_dict): + """Call function to filter points by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = input_dict['points'] + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + + # avoid groundtruth being modified + gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy() + gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy() + + enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy() + enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range + points_numpy = points.tensor.clone().numpy() + foreground_masks = box_np_ops.points_in_rbbox( + points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5)) + enlarge_foreground_masks = box_np_ops.points_in_rbbox( + points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5)) + foreground_masks = foreground_masks.max(1) + enlarge_foreground_masks = enlarge_foreground_masks.max(1) + valid_masks = ~np.logical_and(~foreground_masks, + enlarge_foreground_masks) + + input_dict['points'] = points[valid_masks] + pts_instance_mask = input_dict.get('pts_instance_mask', None) + if pts_instance_mask is not None: + input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks] + + pts_semantic_mask = input_dict.get('pts_semantic_mask', None) + if pts_semantic_mask is not None: + input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks] + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class VoxelBasedPointSampler(object): + """Voxel based point sampler. 
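# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The keep/drop rule used by BackgroundPointsFilter above, reduced to plain boolean masks
# (no real boxes): a point is removed only when it lies outside every original GT box but
# inside at least one enlarged box, i.e. it hugs a box boundary and would otherwise be an
# ambiguous "background" point. The mask values below are synthetic.
import numpy as np

foreground = np.array([True, False, False, True])      # inside an original GT box
enlarged_fg = np.array([True, True, False, True])      # inside an enlarged GT box

valid = ~np.logical_and(~foreground, enlarged_fg)
print(valid)                                           # [ True False  True  True]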
+ + Apply voxel sampling to multiple sweep points. + + Args: + cur_sweep_cfg (dict): Config for sampling current points. + prev_sweep_cfg (dict): Config for sampling previous points. + time_dim (int): Index that indicate the time dimention + for input points. + """ + + def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): + self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) + self.cur_voxel_num = self.cur_voxel_generator._max_voxels + self.time_dim = time_dim + if prev_sweep_cfg is not None: + assert prev_sweep_cfg['max_num_points'] == \ + cur_sweep_cfg['max_num_points'] + self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) + self.prev_voxel_num = self.prev_voxel_generator._max_voxels + else: + self.prev_voxel_generator = None + self.prev_voxel_num = 0 + + def _sample_points(self, points, sampler, point_dim): + """Sample points for each points subset. + + Args: + points (np.ndarray): Points subset to be sampled. + sampler (VoxelGenerator): Voxel based sampler for + each points subset. + point_dim (int): The dimention of each points + + Returns: + np.ndarray: Sampled points. + """ + voxels, coors, num_points_per_voxel = sampler.generate(points) + if voxels.shape[0] < sampler._max_voxels: + padding_points = np.zeros([ + sampler._max_voxels - voxels.shape[0], sampler._max_num_points, + point_dim + ], + dtype=points.dtype) + padding_points[:] = voxels[0] + sample_points = np.concatenate([voxels, padding_points], axis=0) + else: + sample_points = voxels + + return sample_points + + def __call__(self, results): + """Call function to sample points from multiple sweeps. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after sampling, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. + """ + points = results['points'] + original_dim = points.shape[1] + + # TODO: process instance and semantic mask while _max_num_points + # is larger than 1 + # Extend points with seg and mask fields + map_fields2dim = [] + start_dim = original_dim + points_numpy = points.tensor.numpy() + extra_channel = [points_numpy] + for idx, key in enumerate(results['pts_mask_fields']): + map_fields2dim.append((key, idx + start_dim)) + extra_channel.append(results[key][..., None]) + + start_dim += len(results['pts_mask_fields']) + for idx, key in enumerate(results['pts_seg_fields']): + map_fields2dim.append((key, idx + start_dim)) + extra_channel.append(results[key][..., None]) + + points_numpy = np.concatenate(extra_channel, axis=-1) + + # Split points into two part, current sweep points and + # previous sweeps points. + # TODO: support different sampling methods for next sweeps points + # and previous sweeps points. 
+ cur_points_flag = (points_numpy[:, self.time_dim] == 0) + cur_sweep_points = points_numpy[cur_points_flag] + prev_sweeps_points = points_numpy[~cur_points_flag] + if prev_sweeps_points.shape[0] == 0: + prev_sweeps_points = cur_sweep_points + + # Shuffle points before sampling + np.random.shuffle(cur_sweep_points) + np.random.shuffle(prev_sweeps_points) + + cur_sweep_points = self._sample_points(cur_sweep_points, + self.cur_voxel_generator, + points_numpy.shape[1]) + if self.prev_voxel_generator is not None: + prev_sweeps_points = self._sample_points(prev_sweeps_points, + self.prev_voxel_generator, + points_numpy.shape[1]) + + points_numpy = np.concatenate( + [cur_sweep_points, prev_sweeps_points], 0) + else: + points_numpy = cur_sweep_points + + if self.cur_voxel_generator._max_num_points == 1: + points_numpy = points_numpy.squeeze(1) + results['points'] = points.new_point(points_numpy[..., :original_dim]) + + # Restore the correspoinding seg and mask fields + for key, dim_index in map_fields2dim: + results[key] = points_numpy[..., dim_index] + + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + + def _auto_indent(repr_str, indent): + repr_str = repr_str.split('\n') + repr_str = [' ' * indent + t + '\n' for t in repr_str] + repr_str = ''.join(repr_str)[:-1] + return repr_str + + repr_str = self.__class__.__name__ + indent = 4 + repr_str += '(\n' + repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' + repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' + repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' + repr_str += ' ' * indent + 'cur_voxel_generator=\n' + repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' + repr_str += ' ' * indent + 'prev_voxel_generator=\n' + repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' + return repr_str + +@PIPELINES.register_module() +class PadMultiViewImage(object): + """Pad the multi-view image. + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. + """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = [impad( + img, shape=self.size, pad_val=self.pad_val) for img in results['img']] + elif self.size_divisor is not None: + padded_img = [impad_to_multiple( + img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] + + results['ori_shape'] = [img.shape for img in results['img']] + results['img'] = padded_img + results['img_shape'] = [img.shape for img in padded_img] + results['pad_shape'] = [img.shape for img in padded_img] + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
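# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The "pad to a size divisible by N" behaviour that PadMultiViewImage delegates to
# impad_to_multiple above, rewritten with np.pad so it is self-contained. The 900 x 1600
# camera resolution and divisor of 32 are assumptions for illustration.
import numpy as np

def pad_to_multiple(img, divisor, pad_val=0):
    h, w = img.shape[:2]
    target_h = int(np.ceil(h / divisor)) * divisor
    target_w = int(np.ceil(w / divisor)) * divisor
    pad_width = ((0, target_h - h), (0, target_w - w), (0, 0))   # pad bottom/right only
    return np.pad(img, pad_width, mode='constant', constant_values=pad_val)

imgs = [np.zeros((900, 1600, 3), dtype=np.float32) for _ in range(6)]  # six camera views
padded = [pad_to_multiple(img, divisor=32) for img in imgs]
print(padded[0].shape)                                                 # (928, 1600, 3)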
+ """ + self._pad_img(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, ' + repr_str += f'size_divisor={self.size_divisor}, ' + repr_str += f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class NormalizeMultiviewImage(object): + """Normalize the image. + Added key is "img_norm_cfg". + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + + def __call__(self, results): + """Call function to normalize images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + + results['img'] = [imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']] + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class PhotoMetricDistortionMultiViewImage: + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + 8. randomly swap channels + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. + """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def __call__(self, results): + """Call function to perform photometric distortion on images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Result dict with images distorted. 
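# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The per-channel normalisation applied by NormalizeMultiviewImage above, written with plain
# numpy instead of mmcv's imnormalize. The mean/std values are the common ImageNet statistics
# and are only an assumption here, not necessarily the repo's configured values.
import numpy as np

def normalize(img_bgr, mean, std, to_rgb=True):
    img = img_bgr.astype(np.float32)
    if to_rgb:
        img = img[..., ::-1]                        # BGR -> RGB before normalising
    return (img - mean) / std

mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
std = np.array([58.395, 57.12, 57.375], dtype=np.float32)

img = np.random.randint(0, 256, (900, 1600, 3), dtype=np.uint8)
out = normalize(img, mean, std)
print(out.shape, out.dtype)                         # (900, 1600, 3) float32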
+ """ + imgs = results['img'] + new_imgs = [] + for img in imgs: + assert img.dtype == np.float32, \ + 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ + ' please set "to_float32=True" in "LoadImageFromFile" pipeline' + # random brightness + if random.randint(2): + delta = random.uniform(-self.brightness_delta, + self.brightness_delta) + img += delta + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # convert color from BGR to HSV + img = bgr2hsv(img) + + # random saturation + if random.randint(2): + img[..., 1] *= random.uniform(self.saturation_lower, + self.saturation_upper) + + # random hue + if random.randint(2): + img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) + img[..., 0][img[..., 0] > 360] -= 360 + img[..., 0][img[..., 0] < 0] += 360 + + # convert color from HSV to BGR + img = hsv2bgr(img) + + # random contrast + if mode == 0: + if random.randint(2): + alpha = random.uniform(self.contrast_lower, + self.contrast_upper) + img *= alpha + + # randomly swap channels + if random.randint(2): + img = img[..., random.permutation(3)] + new_imgs.append(img) + results['img'] = new_imgs + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' + repr_str += 'contrast_range=' + repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' + repr_str += 'saturation_range=' + repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' + repr_str += f'hue_delta={self.hue_delta})' + return repr_str + + + +@PIPELINES.register_module() +class CustomCollect3D(object): + """Collect data from the loader relevant to the specific task. + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. 
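# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The "each distortion fires with probability 0.5" pattern used by
# PhotoMetricDistortionMultiViewImage above, reduced to brightness and contrast only.
# numpy.random stands in for the pipeline's random module; parameter values mirror the
# class defaults but the toy image is an assumption.
import numpy as np

def distort(img, brightness_delta=32, contrast_range=(0.5, 1.5)):
    img = img.astype(np.float32)
    if np.random.randint(2):                        # random brightness, 50% of the time
        img += np.random.uniform(-brightness_delta, brightness_delta)
    if np.random.randint(2):                        # random contrast, 50% of the time
        img *= np.random.uniform(*contrast_range)
    return img

img = np.random.randint(0, 256, (8, 8, 3)).astype(np.float32)
print(distort(img).shape)                           # (8, 8, 3)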
+ meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', + 'scale_factor', 'flip', 'pcd_horizontal_flip', + 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', + 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', + 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', + 'transformation_3d_flow', 'scene_token', + 'can_bus','folder','frame_idx' + )): + # TODO(yzj) bevformer meta_keys has lidar2cam + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + Args: + results (dict): Result dict contains the data to collect. + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + + data = {} + img_metas = {} + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data['img_metas'] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' + + + +@PIPELINES.register_module() +class RandomScaleImageMultiViewImage(object): + """Random scale the image + Args: + scales + """ + + def __init__(self, scales=[]): + self.scales = scales + assert len(self.scales)==1 + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. + """ + rand_ind = np.random.permutation(range(len(self.scales)))[0] + rand_scale = self.scales[rand_ind] + + y_size = [int(img.shape[0] * rand_scale) for img in results['img']] + x_size = [int(img.shape[1] * rand_scale) for img in results['img']] + scale_factor = np.eye(4) + scale_factor[0, 0] *= rand_scale + scale_factor[1, 1] *= rand_scale + results['img'] = [imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in + enumerate(results['img'])] + lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] + results['lidar2img'] = lidar2img + results['img_shape'] = [img.shape for img in results['img']] + results['ori_shape'] = [img.shape for img in results['img']] + + return results + + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.scales}, ' + return repr_str + +@PIPELINES.register_module() +class ObjectRangeFilterTrack(object): + """Filter objects by the range. + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. 
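# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# Why RandomScaleImageMultiViewImage above pre-multiplies lidar2img by a scale matrix:
# resizing an image by factor `s` scales pixel coordinates by `s`, so the same 4x4 scaling
# must be folded into the lidar -> image projection to keep geometry consistent. The
# matrices below are synthetic stand-ins.
import numpy as np

rand_scale = 0.5
scale_factor = np.eye(4)
scale_factor[0, 0] *= rand_scale
scale_factor[1, 1] *= rand_scale

lidar2img = np.eye(4)
lidar2img[0, 3], lidar2img[1, 3] = 800.0, 450.0     # fake principal-point offsets

pt = np.array([10.0, 5.0, 1.0, 1.0])                # homogeneous lidar point
full_res = lidar2img @ pt
half_res = (scale_factor @ lidar2img) @ pt
print(full_res[:2], half_res[:2])                   # half-resolution pixels are 0.5x full-resolution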
+ """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + if 'gt_inds' in input_dict['ann_info'].keys(): + input_dict['gt_inds'] = input_dict['ann_info']['gt_inds'] + if 'gt_fut_traj' in input_dict['ann_info'].keys(): + input_dict['gt_fut_traj'] = input_dict['ann_info']['gt_fut_traj'] + if 'gt_fut_traj_mask' in input_dict['ann_info'].keys(): + input_dict['gt_fut_traj_mask'] = input_dict['ann_info']['gt_fut_traj_mask'] + if 'gt_past_traj' in input_dict['ann_info'].keys(): + input_dict['gt_past_traj'] = input_dict['ann_info']['gt_past_traj'] + if 'gt_past_traj_mask' in input_dict['ann_info'].keys(): + input_dict['gt_past_traj_mask'] = input_dict['ann_info']['gt_past_traj_mask'] + if 'gt_sdc_bbox' in input_dict['ann_info'].keys(): + input_dict['gt_sdc_bbox'] = input_dict['ann_info']['gt_sdc_bbox'] + input_dict['gt_sdc_label'] = input_dict['ann_info']['gt_sdc_label'] + input_dict['gt_sdc_fut_traj'] = input_dict['ann_info']['gt_sdc_fut_traj'] + input_dict['gt_sdc_fut_traj_mask'] = input_dict['ann_info']['gt_sdc_fut_traj_mask'] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + gt_inds = input_dict['gt_inds'] + gt_fut_traj = input_dict['gt_fut_traj'] + gt_fut_traj_mask = input_dict['gt_fut_traj_mask'] + gt_past_traj = input_dict['gt_past_traj'] + gt_past_traj_mask = input_dict['gt_past_traj_mask'] + + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + mask = mask.numpy().astype(np.bool) + gt_labels_3d = gt_labels_3d[mask] + gt_inds = gt_inds[mask] + gt_fut_traj = gt_fut_traj[mask] + gt_fut_traj_mask = gt_fut_traj_mask[mask] + gt_past_traj = gt_past_traj[mask] + gt_past_traj_mask = gt_past_traj_mask[mask] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + input_dict['gt_inds'] = gt_inds + input_dict['gt_fut_traj'] = gt_fut_traj + input_dict['gt_fut_traj_mask'] = gt_fut_traj_mask + input_dict['gt_past_traj'] = gt_past_traj + input_dict['gt_past_traj_mask'] = gt_past_traj_mask + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + +@PIPELINES.register_module() +class ObjectNameFilterTrack(object): + """Filter GT objects by their names. + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + Args: + input_dict (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. 
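# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The shared-mask pattern used by ObjectRangeFilterTrack above: one BEV range mask is
# computed from the boxes and then applied to every aligned per-object array (labels, track
# ids, future/past trajectories) so they stay in correspondence. The arrays here are
# synthetic, and the mask stands in for the result of in_range_bev(...) converted to numpy.
import numpy as np

num_obj = 5
mask = np.array([True, False, True, True, False])   # stand-in for the in-range mask

gt_labels_3d = np.arange(num_obj)
gt_inds = np.arange(100, 100 + num_obj)
gt_fut_traj = np.random.rand(num_obj, 6, 2)         # (objects, timesteps, xy)

gt_labels_3d, gt_inds, gt_fut_traj = gt_labels_3d[mask], gt_inds[mask], gt_fut_traj[mask]
print(gt_labels_3d.shape, gt_inds.shape, gt_fut_traj.shape)   # (3,) (3,) (3, 6, 2)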
+ """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + input_dict['gt_inds'] = input_dict['gt_inds'][gt_bboxes_mask] + input_dict['gt_fut_traj'] = input_dict['gt_fut_traj'][gt_bboxes_mask] + input_dict['gt_fut_traj_mask'] = input_dict['gt_fut_traj_mask'][gt_bboxes_mask] + input_dict['gt_past_traj'] = input_dict['gt_past_traj'][gt_bboxes_mask] + input_dict['gt_past_traj_mask'] = input_dict['gt_past_traj_mask'][gt_bboxes_mask] + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + +@PIPELINES.register_module() +class CustomObjectRangeFilter(ObjectRangeFilter): + def __call__(self, results): + """Call function to filter objects by the range. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. + """ + # Check points instance type and initialise bev_range + if isinstance(results['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(results['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = results['gt_bboxes_3d'] + gt_labels_3d = results['gt_labels_3d'] + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + results['gt_bboxes_3d'] = gt_bboxes_3d + results['gt_labels_3d'] = gt_labels_3d + # results['ann_tokens'] = results['ann_tokens'][mask.numpy().astype(np.bool)] + + return results + +@PIPELINES.register_module() +class CustomObjectNameFilter(ObjectNameFilter): + def __call__(self, results): + """Call function to filter objects by their names. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' + keys are updated in the result dict. + """ + gt_labels_3d = results['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + results['gt_bboxes_3d'] = results['gt_bboxes_3d'][gt_bboxes_mask] + results['gt_labels_3d'] = results['gt_labels_3d'][gt_bboxes_mask] + # results['ann_tokens'] = results['ann_tokens'][gt_bboxes_mask] + + return results + + +@PIPELINES.register_module() +class VADObjectRangeFilter(object): + """Filter objects by the range, and also filter corresponding fut trajs + + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, input_dict): + """Call function to filter objects by the range. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. 
+ """ + # Check points instance type and initialise bev_range + if isinstance(input_dict['gt_bboxes_3d'], + (LiDARInstance3DBoxes, DepthInstance3DBoxes)): + bev_range = self.pcd_range[[0, 1, 3, 4]] + elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): + bev_range = self.pcd_range[[0, 2, 3, 5]] + + gt_bboxes_3d = input_dict['gt_bboxes_3d'] + gt_labels_3d = input_dict['gt_labels_3d'] + + + mask = gt_bboxes_3d.in_range_bev(bev_range) + gt_bboxes_3d = gt_bboxes_3d[mask] + # mask is a torch tensor but gt_labels_3d is still numpy array + # using mask to index gt_labels_3d will cause bug when + # len(gt_labels_3d) == 1, where mask=1 will be interpreted + # as gt_labels_3d[1] and cause out of index error + gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] + + + # limit rad to [-pi, pi] + gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) + input_dict['gt_bboxes_3d'] = gt_bboxes_3d + input_dict['gt_labels_3d'] = gt_labels_3d + + if 'attr_labels' in input_dict: + gt_attr_labels = input_dict['attr_labels'] + gt_attr_labels = gt_attr_labels[mask.numpy().astype(np.bool)] + input_dict['gt_attr_labels'] = gt_attr_labels + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' + return repr_str + + +@PIPELINES.register_module() +class VADObjectNameFilter(object): + """Filter GT objects by their names, , and also filter corresponding fut trajs + + Args: + classes (list[str]): List of class names to be kept for training. + """ + + def __init__(self, classes): + self.classes = classes + self.labels = list(range(len(self.classes))) + + def __call__(self, input_dict): + """Call function to filter objects by their names. + + Args: + input_dict (dict): Result dict from loading pipeline. + + Returns: + dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \ + keys are updated in the result dict. + """ + gt_labels_3d = input_dict['gt_labels_3d'] + gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], + dtype=np.bool_) + input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] + input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] + if 'gt_attr_labels' in input_dict: + input_dict['gt_attr_labels'] = input_dict['gt_attr_labels'][gt_bboxes_mask] + + return input_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f'(classes={self.classes})' + return repr_str + +@PIPELINES.register_module() +class CustomPointsRangeFilter: + """Filter points by the range. + Args: + point_cloud_range (list[float]): Point cloud range. + """ + + def __init__(self, point_cloud_range): + self.pcd_range = np.array(point_cloud_range, dtype=np.float32) + + def __call__(self, data): + """Call function to filter points by the range. + Args: + data (dict): Result dict from loading pipeline. + Returns: + dict: Results after filtering, 'points', 'pts_instance_mask' \ + and 'pts_semantic_mask' keys are updated in the result dict. 
+ """ + points = data["points"] + points_mask = points.in_range_3d(self.pcd_range) + clean_points = points[points_mask] + data["points"] = clean_points + return data \ No newline at end of file diff --git a/mmcv/datasets/prepare_B2D.py b/mmcv/datasets/prepare_B2D.py new file mode 100644 index 0000000..a9b2f33 --- /dev/null +++ b/mmcv/datasets/prepare_B2D.py @@ -0,0 +1,401 @@ +import os +from os.path import join +import gzip, json, pickle +import numpy as np +from pyquaternion import Quaternion +from tqdm import tqdm +from vis_utils import calculate_cube_vertices,calculate_occlusion_stats,edges,DIS_CAR_SAVE +import cv2 +import multiprocessing +import argparse +# All data in the Bench2Drive dataset are in the left-handed coordinate system. +# This code converts all coordinate systems (world coordinate system, vehicle coordinate system, +# camera coordinate system, and lidar coordinate system) to the right-handed coordinate system +# consistent with the nuscenes dataset. + +DATAROOT = '../../data/bench2drive' +MAP_ROOT = '../../data/bench2drive/maps' +OUT_DIR = '../../data/infos' + +MAX_DISTANCE = 75 # Filter bounding boxes that are too far from the vehicle +FILTER_Z_SHRESHOLD = 10 # Filter bounding boxes that are too high/low from the vehicle +FILTER_INVISINLE = True # Filter bounding boxes based on visibility +NUM_VISIBLE_SHRESHOLD = 1 # Filter bounding boxes with fewer visible vertices than this value +NUM_OUTPOINT_SHRESHOLD = 7 # Filter bounding boxes where the number of vertices outside the frame is greater than this value in all cameras +CAMERAS = ['CAM_FRONT', 'CAM_FRONT_LEFT', 'CAM_FRONT_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT'] +CAMERA_TO_FOLDER_MAP = {'CAM_FRONT':'rgb_front', 'CAM_FRONT_LEFT':'rgb_front_left', 'CAM_FRONT_RIGHT':'rgb_front_right', 'CAM_BACK':'rgb_back', 'CAM_BACK_LEFT':'rgb_back_left', 'CAM_BACK_RIGHT':'rgb_back_right'} + + +stand_to_ue4_rotate = np.array([[ 0, 0, 1, 0], + [ 1, 0, 0, 0], + [ 0,-1, 0, 0], + [ 0, 0, 0, 1]]) + + + +lidar_to_righthand_ego = np.array([[ 0, 1, 0, 0], + [ -1, 0, 0, 0], + [ 0, 0, 1, 0], + [ 0, 0, 0, 1]]) + +lefthand_ego_to_lidar = np.array([[ 0, 1, 0, 0], + [ 1, 0, 0, 0], + [ 0, 0, 1, 0], + [ 0, 0, 0, 1]]) + + + +left2right = np.eye(4) +left2right[1,1] = -1 + +def apply_trans(vec,world2ego): + vec = np.concatenate((vec,np.array([1]))) + t = world2ego @ vec + return t[0:3] + +def get_pose_matrix(dic): + new_matrix = np.zeros((4,4)) + new_matrix[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=dic['theta']-np.pi/2).rotation_matrix + new_matrix[0,3] = dic['x'] + new_matrix[1,3] = dic['y'] + new_matrix[3,3] = 1 + return new_matrix + +def get_npc2world(npc): + for key in ['world2vehicle','world2ego','world2sign','world2ped']: + if key in npc.keys(): + npc2world = np.linalg.inv(np.array(npc[key])) + yaw_from_matrix = np.arctan2(npc2world[1,0], npc2world[0,0]) + yaw = npc['rotation'][-1]/180*np.pi + if abs(yaw-yaw_from_matrix)> 0.01: + npc2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=yaw).rotation_matrix + npc2world = left2right@npc2world@left2right + return npc2world + npc2world = np.eye(4) + npc2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=npc['rotation'][-1]/180*np.pi).rotation_matrix + npc2world[0:3,3] = np.array(npc['location']) + return left2right@npc2world@left2right + + +def get_global_trigger_vertex(center,extent,yaw_in_degree): + x,y = center[0],-center[1] + dx,dy = extent[0],extent[1] + yaw_in_radians = -yaw_in_degree/180*np.pi + vertex_in_self = np.array([[dx,dy], + [-dx,dy], + [-dx,-dy], + [dx,-dy]]) + 
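# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The handedness conversion used throughout prepare_B2D.py above, in isolation: CARLA/UE4
# poses are left-handed, and conjugating a 4x4 transform with left2right = diag(1, -1, 1, 1)
# yields the equivalent right-handed, nuScenes-style transform. The pose below is a made-up
# example used only to show the effect on the translation.
import numpy as np

left2right = np.eye(4)
left2right[1, 1] = -1

def to_right_handed(T_lefthand):
    # flip the y axis of both the source and the target frame of the transform
    return left2right @ T_lefthand @ left2right

world2ego_lh = np.eye(4)
world2ego_lh[:3, 3] = [12.0, 3.0, 0.5]              # translation in the left-handed world
world2ego_rh = to_right_handed(world2ego_lh)
print(world2ego_rh[:3, 3])                          # [12. -3.  0.5] -- the y component is negated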
rotate_matrix = np.array([[np.cos(yaw_in_radians),-np.sin(yaw_in_radians)], + [np.sin(yaw_in_radians), np.cos(yaw_in_radians)]]) + rotated_vertex = (rotate_matrix @ vertex_in_self.T).T + vertex_in_global = np.array([[x,y]]).repeat(4,axis=0) + rotated_vertex + return vertex_in_global + + + +def get_image_point(loc, K, w2c): + point = np.array([loc[0], loc[1], loc[2], 1]) + point_camera = np.dot(w2c, point) + point_camera = point_camera[0:3] + depth = point_camera[2] + point_img = np.dot(K, point_camera) + point_img[0] /= point_img[2] + point_img[1] /= point_img[2] + return point_img[0:2], depth + +def get_action(index): + Discrete_Actions_DICT = { + 0: (0, 0, 1, False), + 1: (0.7, -0.5, 0, False), + 2: (0.7, -0.3, 0, False), + 3: (0.7, -0.2, 0, False), + 4: (0.7, -0.1, 0, False), + 5: (0.7, 0, 0, False), + 6: (0.7, 0.1, 0, False), + 7: (0.7, 0.2, 0, False), + 8: (0.7, 0.3, 0, False), + 9: (0.7, 0.5, 0, False), + 10: (0.3, -0.7, 0, False), + 11: (0.3, -0.5, 0, False), + 12: (0.3, -0.3, 0, False), + 13: (0.3, -0.2, 0, False), + 14: (0.3, -0.1, 0, False), + 15: (0.3, 0, 0, False), + 16: (0.3, 0.1, 0, False), + 17: (0.3, 0.2, 0, False), + 18: (0.3, 0.3, 0, False), + 19: (0.3, 0.5, 0, False), + 20: (0.3, 0.7, 0, False), + 21: (0, -1, 0, False), + 22: (0, -0.6, 0, False), + 23: (0, -0.3, 0, False), + 24: (0, -0.1, 0, False), + 25: (1, 0, 0, False), + 26: (0, 0.1, 0, False), + 27: (0, 0.3, 0, False), + 28: (0, 0.6, 0, False), + 29: (0, 1.0, 0, False), + 30: (0.5, -0.5, 0, True), + 31: (0.5, -0.3, 0, True), + 32: (0.5, -0.2, 0, True), + 33: (0.5, -0.1, 0, True), + 34: (0.5, 0, 0, True), + 35: (0.5, 0.1, 0, True), + 36: (0.5, 0.2, 0, True), + 37: (0.5, 0.3, 0, True), + 38: (0.5, 0.5, 0, True), + } + throttle, steer, brake, reverse = Discrete_Actions_DICT[index] + return throttle, steer, brake + + +def gengrate_map(map_root): + map_infos = {} + for file_name in os.listdir(map_root): + if '.npz' in file_name: + map_info = dict(np.load(join(map_root,file_name), allow_pickle=True)['arr']) + town_name = file_name.split('_')[0] + map_infos[town_name] = {} + lane_points = [] + lane_types = [] + lane_sample_points = [] + trigger_volumes_points = [] + trigger_volumes_types = [] + trigger_volumes_sample_points = [] + for road_id, road in map_info.items(): + for lane_id, lane in road.items(): + if lane_id == 'Trigger_Volumes': + for single_trigger_volume in lane: + points = np.array(single_trigger_volume['Points']) + points[:,1] *= -1 + trigger_volumes_points.append(points) + trigger_volumes_sample_points.append(points.mean(axis=0)) + trigger_volumes_types.append(single_trigger_volume['Type']) + else: + for single_lane in lane: + points = np.array([raw_point[0] for raw_point in single_lane['Points']]) + points[:,1] *= -1 + lane_points.append(points) + lane_types.append(single_lane['Type']) + lane_lenth = points.shape[0] + if lane_lenth % 50 !=0: + devide_points = [50*i for i in range(lane_lenth//50+1)] + else: + devide_points = [50*i for i in range(lane_lenth//50)] + devide_points.append(lane_lenth-1) + lane_sample_points_tmp = points[devide_points] + lane_sample_points.append(lane_sample_points_tmp) + map_infos[town_name]['lane_points'] = lane_points + map_infos[town_name]['lane_sample_points'] = lane_sample_points + map_infos[town_name]['lane_types'] = lane_types + map_infos[town_name]['trigger_volumes_points'] = trigger_volumes_points + map_infos[town_name]['trigger_volumes_sample_points'] = trigger_volumes_sample_points + map_infos[town_name]['trigger_volumes_types'] = trigger_volumes_types + with 
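# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The pinhole projection performed by get_image_point above, as a self-contained function:
# a world point is moved into the camera frame with w2c, multiplied by the 3x3 intrinsic
# matrix K, and divided by depth to obtain pixel coordinates. K and w2c here are made-up
# values (camera at the world origin), not the Bench2Drive calibration.
import numpy as np

def project(loc, K, w2c):
    point = np.array([loc[0], loc[1], loc[2], 1.0])
    point_cam = (w2c @ point)[:3]
    depth = point_cam[2]
    uvw = K @ point_cam
    return uvw[:2] / uvw[2], depth

K = np.array([[800.0, 0.0, 800.0],
              [0.0, 800.0, 450.0],
              [0.0, 0.0, 1.0]])
w2c = np.eye(4)                                     # assumed camera pose for the example
uv, depth = project([2.0, 1.0, 10.0], K, w2c)
print(uv, depth)                                    # [960. 530.] 10.0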
open(join(OUT_DIR,'b2d_map_infos.pkl'),'wb') as f: + pickle.dump(map_infos,f) + +def preprocess(folder_list,idx,tmp_dir,train_or_val): + + data_root = DATAROOT + cameras = CAMERAS + final_data = [] + if idx == 0: + folders = tqdm(folder_list) + else: + folders = folder_list + + for folder_name in folders: + folder_path = join(data_root, folder_name) + last_position_dict = {} + for ann_name in sorted(os.listdir(join(folder_path,'anno')),key= lambda x: int(x.split('.')[0])): + position_dict = {} + frame_data = {} + cam_gray_depth = {} + with gzip.open(join(folder_path,'anno',ann_name), 'rt', encoding='utf-8') as gz_file: + anno = json.load(gz_file) + frame_data['folder'] = folder_name + frame_data['town_name'] = folder_name.split('/')[1].split('_')[1] + frame_data['command_far_xy'] = np.array([anno['x_command_far'],-anno['y_command_far']]) + frame_data['command_far'] = anno['command_far'] + frame_data['command_near_xy'] = np.array([anno['x_command_near'],-anno['y_command_near']]) + frame_data['command_near'] = anno['command_near'] + frame_data['frame_idx'] = int(ann_name.split('.')[0]) + frame_data['ego_yaw'] = -np.nan_to_num(anno['theta'],nan=np.pi)+np.pi/2 + frame_data['ego_translation'] = np.array([anno['x'],-anno['y'],0]) + frame_data['ego_vel'] = np.array([anno['speed'],0,0]) + frame_data['ego_accel'] = np.array([anno['acceleration'][0],-anno['acceleration'][1],anno['acceleration'][2]]) + frame_data['ego_rotation_rate'] = -np.array(anno['angular_velocity']) + frame_data['ego_size'] = np.array([anno['bounding_boxes'][0]['extent'][1],anno['bounding_boxes'][0]['extent'][0],anno['bounding_boxes'][0]['extent'][2]])*2 + world2ego = left2right@anno['bounding_boxes'][0]['world2ego']@left2right + frame_data['world2ego'] = world2ego + if frame_data['frame_idx'] == 0: + expert_file_path = join(folder_path,'expert_assessment','-0001.npz') + else: + expert_file_path = join(folder_path,'expert_assessment',str(frame_data['frame_idx']-1).zfill(5)+'.npz') + expert_data = np.load(expert_file_path,allow_pickle=True)['arr_0'] + action_id = expert_data[-1] + # value = expert_data[-2] + # expert_feature = expert_data[:-2] + throttle, steer, brake = get_action(action_id) + frame_data['brake'] = brake + frame_data['throttle'] = throttle + frame_data['steer'] = steer + #frame_data['action_id'] = action_id + #frame_data['value'] = value + #frame_data['expert_feature'] = expert_feature + ###get sensor infos### + sensor_infos = {} + for cam in CAMERAS: + sensor_infos[cam] = {} + sensor_infos[cam]['cam2ego'] = left2right @ np.array(anno['sensors'][cam]['cam2ego']) @stand_to_ue4_rotate + sensor_infos[cam]['intrinsic'] = np.array(anno['sensors'][cam]['intrinsic']) + sensor_infos[cam]['world2cam'] = np.linalg.inv(stand_to_ue4_rotate) @ np.array(anno['sensors'][cam]['world2cam']) @left2right + sensor_infos[cam]['data_path'] = join(folder_name,'camera',CAMERA_TO_FOLDER_MAP[cam],ann_name.split('.')[0]+'.jpg') + cam_gray_depth[cam] = cv2.imread(join(data_root,sensor_infos[cam]['data_path']).replace('rgb_','depth_').replace('.jpg','.png'))[:,:,0] + sensor_infos['LIDAR_TOP'] = {} + sensor_infos['LIDAR_TOP']['lidar2ego'] = np.array(anno['sensors']['LIDAR_TOP']['lidar2ego']) @ lidar_to_righthand_ego + world2lidar = lefthand_ego_to_lidar @ np.array(anno['sensors']['LIDAR_TOP']['world2lidar']) @ left2right + sensor_infos['LIDAR_TOP']['world2lidar'] = world2lidar + frame_data['sensors'] = sensor_infos + ###get bounding_boxes infos### + gt_boxes = [] + gt_names = [] + gt_ids = [] + num_points_list = [] + npc2world_list = [] + for 
npc in anno['bounding_boxes']: + if npc['class'] == 'ego_vehicle': continue + if npc['distance'] > MAX_DISTANCE: continue + if abs(npc['location'][2] - anno['bounding_boxes'][0]['location'][2]) > FILTER_Z_SHRESHOLD: continue + center = np.array([npc['center'][0],-npc['center'][1],npc['center'][2]]) # left hand -> right hand + extent = np.array([npc['extent'][1],npc['extent'][0],npc['extent'][2]]) # lwh -> wlh + position_dict[npc['id']] = center + local_center = apply_trans(center, world2lidar) + size = extent*2 + if 'world2vehicle' in npc.keys(): + world2vehicle = left2right@np.array(npc['world2vehicle'])@left2right + vehicle2lidar = world2lidar @ np.linalg.inv(world2vehicle) + yaw_local = np.arctan2(vehicle2lidar[1,0], vehicle2lidar[0,0]) + + else: + yaw_local = -npc['rotation'][-1]/180*np.pi - frame_data['ego_yaw'] +np.pi / 2 + yaw_local_in_lidar_box = -yaw_local - np.pi / 2 + while yaw_local < -np.pi: + yaw_local += 2*np.pi + while yaw_local > np.pi: + yaw_local -= 2*np.pi + if 'speed' in npc.keys(): + if 'vehicle' in npc['class']: # only vehicles have correct speed + speed = npc['speed'] + else: + if npc['id'] in last_position_dict.keys(): #calculate speed for other object + speed = np.linalg.norm((center-last_position_dict[npc['id']])[0:2]) * 10 + else: + speed = 0 + else: + speed = 0 + if 'num_points' in npc.keys(): + num_points = npc['num_points'] + else: + num_points = -1 + npc2world = get_npc2world(npc) + speed_x = speed * np.cos(yaw_local) + speed_y = speed * np.sin(yaw_local) + + ###fliter_bounding_boxes### + if FILTER_INVISINLE: + valid = False + box2lidar = np.eye(4) + box2lidar[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=yaw_local).rotation_matrix + box2lidar[0:3,3] = local_center + lidar2box = np.linalg.inv(box2lidar) + raw_verts = calculate_cube_vertices(local_center,extent) + verts = [] + for raw_vert in raw_verts: + tmp = np.dot(lidar2box, [raw_vert[0], raw_vert[1], raw_vert[2],1]) + tmp[0:3] += local_center + verts.append(tmp.tolist()[:-1]) + for cam in cameras: + lidar2cam = np.linalg.inv(frame_data['sensors'][cam]['cam2ego']) @ sensor_infos['LIDAR_TOP']['lidar2ego'] + test_points = [] + test_depth = [] + for vert in verts: + point, depth = get_image_point(vert, frame_data['sensors'][cam]['intrinsic'], lidar2cam) + if depth > 0: + test_points.append(point) + test_depth.append(depth) + + num_visible_vertices, num_invisible_vertices, num_vertices_outside_camera, colored_points = calculate_occlusion_stats(np.array(test_points), np.array(test_depth), cam_gray_depth[cam], max_render_depth=MAX_DISTANCE) + if num_visible_vertices>NUM_VISIBLE_SHRESHOLD and num_vertices_outside_camera 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). 
+ indice = indice[list( + torch.randperm(int(size), generator=g).numpy())].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[:extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/mmcv/datasets/samplers/sampler.py b/mmcv/datasets/samplers/sampler.py new file mode 100644 index 0000000..1906049 --- /dev/null +++ b/mmcv/datasets/samplers/sampler.py @@ -0,0 +1,7 @@ +from mmcv.utils.registry import Registry, build_from_cfg + +SAMPLER = Registry('sampler') + + +def build_sampler(cfg, default_args): + return build_from_cfg(cfg, SAMPLER, default_args) diff --git a/mmcv/datasets/utils.py b/mmcv/datasets/utils.py new file mode 100644 index 0000000..02cf96d --- /dev/null +++ b/mmcv/datasets/utils.py @@ -0,0 +1,298 @@ +import copy +import warnings +from mmcv.models import VGG +from mmcv.runner.hooks import HOOKS, Hook + +from mmcv.datasets.pipelines import (Collect3D, DefaultFormatBundle3D, + LoadAnnotations3D, + LoadImageFromFileMono3D, + LoadMultiViewImageFromFiles, + LoadPointsFromFile, + LoadPointsFromMultiSweeps, + MultiScaleFlipAug3D, + PointSegClassMapping) + +from mmcv.datasets.builder import PIPELINES +from mmcv.datasets.pipelines import LoadAnnotations, LoadImageFromFile +from mmcv.models.dense_heads import GARPNHead, RPNHead +from mmcv.models.roi_heads.mask_heads import FusedSemanticHead +from mmcv.parallel import DataContainer + + +def replace_ImageToTensor(pipelines): + """Replace the ImageToTensor transform in a data pipeline to + DefaultFormatBundle, which is normally useful in batch inference. + + Args: + pipelines (list[dict]): Data pipeline configs. + + Returns: + list: The new pipeline list with all ImageToTensor replaced by + DefaultFormatBundle. + + Examples: + >>> pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict( + ... type='MultiScaleFlipAug', + ... img_scale=(1333, 800), + ... flip=False, + ... transforms=[ + ... dict(type='Resize', keep_ratio=True), + ... dict(type='RandomFlip'), + ... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]), + ... dict(type='Pad', size_divisor=32), + ... dict(type='ImageToTensor', keys=['img']), + ... dict(type='Collect', keys=['img']), + ... ]) + ... ] + >>> expected_pipelines = [ + ... dict(type='LoadImageFromFile'), + ... dict( + ... type='MultiScaleFlipAug', + ... img_scale=(1333, 800), + ... flip=False, + ... transforms=[ + ... dict(type='Resize', keep_ratio=True), + ... dict(type='RandomFlip'), + ... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]), + ... dict(type='Pad', size_divisor=32), + ... dict(type='DefaultFormatBundle'), + ... dict(type='Collect', keys=['img']), + ... ]) + ... 
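# --- Editor's illustrative sketch (not part of the patch) ---------------------------------
# The padding rule used by the distributed group sampler above, in isolation: within each
# group, the index list is extended (by repeating itself) until its length is a multiple of
# samples_per_gpu * num_replicas, so every GPU receives the same number of samples per epoch.
# The concrete numbers below are arbitrary.
import math

def pad_indices(indices, samples_per_gpu, num_replicas):
    size = len(indices)
    target = int(math.ceil(size / samples_per_gpu / num_replicas)) \
        * samples_per_gpu * num_replicas
    extra = target - size
    out = list(indices)
    for _ in range(extra // size):                  # whole repeats of the group
        out.extend(indices)
    out.extend(indices[:extra % size])              # partial repeat to reach the target
    return out

print(len(pad_indices(list(range(10)), samples_per_gpu=4, num_replicas=2)))   # 16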
] + >>> assert expected_pipelines == replace_ImageToTensor(pipelines) + """ + pipelines = copy.deepcopy(pipelines) + for i, pipeline in enumerate(pipelines): + if pipeline['type'] == 'MultiScaleFlipAug': + assert 'transforms' in pipeline + pipeline['transforms'] = replace_ImageToTensor( + pipeline['transforms']) + elif pipeline['type'] == 'ImageToTensor': + warnings.warn( + '"ImageToTensor" pipeline is replaced by ' + '"DefaultFormatBundle" for batch inference. It is ' + 'recommended to manually replace it in the test ' + 'data pipeline in your config file.', UserWarning) + pipelines[i] = {'type': 'DefaultFormatBundle'} + return pipelines + + +# def get_loading_pipeline(pipeline): +# """Only keep loading image and annotations related configuration. + +# Args: +# pipeline (list[dict]): Data pipeline configs. + +# Returns: +# list[dict]: The new pipeline list with only keep +# loading image and annotations related configuration. + +# Examples: +# >>> pipelines = [ +# ... dict(type='LoadImageFromFile'), +# ... dict(type='LoadAnnotations', with_bbox=True), +# ... dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), +# ... dict(type='RandomFlip', flip_ratio=0.5), +# ... dict(type='Normalize', **img_norm_cfg), +# ... dict(type='Pad', size_divisor=32), +# ... dict(type='DefaultFormatBundle'), +# ... dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) +# ... ] +# >>> expected_pipelines = [ +# ... dict(type='LoadImageFromFile'), +# ... dict(type='LoadAnnotations', with_bbox=True) +# ... ] +# >>> assert expected_pipelines ==\ +# ... get_loading_pipeline(pipelines) +# """ +# loading_pipeline_cfg = [] +# for cfg in pipeline: +# obj_cls = PIPELINES.get(cfg['type']) +# # TODO:use more elegant way to distinguish loading modules +# if obj_cls is not None and obj_cls in (LoadImageFromFile, +# LoadAnnotations): +# loading_pipeline_cfg.append(cfg) +# assert len(loading_pipeline_cfg) == 2, \ +# 'The data pipeline in your config file must include ' \ +# 'loading image and annotations related pipeline.' +# return loading_pipeline_cfg + + +@HOOKS.register_module() +class NumClassCheckHook(Hook): + + def _check_head(self, runner): + """Check whether the `num_classes` in head matches the length of + `CLASSSES` in `dataset`. + + Args: + runner (obj:`EpochBasedRunner`): Epoch based Runner. + """ + model = runner.model + dataset = runner.data_loader.dataset + if dataset.CLASSES is None: + runner.logger.warning( + f'Please set `CLASSES` ' + f'in the {dataset.__class__.__name__} and' + f'check if it is consistent with the `num_classes` ' + f'of head') + else: + assert type(dataset.CLASSES) is not str, \ + (f'`CLASSES` in {dataset.__class__.__name__}' + f'should be a tuple of str.' + f'Add comma if number of classes is 1 as ' + f'CLASSES = ({dataset.CLASSES},)') + for name, module in model.named_modules(): + if hasattr(module, 'num_classes') and not isinstance( + module, (RPNHead, VGG, FusedSemanticHead, GARPNHead)): + assert module.num_classes == len(dataset.CLASSES), \ + (f'The `num_classes` ({module.num_classes}) in ' + f'{module.__class__.__name__} of ' + f'{model.__class__.__name__} does not matches ' + f'the length of `CLASSES` ' + f'{len(dataset.CLASSES)}) in ' + f'{dataset.__class__.__name__}') + + def before_train_epoch(self, runner): + """Check whether the training dataset is compatible with head. + + Args: + runner (obj:`EpochBasedRunner`): Epoch based Runner. 
+ """ + self._check_head(runner) + + def before_val_epoch(self, runner): + """Check whether the dataset in val epoch is compatible with head. + + Args: + runner (obj:`EpochBasedRunner`): Epoch based Runner. + """ + self._check_head(runner) + + +def is_loading_function(transform): + """Judge whether a transform function is a loading function. + + Note: `MultiScaleFlipAug3D` is a wrapper for multiple pipeline functions, + so we need to search if its inner transforms contain any loading function. + + Args: + transform (dict | :obj:`Pipeline`): A transform config or a function. + + Returns: + bool | None: Whether it is a loading function. None means can't judge. + When transform is `MultiScaleFlipAug3D`, we return None. + """ + # TODO: use more elegant way to distinguish loading modules + loading_functions = (LoadImageFromFile, LoadPointsFromFile, + LoadAnnotations3D, LoadMultiViewImageFromFiles, + LoadPointsFromMultiSweeps, DefaultFormatBundle3D, + Collect3D, LoadImageFromFileMono3D, + PointSegClassMapping) + if isinstance(transform, dict): + obj_cls = PIPELINES.get(transform['type']) + if obj_cls is None: + return False + if obj_cls in loading_functions: + return True + if obj_cls in (MultiScaleFlipAug3D, ): + return None + elif callable(transform): + if isinstance(transform, loading_functions): + return True + if isinstance(transform, MultiScaleFlipAug3D): + return None + return False + + +def get_loading_pipeline(pipeline): + """Only keep loading image, points and annotations related configuration. + + Args: + pipeline (list[dict] | list[:obj:`Pipeline`]): + Data pipeline configs or list of pipeline functions. + + Returns: + list[dict] | list[:obj:`Pipeline`]): The new pipeline list with only + keep loading image, points and annotations related configuration. + + Examples: + >>> pipelines = [ + ... dict(type='LoadPointsFromFile', + ... coord_type='LIDAR', load_dim=4, use_dim=4), + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations3D', + ... with_bbox=True, with_label_3d=True), + ... dict(type='Resize', + ... img_scale=[(640, 192), (2560, 768)], keep_ratio=True), + ... dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), + ... dict(type='PointsRangeFilter', + ... point_cloud_range=point_cloud_range), + ... dict(type='ObjectRangeFilter', + ... point_cloud_range=point_cloud_range), + ... dict(type='PointShuffle'), + ... dict(type='Normalize', **img_norm_cfg), + ... dict(type='Pad', size_divisor=32), + ... dict(type='DefaultFormatBundle3D', class_names=class_names), + ... dict(type='Collect3D', + ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) + ... ] + >>> expected_pipelines = [ + ... dict(type='LoadPointsFromFile', + ... coord_type='LIDAR', load_dim=4, use_dim=4), + ... dict(type='LoadImageFromFile'), + ... dict(type='LoadAnnotations3D', + ... with_bbox=True, with_label_3d=True), + ... dict(type='DefaultFormatBundle3D', class_names=class_names), + ... dict(type='Collect3D', + ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) + ... ] + >>> assert expected_pipelines ==\ + ... 
get_loading_pipeline(pipelines) + """ + loading_pipeline = [] + for transform in pipeline: + is_loading = is_loading_function(transform) + if is_loading is None: # MultiScaleFlipAug3D + # extract its inner pipeline + if isinstance(transform, dict): + inner_pipeline = transform.get('transforms', []) + else: + inner_pipeline = transform.transforms.transforms + loading_pipeline.extend(get_loading_pipeline(inner_pipeline)) + elif is_loading: + loading_pipeline.append(transform) + assert len(loading_pipeline) > 0, \ + 'The data pipeline in your config file must include ' \ + 'loading step.' + return loading_pipeline + + +def extract_result_dict(results, key): + """Extract and return the data corresponding to key in result dict. + + ``results`` is a dict output from `pipeline(input_dict)`, which is the + loaded data from ``Dataset`` class. + The data terms inside may be wrapped in list, tuple and DataContainer, so + this function essentially extracts data from these wrappers. + + Args: + results (dict): Data loaded using pipeline. + key (str): Key of the desired data. + + Returns: + np.ndarray | torch.Tensor | None: Data term. + """ + if key not in results.keys(): + return None + # results[key] may be data or list[data] or tuple[data] + # data may be wrapped inside DataContainer + data = results[key] + if isinstance(data, (list, tuple)): + data = data[0] + if isinstance(data, DataContainer): + data = data._data + return data + diff --git a/mmcv/datasets/vad_custom_nuscenes_eval.py b/mmcv/datasets/vad_custom_nuscenes_eval.py new file mode 100644 index 0000000..0285591 --- /dev/null +++ b/mmcv/datasets/vad_custom_nuscenes_eval.py @@ -0,0 +1,834 @@ +import argparse +import copy +import json +import os +import time +from typing import Tuple, Dict, Any +from mmcv.fileio.io import dump,load +import torch +import numpy as np +from nuscenes import NuScenes +from nuscenes.eval.common.config import config_factory +from nuscenes.eval.common.data_classes import EvalBoxes +from nuscenes.eval.detection.evaluate import NuScenesEval +from pyquaternion import Quaternion +from nuscenes.eval.detection.data_classes import DetectionBox +from nuscenes.eval.detection.utils import category_to_detection_name +from nuscenes.eval.tracking.data_classes import TrackingBox +from nuscenes.utils.data_classes import Box +from nuscenes.utils.geometry_utils import points_in_box +from nuscenes.utils.splits import create_splits_scenes +import tqdm +from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix +import pycocotools.mask as mask_util +# from projects.mmdet3d_plugin.models.utils.visual import save_tensor +from torchvision.transforms.functional import rotate +import cv2 +import argparse +import random +from nuscenes.eval.common.loaders import load_gt, add_center_dist +from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp +from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, DetectionMetricData,DetectionMetricDataList +from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample +from nuscenes.eval.common.utils import quaternion_yaw, Quaternion +from IPython import embed +from matplotlib import pyplot as plt +from nuscenes.eval.common.render import setup_axis +from nuscenes.eval.common.utils import boxes_to_sensor +from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ + PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS +from 
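For completeness, a tiny sketch of what `extract_result_dict` unwraps: pipeline outputs frequently wrap each term in a list and a `DataContainer`, and this helper peels both layers off. It assumes `DataContainer` is exposed at `mmcv.parallel`, as in upstream mmcv:

```python
import torch
from mmcv.parallel import DataContainer

results = {'points': [DataContainer(torch.zeros(100, 4))]}
pts = extract_result_dict(results, 'points')    # plain torch.Tensor, shape (100, 4)
missing = extract_result_dict(results, 'img')   # None: key not present
```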
nuscenes.utils.data_classes import LidarPointCloud +import mmcv + + +Axis = Any + +def class_tp_curve(md_list: DetectionMetricDataList, + metrics: DetectionMetrics, + detection_name: str, + min_recall: float, + dist_th_tp: float, + savepath: str = None, + ax: Axis = None) -> None: + """ + Plot the true positive curve for the specified class. + :param md_list: DetectionMetricDataList instance. + :param metrics: DetectionMetrics instance. + :param detection_name: + :param min_recall: Minimum recall value. + :param dist_th_tp: The distance threshold used to determine matches. + :param savepath: If given, saves the the rendering here instead of displaying. + :param ax: Axes onto which to render. + """ + # Get metric data for given detection class with tp distance threshold. + + md = md_list[(detection_name, dist_th_tp)] + min_recall_ind = round(100 * min_recall) + if min_recall_ind <= md.max_recall_ind: + # For traffic_cone and barrier only a subset of the metrics are plotted. + rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] + ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 + else: + ylimit = 1.0 + + # Prepare axis. + if ax is None: + ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, + min_recall=min_recall) + ax.set_ylim(0, ylimit) + + # Plot the recall vs. error curve for each tp metric. + for metric in TP_METRICS: + tp = metrics.get_label_tp(detection_name, metric) + + # Plot only if we have valid data. + if tp is not np.nan and min_recall_ind <= md.max_recall_ind: + recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] + else: + recall, error = [], [] + + # Change legend based on tp value + if tp is np.nan: + label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) + elif min_recall_ind > md.max_recall_ind: + label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) + else: + label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) + if metric == 'trans_err': + label += f' ({md.max_recall_ind})' # add recall + print(f'Recall: {detection_name}: {md.max_recall_ind/100}') + ax.plot(recall, error, label=label) + ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) + ax.legend(loc='best') + + if savepath is not None: + plt.savefig(savepath) + plt.close() + + +class DetectionBox_modified(DetectionBox): + def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): + ''' + add annotation token + ''' + super().__init__(*args, **kwargs) + self.token = token + self.visibility = visibility + self.index = index + + def serialize(self) -> dict: + """ Serialize instance into json-friendly format. """ + return { + 'token': self.token, + 'sample_token': self.sample_token, + 'translation': self.translation, + 'size': self.size, + 'rotation': self.rotation, + 'velocity': self.velocity, + 'ego_translation': self.ego_translation, + 'num_pts': self.num_pts, + 'detection_name': self.detection_name, + 'detection_score': self.detection_score, + 'attribute_name': self.attribute_name, + 'visibility': self.visibility, + 'index': self.index + + } + + @classmethod + def deserialize(cls, content: dict): + """ Initialize from serialized content. 
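`DetectionBox_modified` exists so that the extra `token`, `visibility` and `index` fields survive the serialize/deserialize round trip and can be used for GT filtering further down. A minimal round-trip sketch (all unspecified box values fall back to the `DetectionBox` defaults):

```python
box = DetectionBox_modified(token='ann_token_0', visibility='4', index=1,
                            sample_token='sample_token_0')
restored = DetectionBox_modified.deserialize(box.serialize())
assert restored.token == 'ann_token_0'
assert restored.visibility == '4' and restored.index == 1
```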
""" + return cls( + token=content['token'], + sample_token=content['sample_token'], + translation=tuple(content['translation']), + size=tuple(content['size']), + rotation=tuple(content['rotation']), + velocity=tuple(content['velocity']), + ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content + else tuple(content['ego_translation']), + num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), + detection_name=content['detection_name'], + detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), + attribute_name=content['attribute_name'], + visibility=content['visibility'], + index=content['index'], + ) + + +def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible inside an image without accounting for occlusions. + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + center_3d = box.center.reshape(3, 1) + center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) + visible = np.logical_and(visible, center_img[1, :] < imsize[1]) + visible = np.logical_and(visible, center_img[1, :] > 0) + visible = np.logical_and(visible, center_3d[2, :] > 1) + + in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if vis_level == BoxVisibility.ALL: + return all(visible) and all(in_front) + elif vis_level == BoxVisibility.ANY: + return any(visible) and all(in_front) + elif vis_level == BoxVisibility.NONE: + return True + else: + raise ValueError("vis_level: {} not valid".format(vis_level)) + + +def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], + vis_level: int = BoxVisibility.ANY) -> bool: + """ + Check if a box is visible in images but not all corners in image . + :param box: The box to be checked. + :param intrinsic: . Intrinsic camera matrix. + :param imsize: (width, height). + :param vis_level: One of the enumerations of . + :return True if visibility condition is satisfied. + """ + + corners_3d = box.corners() + corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] + + visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) + visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) + visible = np.logical_and(visible, corners_img[1, :] > 0) + visible = np.logical_and(visible, corners_3d[2, :] > 1) + + in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. + + if any(visible) and not all(visible) and all(in_front): + return True + else: + return False + +def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False) \ + -> Tuple[EvalBoxes, Dict]: + """ + Loads object predictions from file. + :param result_path: Path to the .json result file provided by the user. + :param max_boxes_per_sample: Maximim number of boxes allowed per sample. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The deserialized results and meta data. + """ + + # Load from file and check that the format is correct. 
+ # with open(result_path) as f: + # data = json.load(f) + data = load(result_path) + assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ + 'See https://www.nuscenes.org/object-detection for more information.' + + # Deserialize results and get meta data. + all_results = EvalBoxes.deserialize(data['results'], box_cls) + meta = data['meta'] + if verbose: + print("Loaded results from {}. Found detections for {} samples." + .format(result_path, len(all_results.sample_tokens))) + + # Check that each sample has no more than x predicted boxes. + for sample_token in all_results.sample_tokens: + assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ + "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample + + return all_results, meta + +def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): + """ + Loads ground truth boxes from DB. + :param nusc: A NuScenes instance. + :param eval_split: The evaluation split for which we load GT boxes. + :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. + :param verbose: Whether to print messages to stdout. + :return: The GT boxes. + """ + + # Init. + if box_cls == DetectionBox_modified: + attribute_map = {a['token']: a['name'] for a in nusc.attribute} + + if verbose: + print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) + # Read out all sample_tokens in DB. + sample_tokens_all = [s['token'] for s in nusc.sample] + assert len(sample_tokens_all) > 0, "Error: Database has no samples!" + + # Only keep samples from this split. + splits = create_splits_scenes() + + # Check compatibility of split with nusc_version. + version = nusc.version + if eval_split in {'train', 'val', 'train_detect', 'train_track'}: + assert version.endswith('trainval'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split in {'mini_train', 'mini_val'}: + assert version.endswith('mini'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + elif eval_split == 'test': + assert version.endswith('test'), \ + 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) + else: + raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' + .format(eval_split)) + + if eval_split == 'test': + # Check that you aren't trying to cheat :). + assert len(nusc.sample_annotation) > 0, \ + 'Error: You are trying to evaluate on the test set but you do not have the annotations!' + index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + index_map[sample['token']] = index + index += 1 + + sample_tokens = [] + for sample_token in sample_tokens_all: + scene_token = nusc.get('sample', sample_token)['scene_token'] + scene_record = nusc.get('scene', scene_token) + if scene_record['name'] in splits[eval_split]: + sample_tokens.append(sample_token) + + all_annotations = EvalBoxes() + + # Load annotations and filter predictions and annotations. 
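A hedged usage sketch of this module-level `load_prediction` (the result path and box cap below are placeholders; inside `NuScenesEval_custom` the cap comes from `cfg.max_boxes_per_sample`):

```python
pred_boxes, meta = load_prediction('results_nusc.json',
                                   max_boxes_per_sample=500,
                                   box_cls=DetectionBox,
                                   verbose=True)
print(f'{len(pred_boxes.sample_tokens)} samples with predictions')
```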
+ tracking_id_set = set() + for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): + + sample = nusc.get('sample', sample_token) + sample_annotation_tokens = sample['anns'] + + sample_boxes = [] + for sample_annotation_token in sample_annotation_tokens: + + sample_annotation = nusc.get('sample_annotation', sample_annotation_token) + if box_cls == DetectionBox_modified: + # Get label name in detection task and filter unused labels. + detection_name = category_to_detection_name(sample_annotation['category_name']) + if detection_name is None: + continue + + # Get attribute_name. + attr_tokens = sample_annotation['attribute_tokens'] + attr_count = len(attr_tokens) + if attr_count == 0: + attribute_name = '' + elif attr_count == 1: + attribute_name = attribute_map[attr_tokens[0]] + else: + raise Exception('Error: GT annotations must not have more than one attribute!') + + sample_boxes.append( + box_cls( + token=sample_annotation_token, + sample_token=sample_token, + translation=sample_annotation['translation'], + size=sample_annotation['size'], + rotation=sample_annotation['rotation'], + velocity=nusc.box_velocity(sample_annotation['token'])[:2], + num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], + detection_name=detection_name, + detection_score=-1.0, # GT samples do not have a score. + attribute_name=attribute_name, + visibility=sample_annotation['visibility_token'], + index=index_map[sample_token] + ) + ) + elif box_cls == TrackingBox: + assert False + else: + raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) + + all_annotations.add_boxes(sample_token, sample_boxes) + + if verbose: + print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) + + return all_annotations + + +def filter_eval_boxes_by_id(nusc: NuScenes, + eval_boxes: EvalBoxes, + id=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.token in id: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_eval_boxes_by_visibility( + ori_eval_boxes: EvalBoxes, + visibility=None, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param is: the anns token set that used to keep bboxes. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. 
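Sketch of how `filter_eval_boxes_by_id` is meant to be used: keep only GT boxes whose annotation token is in a whitelist. It filters in place, so pass a copy if the full GT set is still needed; `nusc` and `gt_boxes` are assumed to exist already and the tokens are illustrative:

```python
keep_tokens = {'ann_token_0', 'ann_token_1'}
gt_subset = filter_eval_boxes_by_id(nusc, copy.deepcopy(gt_boxes),
                                    id=keep_tokens, verbose=True)
```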
+ eval_boxes = copy.deepcopy(ori_eval_boxes) + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + # Filter on anns + total += len(eval_boxes[sample_token]) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.visibility == visibility: + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After visibility based filtering: %d" % anns_filter) + + return eval_boxes + + +def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): + eval_boxes = copy.deepcopy(ori_eval_boxes) + for sample_token in eval_boxes.sample_tokens: + if sample_token not in valid_sample_tokens: + eval_boxes.boxes.pop(sample_token) + return eval_boxes + + +def filter_eval_boxes_by_overlap(nusc: NuScenes, + eval_boxes: EvalBoxes, + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. basedon overlap . + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param verbose: Whether to print to stdout. + """ + + # Accumulators for number of filtered boxes. + cams = ['CAM_FRONT', + 'CAM_FRONT_RIGHT', + 'CAM_BACK_RIGHT', + 'CAM_BACK', + 'CAM_BACK_LEFT', + 'CAM_FRONT_LEFT'] + + total, anns_filter = 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on anns + total += len(eval_boxes[sample_token]) + sample_record = nusc.get('sample', sample_token) + filtered_boxes = [] + for box in eval_boxes[sample_token]: + count = 0 + for cam in cams: + ''' + copy-paste form nuscens + ''' + sample_data_token = sample_record['data'][cam] + sd_record = nusc.get('sample_data', sample_data_token) + cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) + sensor_record = nusc.get('sensor', cs_record['sensor_token']) + pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) + cam_intrinsic = np.array(cs_record['camera_intrinsic']) + imsize = (sd_record['width'], sd_record['height']) + new_box = Box(box.translation, box.size, Quaternion(box.rotation), + name=box.detection_name, token='') + + # Move box to ego vehicle coord system. + new_box.translate(-np.array(pose_record['translation'])) + new_box.rotate(Quaternion(pose_record['rotation']).inverse) + + # Move box to sensor coord system. + new_box.translate(-np.array(cs_record['translation'])) + new_box.rotate(Quaternion(cs_record['rotation']).inverse) + + if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + count += 1 + # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): + # count += 1 + + if count > 1: + with open('center_overlap.txt', 'a') as f: + try: + f.write(box.token + '\n') + except: + pass + filtered_boxes.append(box) + anns_filter += len(filtered_boxes) + eval_boxes.boxes[sample_token] = filtered_boxes + + verbose = True + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After anns based filtering: %d" % anns_filter) + + return eval_boxes + +def _get_box_class_field(eval_boxes: EvalBoxes) -> str: + """ + Retrieve the name of the class field in the boxes. + This parses through all boxes until it finds a valid box. + If there are no valid boxes, this function throws an exception. + :param eval_boxes: The EvalBoxes used for evaluation. + :return: The name of the class field in the boxes, e.g. detection_name or tracking_name. 
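`filter_eval_boxes_by_overlap` re-projects every box into each of the six cameras with the usual nuScenes two-step transform. A standalone sketch of that chain, assuming `pose_record` and `cs_record` are plain nuScenes `ego_pose` / `calibrated_sensor` records:

```python
def box_to_sensor_frame(eval_box, pose_record, cs_record):
    b = Box(eval_box.translation, eval_box.size, Quaternion(eval_box.rotation))
    b.translate(-np.array(pose_record['translation']))      # global -> ego
    b.rotate(Quaternion(pose_record['rotation']).inverse)
    b.translate(-np.array(cs_record['translation']))         # ego -> camera
    b.rotate(Quaternion(cs_record['rotation']).inverse)
    return b
```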
+ """ + assert len(eval_boxes.boxes) > 0 + box = None + for val in eval_boxes.boxes.values(): + if len(val) > 0: + box = val[0] + break + if isinstance(box, DetectionBox): + class_field = 'detection_name' + elif isinstance(box, TrackingBox): + class_field = 'tracking_name' + else: + raise Exception('Error: Invalid box type: %s' % box) + + return class_field + +def filter_eval_boxes(nusc: NuScenes, + eval_boxes: EvalBoxes, + max_dist_x: Dict[str, float], + max_dist_y: Dict[str, float], + verbose: bool = False) -> EvalBoxes: + """ + Applies filtering to boxes. Distance, bike-racks and points per box. + :param nusc: An instance of the NuScenes class. + :param eval_boxes: An instance of the EvalBoxes class. + :param max_dist: Maps the detection name to the eval distance threshold for that class. + :param verbose: Whether to print to stdout. + """ + # Retrieve box type for detectipn/tracking boxes. + class_field = _get_box_class_field(eval_boxes) + + # Accumulators for number of filtered boxes. + total, dist_filter, point_filter, bike_rack_filter = 0, 0, 0, 0 + for ind, sample_token in enumerate(eval_boxes.sample_tokens): + + # Filter on distance first. + total += len(eval_boxes[sample_token]) + eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if + abs(box.ego_translation[0]) < max_dist_x[box.__getattribute__(class_field)] \ + and abs(box.ego_translation[1]) < max_dist_y[box.__getattribute__(class_field)]] + dist_filter += len(eval_boxes[sample_token]) + + # Then remove boxes with zero points in them. Eval boxes have -1 points by default. + eval_boxes.boxes[sample_token] = [box for box in eval_boxes[sample_token] if not box.num_pts == 0] + point_filter += len(eval_boxes[sample_token]) + + # Perform bike-rack filtering. + sample_anns = nusc.get('sample', sample_token)['anns'] + bikerack_recs = [nusc.get('sample_annotation', ann) for ann in sample_anns if + nusc.get('sample_annotation', ann)['category_name'] == 'static_object.bicycle_rack'] + bikerack_boxes = [Box(rec['translation'], rec['size'], Quaternion(rec['rotation'])) for rec in bikerack_recs] + filtered_boxes = [] + for box in eval_boxes[sample_token]: + if box.__getattribute__(class_field) in ['bicycle', 'motorcycle']: + in_a_bikerack = False + for bikerack_box in bikerack_boxes: + if np.sum(points_in_box(bikerack_box, np.expand_dims(np.array(box.translation), axis=1))) > 0: + in_a_bikerack = True + if not in_a_bikerack: + filtered_boxes.append(box) + else: + filtered_boxes.append(box) + + eval_boxes.boxes[sample_token] = filtered_boxes + bike_rack_filter += len(eval_boxes.boxes[sample_token]) + + if verbose: + print("=> Original number of boxes: %d" % total) + print("=> After distance based filtering: %d" % dist_filter) + print("=> After LIDAR and RADAR points based filtering: %d" % point_filter) + print("=> After bike rack filtering: %d" % bike_rack_filter) + + return eval_boxes + +class NuScenesEval_custom(NuScenesEval): + """ + Dummy class for backward-compatibility. Same as DetectionEval. + """ + + def __init__(self, + nusc: NuScenes, + config: DetectionConfig, + result_path: str, + eval_set: str, + output_dir: str = None, + verbose: bool = True, + overlap_test=False, + eval_mask=False, + data_infos=None + ): + """ + Initialize a DetectionEval object. + :param nusc: A NuScenes object. + :param config: A DetectionConfig object. + :param result_path: Path of the nuScenes JSON result file. + :param eval_set: The dataset split to evaluate on, e.g. train, val or test. 
+ :param output_dir: Folder to save plots and results to. + :param verbose: Whether to print to stdout. + """ + + self.nusc = nusc + self.result_path = result_path + self.eval_set = eval_set + self.output_dir = output_dir + self.verbose = verbose + self.cfg = config + self.overlap_test = overlap_test + self.eval_mask = eval_mask + self.data_infos = data_infos + # Check result file exists. + assert os.path.exists(result_path), 'Error: The result file does not exist!' + + # Make dirs. + self.plot_dir = os.path.join(self.output_dir, 'plots') + if not os.path.isdir(self.output_dir): + os.makedirs(self.output_dir) + if not os.path.isdir(self.plot_dir): + os.makedirs(self.plot_dir) + + # Load data. + if verbose: + print('Initializing nuScenes detection evaluation') + self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, + verbose=verbose) + self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) + + # assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ + # "Samples in split doesn't match samples in predictions." + + # Add center distances. + self.pred_boxes = add_center_dist(nusc, self.pred_boxes) + self.gt_boxes = add_center_dist(nusc, self.gt_boxes) + + # Filter boxes (distance, points per box, etc.). + + if verbose: + print('Filtering predictions') + self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose) + if verbose: + print('Filtering ground truth annotations') + self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range_x, self.cfg.class_range_y, verbose=verbose) + + if self.overlap_test: + self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) + + self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) + + self.all_gt = copy.deepcopy(self.gt_boxes) + self.all_preds = copy.deepcopy(self.pred_boxes) + self.sample_tokens = self.gt_boxes.sample_tokens + + self.index_map = {} + for scene in nusc.scene: + first_sample_token = scene['first_sample_token'] + sample = nusc.get('sample', first_sample_token) + self.index_map[first_sample_token] = 1 + index = 2 + while sample['next'] != '': + sample = nusc.get('sample', sample['next']) + self.index_map[sample['token']] = index + index += 1 + + def update_gt(self, type_='vis', visibility='1', index=1): + if type_ == 'vis': + self.visibility_test = True + if self.visibility_test: + '''[{'description': 'visibility of whole object is between 0 and 40%', + 'token': '1', + 'level': 'v0-40'}, + {'description': 'visibility of whole object is between 40 and 60%', + 'token': '2', + 'level': 'v40-60'}, + {'description': 'visibility of whole object is between 60 and 80%', + 'token': '3', + 'level': 'v60-80'}, + {'description': 'visibility of whole object is between 80 and 100%', + 'token': '4', + 'level': 'v80-100'}]''' + + self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) + + elif type_ == 'ord': + + valid_tokens = [key for (key, value) in self.index_map.items() if value == index] + # from IPython import embed + # embed() + self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) + self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) + self.sample_tokens = self.gt_boxes.sample_tokens + + + def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: + """ + Performs the actual evaluation. 
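Because the constructor caches the unfiltered boxes in `self.all_gt` / `self.all_preds`, `update_gt` can re-slice the GT repeatedly without reloading. A hedged usage sketch, assuming `nusc_eval` is an already constructed `NuScenesEval_custom` (visibility tokens '1'-'4' are the nuScenes buckets 0-40 %, 40-60 %, 60-80 %, 80-100 %):

```python
for vis in ['1', '2', '3', '4']:
    nusc_eval.update_gt(type_='vis', visibility=vis)
    metrics, md_list = nusc_eval.evaluate()
    print(vis, metrics.serialize()['mean_ap'])
```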
+ :return: A tuple of high-level and the raw metric data. + """ + start_time = time.time() + + # ----------------------------------- + # Step 1: Accumulate metric data for all classes and distance thresholds. + # ----------------------------------- + if self.verbose: + print('Accumulating metric data...') + metric_data_list = DetectionMetricDataList() + + # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) + # self.cfg.dist_ths = [0.3] + # self.cfg.dist_fcn_callable + for class_name in self.cfg.class_names: + for dist_th in self.cfg.dist_ths: + md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) + metric_data_list.set(class_name, dist_th, md) + + # ----------------------------------- + # Step 2: Calculate metrics from the data. + # ----------------------------------- + if self.verbose: + print('Calculating metrics...') + metrics = DetectionMetrics(self.cfg) + for class_name in self.cfg.class_names: + # Compute APs. + for dist_th in self.cfg.dist_ths: + metric_data = metric_data_list[(class_name, dist_th)] + ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) + metrics.add_label_ap(class_name, dist_th, ap) + # Compute TP metrics. + for metric_name in TP_METRICS: + metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] + if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: + tp = np.nan + elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: + tp = np.nan + else: + tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) + metrics.add_label_tp(class_name, metric_name, tp) + + # Compute evaluation time. + metrics.add_runtime(time.time() - start_time) + + return metrics, metric_data_list + + def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: + """ + Renders various PR and TP curves. + :param metrics: DetectionMetrics instance. + :param md_list: DetectionMetricDataList instance. + """ + if self.verbose: + print('Rendering PR and TP curves') + + def savepath(name): + return os.path.join(self.plot_dir, name + '.pdf') + + summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, + dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) + + for detection_name in self.cfg.class_names: + class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath(detection_name + '_pr')) + + class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, + savepath=savepath(detection_name + '_tp')) + + for dist_th in self.cfg.dist_ths: + dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, + savepath=savepath('dist_pr_' + str(dist_th))) + + +if __name__ == "__main__": + + # Settings. 
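For orientation, the numbers produced here roll up as in the standard nuScenes protocol: each AP is the normalized area under the precision-recall curve above `min_recall`/`min_precision`, averaged over the distance thresholds and classes to give mAP, while the TP metrics are averaged at `dist_th_tp` only. The composite NDS is computed downstream by `DetectionMetrics`; a sketch of that formula for reference:

```python
def nd_score(mean_ap, mean_tp_errors):
    # mean_tp_errors: the five mean TP errors (ATE, ASE, AOE, AVE, AAE)
    tp_scores = [max(0.0, 1.0 - err) for err in mean_tp_errors]
    return (5.0 * mean_ap + sum(tp_scores)) / 10.0
```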
+ parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('result_path', type=str, help='The submission as a JSON file.') + parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', + help='Folder to store result metrics, graphs and example visualizations.') + parser.add_argument('--eval_set', type=str, default='val', + help='Which dataset split to evaluate on, train, val or test.') + parser.add_argument('--dataroot', type=str, default='data/nuscenes', + help='Default nuScenes data directory.') + parser.add_argument('--version', type=str, default='v1.0-trainval', + help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') + parser.add_argument('--config_path', type=str, default='', + help='Path to the configuration file.' + 'If no path given, the CVPR 2019 configuration will be used.') + parser.add_argument('--plot_examples', type=int, default=0, + help='How many example visualizations to write to disk.') + parser.add_argument('--render_curves', type=int, default=1, + help='Whether to render PR and TP curves to disk.') + parser.add_argument('--verbose', type=int, default=1, + help='Whether to print to stdout.') + args = parser.parse_args() + + result_path_ = os.path.expanduser(args.result_path) + output_dir_ = os.path.expanduser(args.output_dir) + eval_set_ = args.eval_set + dataroot_ = args.dataroot + version_ = args.version + config_path = args.config_path + plot_examples_ = args.plot_examples + render_curves_ = bool(args.render_curves) + verbose_ = bool(args.verbose) + + if config_path == '': + cfg_ = config_factory('detection_cvpr_2019') + else: + with open(config_path, 'r') as _f: + cfg_ = DetectionConfig.deserialize(json.load(_f)) + + nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) + nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, + output_dir=output_dir_, verbose=verbose_) + for vis in ['1', '2', '3', '4']: + nusc_eval.update_gt(type_='vis', visibility=vis) + print(f'================ {vis} ===============') + nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) + #for index in range(1, 41): + # nusc_eval.update_gt(type_='ord', index=index) + # diff --git a/mmcv/datasets/vis_utils.py b/mmcv/datasets/vis_utils.py new file mode 100644 index 0000000..281703f --- /dev/null +++ b/mmcv/datasets/vis_utils.py @@ -0,0 +1,670 @@ +import numpy as np +import cv2 +from matplotlib import cm +import math +import open3d as o3d +import os + +WINDOW_HEIGHT = 900 +WINDOW_WIDTH = 1600 + +DIS_CAR_SAVE = 50 +DIS_WALKER_SAVE = 50 +DIS_SIGN_SAVE = 50 +DIS_LIGHT_SAVE = 50 + +edges = [[0,1], [1,3], [3,2], [2,0], [0,4], [4,5], [5,1], [5,7], [7,6], [6,4], [6,2], [7,3]] + +carla_bbox_edges = [ + (0, 1), (1, 2), (2, 3), (3, 0), # Bottom face + (4, 5), (5, 6), (6, 7), (7, 4), # Top face + (0, 4), (1, 5), (2, 6), (3, 7) # Side edges connecting top and bottom faces +] + +VIRIDIS = np.array(cm.get_cmap('plasma').colors) +VID_RANGE = np.linspace(0.0, 1.0, VIRIDIS.shape[0]) +LABEL_COLORS = np.array([ + (255, 255, 255), # None + (70, 70, 70), # Building + (100, 40, 40), # Fences + (55, 90, 80), # Other + (220, 20, 60), # Pedestrian + (153, 153, 153), # Pole + (157, 234, 50), # RoadLines + (128, 64, 128), # Road + (244, 35, 232), # Sidewalk + (107, 142, 35), # Vegetation + (0, 0, 142), # Vehicle + (102, 102, 156), # Wall + (220, 220, 0), # TrafficSign + (70, 130, 180), # Sky + 
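The standalone entry point mirrors the stock nuScenes eval CLI; a typical, purely illustrative invocation (paths are placeholders) would be:

```python
# python mmcv/datasets/vad_custom_nuscenes_eval.py results_nusc.json \
#     --output_dir work_dirs/nusc_eval --eval_set val \
#     --dataroot data/nuscenes --version v1.0-trainval
#
# With an empty --config_path the detection_cvpr_2019 config is used, and the
# script then reports metrics once per visibility bucket '1'..'4'.
```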
(81, 0, 81), # Ground + (150, 100, 100), # Bridge + (230, 150, 140), # RailTrack + (180, 165, 180), # GuardRail + (250, 170, 30), # TrafficLight + (110, 190, 160), # Static + (170, 120, 50), # Dynamic + (45, 60, 150), # Water + (145, 170, 100), # Terrain +]) / 255.0 # normalize each channel [0-1] since is what Open3D uses + +SEM_SEG_LABEL_COLORS = { + 0 : ( 0, 0, 0), # unlabeled + # cityscape + 1 : (128, 64, 128), # road + 2 : (244, 35, 232), # sidewalk + 3 : ( 70, 70, 70), # building + 4 : (102, 102, 156), # wall + 5 : (190, 153, 153), # fence + 6 : (153, 153, 153), # pole + 7 : (250, 170, 30), # traffic light + 8 : (220, 220, 0), # traffic sign + 9 : (107, 142, 35), # vegetation + 10 : (152, 251, 152), # terrain + 11 : ( 70, 130, 180), # sky + 12 : (220, 20, 60), # pedestrian + 13 : (255, 0, 0), # rider + 14 : ( 0, 0, 142), # Car + 15 : ( 0, 0, 70), # truck + 16 : ( 0, 60, 100), # bus + 17 : ( 0, 80, 100), # train + 18 : ( 0, 0, 230), # motorcycle + 19 : (119, 11, 32), # bicycle + # custom + 20 : (110, 190, 160), # static + 21 : (170, 120, 50), # dynamic + 22 : ( 55, 90, 80), # other + 23 : ( 45, 60, 150), # water + 24 : (157, 234, 50), # road line + 25 : ( 81, 0, 81), # ground + 26 : (150, 100, 100), # bridge + 27 : (230, 150, 140), # rail track + 28 : (180, 165, 180), # guard rail +} + +uniad_class_names = [ + 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', + 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' +] + +carla_class_name = [ + 'car', 'truck', 'bus', 'van', 'motorcycle', 'bicycle', 'pedestrian', +] + +TYPE_ID_MAP = { + #=================vehicle================= + # bicycle + 'vehicle.bh.crossbike': 'bicycle', + "vehicle.diamondback.century": 'bicycle', + # car + "vehicle.chevrolet.impala": 'car', + "vehicle.dodge.charger_2020": 'car', + "vehicle.dodge.charger_police_2020": 'car', + "vehicle.lincoln.mkz_2017": 'car', + "vehicle.lincoln.mkz_2020": 'car', + "vehicle.mini.cooper_s_2021": 'car', + "vehicle.mercedes.coupe_2020": 'car', + "vehicle.ford.mustang": 'car', + "vehicle.nissan.patrol_2021": 'car', + "vehicle.audi.tt": 'car', + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/FordCrown/SM_FordCrown_parked.SM_FordCrown_parked": 'car', + # bus + # van + "/Game/Carla/Static/Car/4Wheeled/ParkedVehicles/VolkswagenT2/SM_VolkswagenT2_2021_Parked.SM_VolkswagenT2_2021_Parked": "van", + #========================================= + + #=================traffic sign============ + # traffic.speed_limit + "traffic.speed_limit.30": 'speed_limit', + "traffic.speed_limit.40": 'speed_limit', + "traffic.speed_limit.50": 'speed_limit', + # traffic.traffic_light + "traffic.traffic_light": 'traffic_light', + # traffic.stop + "traffic.stop": 'stop', + #========================================= +} + +def calc_projected_2d_bbox(vertices_pos2d): + """ Takes in all vertices in pixel projection and calculates min and max of all x and y coordinates. + Returns left top, right bottom pixel coordinates for the 2d bounding box as a list of four values. + Note that vertices_pos2d contains a list of (y_pos2d, x_pos2d) tuples, or None + """ + x_coords = vertices_pos2d[:, 0] + y_coords = vertices_pos2d[:, 1] + min_x, max_x = np.min(x_coords), np.max(x_coords) + min_y, max_y = np.min(y_coords), np.max(y_coords) + return [min_x, min_y, max_x, max_y] + +def calculate_occlusion(bbox, point_depth, agent, depth_map): + """Calculate the occlusion value of a 2D bounding box. 
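A quick sketch of `calc_projected_2d_bbox`: given the eight projected vertices as an (8, 2) pixel array, it returns the axis-aligned box [min_x, min_y, max_x, max_y]:

```python
verts_2d = np.array([[100, 200], [180, 205], [95, 320], [175, 330],
                     [110, 210], [170, 212], [105, 315], [168, 325]],
                    dtype=np.float32)
xmin, ymin, xmax, ymax = calc_projected_2d_bbox(verts_2d)
assert [xmin, ymin, xmax, ymax] == [95, 200, 180, 330]
```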
+ Iterate through each point (pixel) in the bounding box and declare it occluded only + if the 4 surroinding points (pixels) are closer to the camera (by using the help of depth map) + than the actual distance to the middle of the 3D bounding boxe and some margin (the extent of the object) + """ + bbox_3d_mid = np.mean(point_depth) + min_x, min_y, max_x, max_y = calc_projected_2d_bbox(bbox) + height, width, length = agent.bounding_box.extent.z, agent.bounding_box.extent.x, agent.bounding_box.extent.y + + #depth_margin should depend on the rotation of the object but this solution works fine + depth_margin = np.max([2 * width, 2 * length]) + is_occluded = [] + + for x in range(int(min_x), int(max_x)): + for y in range(int(min_y), int(max_y)): + is_occluded.append(point_is_occluded( + (y, x), bbox_3d_mid - depth_margin, depth_map)) + + occlusion = ((float(np.sum(is_occluded))) / ((max_x-min_x) * (max_y-min_y))) + #discretize the 0–1 occlusion value into KITTI’s {0,1,2,3} labels by equally dividing the interval into 4 parts + # occlusion = np.digitize(occlusion, bins=[0.25, 0.50, 0.75]) + return occlusion + +def calculate_occlusion_vectorized(bbox, point_depth, extent, depth_map): + """Calculate the occlusion value of a 2D bounding box. + Iterate through each point (pixel) in the bounding box and declare it occluded only + if the 4 surroinding points (pixels) are closer to the camera (by using the help of depth map) + than the actual distance to the middle of the 3D bounding boxe and some margin (the extent of the object) + """ + bbox_3d_mid = np.mean(point_depth) + min_x, min_y, max_x, max_y = calc_projected_2d_bbox(bbox) + height, width, length = extent[2], extent[0], extent[1] + depth_margin = np.max([2 * width, 2 * length]) + count_num = (max_x - min_x) * (max_y - min_y) + if count_num > 10000: + p = 100 / count_num + elif count_num > 1000: + p = 100 / count_num + elif count_num > 100: + p = 100 / count_num + else: + p = 1 + sample_step_approx = int(np.sqrt(1/p)) + + # x, y = np.meshgrid(np.arange(min_x, max_x), np.arange(min_y, max_y)) + x, y = np.meshgrid(np.arange(min_x, max_x, sample_step_approx), np.arange(min_y, max_y, sample_step_approx)) + points = np.stack((y.flatten(), x.flatten()), axis=1) + is_occluded_array = point_is_occluded_single(points, bbox_3d_mid - depth_margin, depth_map) + occlusion = is_occluded_array.mean() + #discretize the 0–1 occlusion value into KITTI’s {0,1,2,3} labels by equally dividing the interval into 4 parts + # occlusion = np.digitize(occlusion, bins=[0.25, 0.50, 0.75]) + return occlusion + +def calc_bbox2d_area(bbox_2d): + """ Calculate the area of the given 2d bbox + Input is assumed to be xmin, ymin, xmax, ymax tuple + """ + xmin, ymin, xmax, ymax = bbox_2d + return (ymax - ymin) * (xmax - xmin) + +def calculate_truncation(uncropped_bbox, cropped_bbox): + "Calculate how much of the object's 2D uncropped bounding box is outside the image boundary" + + area_cropped = calc_bbox2d_area(cropped_bbox) + area_uncropped = calc_bbox2d_area(uncropped_bbox) + truncation = 1.0 - float(area_cropped / area_uncropped) + return truncation + +def crop_boxes_in_canvas(cam_bboxes): + neg_x_inds = np.where(cam_bboxes[:, 0] < 0)[0] + out_x_inds = np.where(cam_bboxes[:, 0] > WINDOW_WIDTH)[0] + neg_y_inds = np.where(cam_bboxes[:, 1] < 0)[0] + out_y_inds = np.where(cam_bboxes[:, 1] > WINDOW_HEIGHT)[0] + cam_bboxes[neg_x_inds, 0] = 0 + cam_bboxes[out_x_inds, 0] = WINDOW_HEIGHT + cam_bboxes[neg_y_inds, 1] = 0 + cam_bboxes[out_y_inds, 1] = WINDOW_WIDTH + return cam_bboxes + 
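Truncation is one minus the ratio of the cropped to the uncropped 2D box area, so an object whose projection is half outside the canvas scores 0.5, as sketched below. (Incidentally, `crop_boxes_in_canvas` above appears to clamp the x coordinate against WINDOW_HEIGHT and y against WINDOW_WIDTH, which looks like the two constants were swapped.)

```python
uncropped = [-100, 0, 100, 100]   # xmin, ymin, xmax, ymax -> 200 x 100 px
cropped   = [0, 0, 100, 100]      # clipped to the canvas  -> 100 x 100 px
assert calculate_truncation(uncropped, cropped) == 0.5
```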
+def point_is_occluded(point, vertex_depth, depth_map): + """ Checks whether or not the four pixels directly around the given point has less depth than the given vertex depth + If True, this means that the point is occluded. + """ + y, x = map(int, point) + from itertools import product + neigbours = product((1, -1), repeat=2) + is_occluded = [] + for dy, dx in neigbours: + if point_in_canvas_hw((dy+y, dx+x)): + # If the depth map says the pixel is closer to the camera than the actual vertex + if depth_map[y+dy, x+dx] < vertex_depth: + is_occluded.append(True) + else: + is_occluded.append(False) + # Only say point is occluded if all four neighbours are closer to camera than vertex + return all(is_occluded) + +def point_is_occluded_single(points, vertex_depth, depth_map, canvas_shape=(WINDOW_HEIGHT, WINDOW_WIDTH)): + ''' + Simplified version that checks occlusion based only on the points' own depth + ''' + points = np.asarray(points).astype(np.int32) + y, x = points[:, 0], points[:, 1] + + valid = (y >= 0) & (y < canvas_shape[0]) & \ + (x >= 0) & (x < canvas_shape[1]) + + is_occluded = np.zeros(len(points), dtype=bool) + try: + is_occluded[valid] = depth_map[y[valid], x[valid]] < vertex_depth + except: + pass + return is_occluded + +def point_is_occluded_vectorized(points, vertex_depth, depth_map, canvas_shape=(WINDOW_HEIGHT, WINDOW_WIDTH)): + ''' + Equivalent to point_is_occluded + ''' + points = np.asarray(points).astype(np.int32) + y, x = points[:, 0], points[:, 1] + + dy, dx = np.array([1, 1, -1, -1]), np.array([1, -1, 1, -1]) + neighbour_y = y[:, np.newaxis] + dy + neighbour_x = x[:, np.newaxis] + dx + + valid = (neighbour_y >= 0) & (neighbour_y < canvas_shape[0]) & \ + (neighbour_x >= 0) & (neighbour_x < canvas_shape[1]) + + neighbour_depths = np.full(neighbour_y.shape, np.inf) + for i in range(4): + mask = valid[:, i] + neighbour_depths[mask, i] = depth_map[neighbour_y[mask, i], neighbour_x[mask, i]] + + is_occluded = np.logical_and.reduce(neighbour_depths < vertex_depth, axis=1) & np.any(valid, axis=1) + return is_occluded + +def draw_3d_bbox_vertex(image, points): + for x_2d, y_2d, vertex_color in points: + cv2.circle(image, (int(x_2d), int(y_2d)), radius=3, color=vertex_color, thickness=1) + +def calculate_occlusion_stats(bbox_points, depth, depth_map, max_render_depth): + """ Draws each vertex in vertices_pos2d if it is in front of the camera + The color is based on whether the object is occluded or not. + Returns the number of visible vertices and the number of vertices outside the camera. 
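A toy depth-map check of the occlusion helpers: a vertex 10 m away is reported occluded once the surrounding pixels carry a smaller depth, and `point_is_occluded_single` agrees using only the pixel itself. The depth map is synthetic:

```python
depth_map = np.full((WINDOW_HEIGHT, WINDOW_WIDTH), 100.0)
assert not point_is_occluded((450, 800), 10.0, depth_map)

depth_map[448:453, 798:803] = 5.0          # a patch closer than the vertex
assert point_is_occluded((450, 800), 10.0, depth_map)
assert point_is_occluded_single(np.array([[450, 800]]), 10.0, depth_map)[0]
```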
+ """ + num_visible_vertices = 0 + num_invisible_vertices = 0 + num_vertices_outside_camera = 0 + points = [] + + for i in range(len(bbox_points)): + x_2d = bbox_points[i][0] + y_2d = bbox_points[i][1] + point_depth = depth[i] + + # if the point is in front of the camera but not too far away + if max_render_depth > point_depth > 0 and point_in_canvas_hw((y_2d, x_2d)): + #is_occluded_v = point_is_occluded_vectorized([[y_2d, x_2d]], point_depth, depth_map) + is_occluded = point_is_occluded( + (y_2d, x_2d), point_depth, depth_map) + + if is_occluded: + vertex_color = (0,0,255) # bgr, red + num_invisible_vertices += 1 + else: + num_visible_vertices += 1 + vertex_color = (0,255,0) # bgr, green + points.append((x_2d, y_2d, vertex_color)) + else: + num_vertices_outside_camera += 1 + return num_visible_vertices, num_invisible_vertices, num_vertices_outside_camera, points + +def get_intrinsic_matrix(camera): + + width = int(camera.attributes['image_size_x']) + height = int(camera.attributes['image_size_y']) + fov = float(camera.attributes['fov']) + + k = np.identity(3) + k[0, 2] = width / 2.0 + k[1, 2] = height / 2.0 + k[0, 0] = k[1, 1] = width / (2.0 * np.tan(fov * np.pi / 360.0)) + + return k + +def get_image_point(loc, K, w2c): + # Calculate 2D projection of 3D coordinate + + # Format the input coordinate (loc is a carla.Position object) + point = np.array([loc[0], loc[1], loc[2], 1]) + # transform to camera coordinates + point_camera = np.dot(w2c, point) + + # New we must change from UE4's coordinate system to an "standard" + # (x, y ,z) -> (y, -z, x) + # and we remove the fourth componebonent also + point_camera = [point_camera[1], -point_camera[2], point_camera[0]] + + depth = point_camera[2] + + # now project 3D->2D using the camera matrix + point_img = np.dot(K, point_camera) + # normalize + point_img[0] /= point_img[2] + point_img[1] /= point_img[2] + + return point_img[0:2], depth + +def point_in_canvas_hw(pos): + """Return true if point is in canvas""" + if (pos[0] >= 0) and (pos[0] < WINDOW_HEIGHT) and (pos[1] >= 0) and (pos[1] < WINDOW_WIDTH): + return True + return False + +def point_in_canvas_wh(pos): + """Return true if point is in canvas""" + if (pos[0] >= 0) and (pos[0] < WINDOW_WIDTH) and (pos[1] >= 0) and (pos[1] < WINDOW_HEIGHT): + return True + return False + +def build_projection_matrix(w, h, fov, is_behind_camera=False): + focal = w / (2.0 * np.tan(fov * np.pi / 360.0)) + K = np.identity(3) + + if is_behind_camera: + K[0, 0] = K[1, 1] = -focal + else: + K[0, 0] = K[1, 1] = focal + + K[0, 2] = w / 2.0 + K[1, 2] = h / 2.0 + return K + +def rotate_3d(vector, theta): + theta = np.radians(theta) + R = np.array([[np.cos(theta), -np.sin(theta), 0], + [np.sin(theta), np.cos(theta), 0], + [0, 0, 1]]) + + v_rotated = np.dot(R, vector) + return np.array([v_rotated[0], v_rotated[1], v_rotated[2]]) + +def normalize_angle_degree(x): + x = x % 360.0 + if x > 180.0: + x -= 360.0 + return x + + +def algin_lidar(lidar, translation, yaw): + """ + Translates and rotates a LiDAR into a new coordinate system. + Rotation is inverse to translation and yaw + :param lidar: numpy LiDAR point cloud (N,3) + :param translation: translations in meters + :param yaw: yaw angle in radians + :return: numpy LiDAR point cloud in the new coordinate system. 
+ """ + + rotation_matrix = np.array([[np.cos(yaw), -np.sin(yaw), 0.0], [np.sin(yaw), np.cos(yaw), 0.0], [0.0, 0.0, 1.0]]) + + aligned_lidar = (rotation_matrix.T @ (lidar - translation).T).T + + return aligned_lidar + +def convert_depth(data): + """ + Computes the normalized depth from a CARLA depth map. + """ + data = data.astype(np.float16) + + normalized = np.dot(data, [65536.0, 256.0, 1.0]) + normalized /= (256 * 256 * 256 - 1) + return normalized * 1000 + +def get_relative_transform(ego_matrix, vehicle_matrix): + """ + Returns the position of the vehicle matrix in the ego coordinate system. + :param ego_matrix: ndarray 4x4 Matrix of the ego vehicle in global + coordinates + :param vehicle_matrix: ndarray 4x4 Matrix of another actor in global + coordinates + :return: ndarray position of the other vehicle in the ego coordinate system + """ + relative_pos = vehicle_matrix[:3, 3] - ego_matrix[:3, 3] + rot = ego_matrix[:3, :3].T + relative_pos = rot @ relative_pos + + return relative_pos + +def normalize_angle(x): + x = x % (2 * np.pi) # force in range [0, 2 pi) + if x > np.pi: # move to [-pi, pi) + x -= 2 * np.pi + return x + +def build_skeleton(ped, sk_links): + + ######## get the pedestrian skeleton ######### + bones = ped.get_bones() + + # list where we will store the lines we will project + # onto the camera output + lines_3d = [] + + # cycle through the bone pairs in skeleton.txt and retrieve the joint positions + for link in sk_links[1:]: + + # get the roots of the two bones to be joined + bone_transform_1 = next(filter(lambda b: b.name == link[0], bones.bone_transforms), None) + bone_transform_2 = next(filter(lambda b: b.name == link[1], bones.bone_transforms), None) + + # some bone names aren't matched + if bone_transform_1 is not None and bone_transform_2 is not None: + lines_3d.append([(bone_transform_1.world.location.x, bone_transform_1.world.location.y, bone_transform_1.world.location.z), + (bone_transform_2.world.location.x, bone_transform_2.world.location.y, bone_transform_2.world.location.z)] + ) + return lines_3d + +def get_center_and_extent(verts): + sum_x = sum_y = sum_z = 0 + max_x = max_y = max_z = float('-inf') + min_x = min_y = min_z = float('inf') + + for pos in verts: + sum_x += pos.x + sum_y += pos.y + sum_z += pos.z + + max_x = max(max_x, pos.x) + max_y = max(max_y, pos.y) + max_z = max(max_z, pos.z) + + min_x = min(min_x, pos.x) + min_y = min(min_y, pos.y) + min_z = min(min_z, pos.z) + + center = (sum_x / 8, sum_y / 8, sum_z / 8) + + extent = ((max_x - min_x)/2, (max_y - min_y)/2, (max_z - min_z)/2) + return center, extent + +def get_forward_vector(yaw): + + yaw_rad = math.radians(yaw) + + x = math.cos(yaw_rad) + y = math.sin(yaw_rad) + + z = 0 + return np.array([x, y, z]) + +def calculate_cube_vertices(center, extent): + cx, cy, cz = center + x, y, z = extent + vertices = [ + (cx + x, cy + y, cz + z), + (cx + x, cy + y, cz - z), + (cx + x, cy - y, cz + z), + (cx + x, cy - y, cz - z), + (cx - x, cy + y, cz + z), + (cx - x, cy + y, cz - z), + (cx - x, cy - y, cz + z), + (cx - x, cy - y, cz - z) + ] + return vertices + + +def calculate_cube_vertices_2(center, extent): + cx, cy, cz = center.x, center.y, center.z + x, y, z = extent.x, extent.y, extent.z + vertices = [ + (cx + x, cy + y, cz + z), + (cx + x, cy + y, cz - z), + (cx + x, cy - y, cz + z), + (cx + x, cy - y, cz - z), + (cx - x, cy + y, cz + z), + (cx - x, cy + y, cz - z), + (cx - x, cy - y, cz + z), + (cx - x, cy - y, cz - z) + ] + return vertices + +def calculate_cube_vertices_3(center, extent): + cx, 
cy, cz = center[0], center[1], center[2] + x, y, z = extent[0], extent[1], extent[2] + vertices = [ + (cx + x, cy + y, cz + z), + (cx + x, cy + y, cz - z), + (cx + x, cy - y, cz + z), + (cx + x, cy - y, cz - z), + (cx - x, cy + y, cz + z), + (cx - x, cy + y, cz - z), + (cx - x, cy - y, cz + z), + (cx - x, cy - y, cz - z) + ] + return vertices + + + + +def draw_dashed_line(img, start_point, end_point, color, thickness=1, dash_length=5): + + d = np.sqrt((end_point[0] - start_point[0])**2 + (end_point[1] - start_point[1])**2) + dx = (end_point[0] - start_point[0]) / d + dy = (end_point[1] - start_point[1]) / d + + x, y = start_point[0], start_point[1] + + while d >= dash_length: + + x_end = x + dx * dash_length + y_end = y + dy * dash_length + cv2.line(img, (int(x), int(y)), (int(x_end), int(y_end)), color, thickness) + x = x_end + dx * dash_length + y = y_end + dy * dash_length + d -= 2 * dash_length + +def get_matrix(location, rotation): + """ + Creates matrix from carla transform. + """ + pitch, roll, yaw = rotation + x, y, z = location + c_y = np.cos(np.radians(yaw)) + s_y = np.sin(np.radians(yaw)) + c_r = np.cos(np.radians(roll)) + s_r = np.sin(np.radians(roll)) + c_p = np.cos(np.radians(pitch)) + s_p = np.sin(np.radians(pitch)) + matrix = np.matrix(np.identity(4)) + matrix[0, 3] = x + matrix[1, 3] = y + matrix[2, 3] = z + matrix[0, 0] = c_p * c_y + matrix[0, 1] = c_y * s_p * s_r - s_y * c_r + matrix[0, 2] = -c_y * s_p * c_r - s_y * s_r + matrix[1, 0] = s_y * c_p + matrix[1, 1] = s_y * s_p * s_r + c_y * c_r + matrix[1, 2] = -s_y * s_p * c_r + c_y * s_r + matrix[2, 0] = s_p + matrix[2, 1] = -c_p * s_r + matrix[2, 2] = c_p * c_r + return matrix + +def euler_to_rotation_matrix(pitch, roll, yaw): + Ry_pitch = np.array([ + [np.cos(pitch), 0, np.sin(pitch)], + [0, 1, 0], + [-np.sin(pitch), 0, np.cos(pitch)] + ]) + Rx_roll = np.array([ + [1, 0, 0], + [0, np.cos(roll), -np.sin(roll)], + [0, np.sin(roll), np.cos(roll)] + ]) + Rz_yaw = np.array([ + [np.cos(yaw), -np.sin(yaw), 0], + [np.sin(yaw), np.cos(yaw), 0], + [0, 0, 1] + ]) + return np.dot(Rz_yaw, np.dot(Rx_roll, Ry_pitch)) + +def world_to_ego_no(point_world, ego_location, ego_rotation): + rotation_matrix = euler_to_rotation_matrix(np.radians(ego_rotation[0]), + np.radians(ego_rotation[1]), + np.radians(ego_rotation[2])) + + point_relative = np.array(point_world) - np.array(ego_location) + point = np.dot(rotation_matrix, point_relative) + # (x, y ,z) -> (y, -x, z) + point = [point[0], -point[1], point[2]] + return point + +def world_to_ego(point_world, w2e): + point_world = np.array([point_world[0], point_world[1], point_world[2], 1]) + point_ego = np.dot(w2e, point_world) + point_ego = [point_ego[1], -point_ego[0], point_ego[2]] + return point_ego + +def world_to_lidar(point_world, w2l): + point_world = np.array([point_world[0], point_world[1], point_world[2], 1]) + point_lidar = np.dot(w2l, point_world) + return point_lidar + +def vector_angle(v1, v2): + dot_product = np.dot(v1, v2) + magnitude_v1 = np.linalg.norm(v1) + magnitude_v2 = np.linalg.norm(v2) + cos_theta = dot_product / (magnitude_v1 * magnitude_v2) + angle_radians = np.arccos(cos_theta) + angle_degrees = np.degrees(angle_radians) + return angle_degrees + +def get_weather_id(weather_conditions): + from xml.etree import ElementTree as ET + tree = ET.parse('./weather.xml') + root = tree.getroot() + def conditions_match(weather, conditions): + for (key, value) in weather: + if key == 'route_percentage' : continue + if str(conditions[key]) != value: + return False + return True + for 
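Two small checks of the geometry helpers: `vector_angle` returns the unsigned angle in degrees, and `get_matrix` builds the 4x4 actor-to-world transform from a CARLA-style (pitch, roll, yaw) in degrees plus (x, y, z); inverting it yields the world-to-actor matrix consumed by `world_to_ego` / `world_to_lidar`:

```python
assert np.isclose(vector_angle(np.array([1.0, 0.0, 0.0]),
                               np.array([0.0, 1.0, 0.0])), 90.0)

e2w = get_matrix(location=(10.0, 5.0, 0.0), rotation=(0.0, 0.0, 90.0))
w2e = np.linalg.inv(e2w)               # world -> ego, as used above
```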
case in root.findall('case'): + weather = case[0].items() + if conditions_match(weather, weather_conditions): + return case.items()[0][1] + return None + + +def static_weather(path): + import gzip + import json + static_dict = {} + for dir in os.listdir(path): + for d1 in os.listdir(os.path.join(path, dir)): + if os.path.exists(os.path.join(path, dir, d1, 'anno/00000.json.gz')): + with gzip.open(os.path.join(path, dir, d1, 'anno/00000.json.gz'), 'rt', encoding='utf-8') as gz_file: + anno = json.load(gz_file) + weather = anno['weather'] + weather_id = get_weather_id(weather) + static_dict[weather_id] = static_dict.get(weather_id, 0) + 1 + print(static_dict) + return + +if __name__ == '__main__': + + path = '' + static_weather(path) \ No newline at end of file diff --git a/mmcv/fileio/__init__.py b/mmcv/fileio/__init__.py new file mode 100644 index 0000000..b08824c --- /dev/null +++ b/mmcv/fileio/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# from .file_client import BaseStorageBackend, FileClient +# from .io import dump, load, imread #register_handler +# from .handlers import PickleHandler, JsonHandler +# from .parse import * \ No newline at end of file diff --git a/mmcv/fileio/file_client.py b/mmcv/fileio/file_client.py new file mode 100644 index 0000000..705eb65 --- /dev/null +++ b/mmcv/fileio/file_client.py @@ -0,0 +1,1146 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import os +import os.path as osp +import re +import tempfile +import warnings +from abc import ABCMeta, abstractmethod +from contextlib import contextmanager +from pathlib import Path +from typing import Iterable, Iterator, Optional, Tuple, Union +from urllib.request import urlopen +from mmcv.utils.misc import has_method +from mmcv.utils.path import is_filepath, mkdir_or_exist + + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. + + All backends need to implement two apis: ``get()`` and ``get_text()``. + ``get()`` reads the file as a byte stream and ``get_text()`` reads the file + as texts. + """ + + # a flag to indicate whether the backend can create a symlink for a file + _allow_symlink = False + + @property + def name(self): + return self.__class__.__name__ + + @property + def allow_symlink(self): + return self._allow_symlink + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + + +class CephBackend(BaseStorageBackend): + """Ceph storage backend (for internal use). + + Args: + path_mapping (dict|None): path mapping dict from local path to Petrel + path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` + will be replaced by ``dst``. Default: None. + + .. warning:: + :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, + please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. 
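Only `get()` and `get_text()` are abstract, so a new storage backend stays small. A minimal sketch against this interface (hooking it up would normally go through `FileClient.register_backend`, assuming that classmethod is kept from upstream mmcv):

```python
class LocalBytesBackend(BaseStorageBackend):
    """Toy backend that reads straight from the local filesystem."""

    def get(self, filepath):
        with open(filepath, 'rb') as f:
            return f.read()

    def get_text(self, filepath, encoding='utf-8'):
        with open(filepath, 'r', encoding=encoding) as f:
            return f.read()
```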
+ """ + + def __init__(self, path_mapping=None): + try: + import ceph + except ImportError: + raise ImportError('Please install ceph to enable CephBackend.') + + warnings.warn( + 'CephBackend will be deprecated, please use PetrelBackend instead') + self._client = ceph.S3Client() + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def get(self, filepath): + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class PetrelBackend(BaseStorageBackend): + """Petrel storage backend (for internal use). + + PetrelBackend supports reading and writing data to multiple clusters. + If the file path contains the cluster name, PetrelBackend will read data + from specified cluster or write data to it. Otherwise, PetrelBackend will + access the default cluster. + + Args: + path_mapping (dict, optional): Path mapping dict from local path to + Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in + ``filepath`` will be replaced by ``dst``. Default: None. + enable_mc (bool, optional): Whether to enable memcached support. + Default: True. + + Examples: + >>> filepath1 = 's3://path/of/file' + >>> filepath2 = 'cluster-name:s3://path/of/file' + >>> client = PetrelBackend() + >>> client.get(filepath1) # get data from default cluster + >>> client.get(filepath2) # get data from 'cluster-name' cluster + """ + + def __init__(self, + path_mapping: Optional[dict] = None, + enable_mc: bool = True): + try: + from petrel_client import client + except ImportError: + raise ImportError('Please install petrel_client to enable ' + 'PetrelBackend.') + + self._client = client.Client(enable_mc=enable_mc) + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def _map_path(self, filepath: Union[str, Path]) -> str: + """Map ``filepath`` to a string path whose prefix will be replaced by + :attr:`self.path_mapping`. + + Args: + filepath (str): Path to be mapped. + """ + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + return filepath + + def _format_path(self, filepath: str) -> str: + """Convert a ``filepath`` to standard format of petrel oss. + + If the ``filepath`` is concatenated by ``os.path.join``, in a Windows + environment, the ``filepath`` will be the format of + 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the + above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. + + Args: + filepath (str): Path to be formatted. + """ + return re.sub(r'\\+', '/', filepath) + + def get(self, filepath: Union[str, Path]) -> memoryview: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + memoryview: A memory view of expected bytes object to avoid + copying. The memoryview object can be converted to bytes by + ``value_buf.tobytes()``. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. 
+ + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return str(self.get(filepath), encoding=encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Save data to a given ``filepath``. + + Args: + obj (bytes): Data to be saved. + filepath (str or Path): Path to write data. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.put(filepath, obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Save data to a given ``filepath``. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to encode the ``obj``. + Default: 'utf-8'. + """ + self.put(bytes(obj, encoding=encoding), filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + if not has_method(self._client, 'delete'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.delete(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + if not (has_method(self._client, 'contains') + and has_method(self._client, 'isdir')): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) or self._client.isdir(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + if not has_method(self._client, 'isdir'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + if not has_method(self._client, 'contains'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Args: + filepath (str or Path): Path to be concatenated. 
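+            *filepaths (str or Path): Other paths to be concatenated.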
+ + Returns: + str: The result after concatenation. + """ + filepath = self._format_path(self._map_path(filepath)) + if filepath.endswith('/'): + filepath = filepath[:-1] + formatted_paths = [filepath] + for path in filepaths: + formatted_paths.append(self._format_path(self._map_path(path))) + return '/'.join(formatted_paths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download a file from ``filepath`` and return a temporary path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str | Path): Download a file from ``filepath``. + + Examples: + >>> client = PetrelBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('s3://path/of/your/file') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one temporary path. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + assert self.isfile(filepath) + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + Petrel has no concept of directories but it simulates the directory + hierarchy in the filesystem through public prefixes. In addition, + if the returned path ends with '/', it means the path is a public + prefix which is a logical directory. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + In addition, the returned path of directory will not contains the + suffix '/' which is consistent with other backends. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. 
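+
+        The example below is an illustrative sketch; the bucket path is a
+        placeholder and a configured petrel client is assumed.
+
+        Examples:
+            >>> client = PetrelBackend()
+            >>> # list all json files under a prefix, recursively
+            >>> for path in client.list_dir_or_file(
+            ...         's3://bucket/dir', list_dir=False, suffix='.json',
+            ...         recursive=True):
+            ...     print(path)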
+ """ + if not has_method(self._client, 'list'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.')) + + dir_path = self._map_path(dir_path) + dir_path = self._format_path(dir_path) + if list_dir and suffix is not None: + raise TypeError( + '`list_dir` should be False when `suffix` is not None') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + # Petrel's simulated directory hierarchy assumes that directory paths + # should end with `/` + if not dir_path.endswith('/'): + dir_path += '/' + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for path in self._client.list(dir_path): + # the `self.isdir` is not used here to determine whether path + # is a directory, because `self.isdir` relies on + # `self._client.list` + if path.endswith('/'): # a directory path + next_dir_path = self.join_path(dir_path, path) + if list_dir: + # get the relative path and exclude the last + # character '/' + rel_dir = next_dir_path[len(root):-1] + yield rel_dir + if recursive: + yield from _list_dir_or_file(next_dir_path, list_dir, + list_file, suffix, + recursive) + else: # a file path + absolute_path = self.join_path(dir_path, path) + rel_path = absolute_path[len(root):] + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class MemcachedBackend(BaseStorageBackend): + """Memcached storage backend. + + Attributes: + server_list_cfg (str): Config file for memcached server list. + client_cfg (str): Config file for memcached client. + sys_path (str | None): Additional path to be appended to `sys.path`. + Default: None. + """ + + def __init__(self, server_list_cfg, client_cfg, sys_path=None): + if sys_path is not None: + import sys + sys.path.append(sys_path) + try: + import mc + except ImportError: + raise ImportError( + 'Please install memcached to enable MemcachedBackend.') + + self.server_list_cfg = server_list_cfg + self.client_cfg = client_cfg + self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, + self.client_cfg) + # mc.pyvector servers as a point which points to a memory cache + self._mc_buffer = mc.pyvector() + + def get(self, filepath): + filepath = str(filepath) + import mc + self._client.Get(filepath, self._mc_buffer) + value_buf = mc.ConvertBuffer(self._mc_buffer) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class LmdbBackend(BaseStorageBackend): + """Lmdb storage backend. + + Args: + db_path (str): Lmdb database path. + readonly (bool, optional): Lmdb environment parameter. If True, + disallow any write operations. Default: True. + lock (bool, optional): Lmdb environment parameter. If False, when + concurrent access occurs, do not lock the database. Default: False. + readahead (bool, optional): Lmdb environment parameter. If False, + disable the OS filesystem readahead mechanism, which may improve + random read performance when a database is larger than RAM. + Default: False. + + Attributes: + db_path (str): Lmdb database path. 
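+
+    The example below is an illustrative sketch; ``./data.lmdb`` is a
+    placeholder database path, not a file shipped with this repository.
+
+    Examples:
+        >>> backend = LmdbBackend('./data.lmdb')
+        >>> value_buf = backend.get('some_key')  # bytes stored under the key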
+ """ + + def __init__(self, + db_path, + readonly=True, + lock=False, + readahead=False, + **kwargs): + try: + import lmdb + except ImportError: + raise ImportError('Please install lmdb to enable LmdbBackend.') + + self.db_path = str(db_path) + self._client = lmdb.open( + self.db_path, + readonly=readonly, + lock=lock, + readahead=readahead, + **kwargs) + + def get(self, filepath): + """Get values according to the filepath. + + Args: + filepath (str | obj:`Path`): Here, filepath is the lmdb key. + """ + filepath = str(filepath) + with self._client.begin(write=False) as txn: + value_buf = txn.get(filepath.encode('ascii')) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class HardDiskBackend(BaseStorageBackend): + """Raw hard disks storage backend.""" + + _allow_symlink = True + + def get(self, filepath: Union[str, Path]) -> bytes: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes: Expected bytes object. + """ + with open(filepath, 'rb') as f: + value_buf = f.read() + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + with open(filepath, 'r', encoding=encoding) as f: + value_buf = f.read() + return value_buf + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` will create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + """ + mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'wb') as f: + f.write(obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` will create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + """ + mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'w', encoding=encoding) as f: + f.write(obj) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + os.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return osp.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return osp.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. 
+ + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return osp.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return osp.join(filepath, *filepaths) + + @contextmanager + def get_local_path( + self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: + """Only for unified API and do nothing.""" + yield filepath + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + if list_dir and suffix is not None: + raise TypeError('`suffix` should be None when `list_dir` is True') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + elif osp.isdir(entry.path): + if list_dir: + rel_dir = osp.relpath(entry.path, root) + yield rel_dir + if recursive: + yield from _list_dir_or_file(entry.path, list_dir, + list_file, suffix, + recursive) + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class HTTPBackend(BaseStorageBackend): + """HTTP and HTTPS storage bachend.""" + + def get(self, filepath): + value_buf = urlopen(filepath).read() + return value_buf + + def get_text(self, filepath, encoding='utf-8'): + value_buf = urlopen(filepath).read() + return value_buf.decode(encoding) + + @contextmanager + def get_local_path(self, filepath: str) -> Iterable[str]: + """Download a file from ``filepath``. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str): Download a file from ``filepath``. + + Examples: + >>> client = HTTPBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('http://path/of/your/file') as path: + ... # do something here + """ + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + +class FileClient: + """A general file client to access files in different backends. 
+ + The client loads a file or text in a specified backend from its path + and returns it as a binary or text file. There are two ways to choose a + backend, the name of backend and the prefix of path. Although both of them + can be used to choose a storage backend, ``backend`` has a higher priority + that is if they are all set, the storage backend will be chosen by the + backend argument. If they are all `None`, the disk backend will be chosen. + Note that It can also register other backend accessor with a given name, + prefixes, and backend class. In addition, We use the singleton pattern to + avoid repeated object creation. If the arguments are the same, the same + object will be returned. + + Args: + backend (str, optional): The storage backend type. Options are "disk", + "ceph", "memcached", "lmdb", "http" and "petrel". Default: None. + prefix (str, optional): The prefix of the registered storage backend. + Options are "s3", "http", "https". Default: None. + + Examples: + >>> # only set backend + >>> file_client = FileClient(backend='petrel') + >>> # only set prefix + >>> file_client = FileClient(prefix='s3') + >>> # set both backend and prefix but use backend to choose client + >>> file_client = FileClient(backend='petrel', prefix='s3') + >>> # if the arguments are the same, the same object is returned + >>> file_client1 = FileClient(backend='petrel') + >>> file_client1 is file_client + True + + Attributes: + client (:obj:`BaseStorageBackend`): The backend object. + """ + + _backends = { + 'disk': HardDiskBackend, + 'ceph': CephBackend, + 'memcached': MemcachedBackend, + 'lmdb': LmdbBackend, + 'petrel': PetrelBackend, + 'http': HTTPBackend, + } + # This collection is used to record the overridden backends, and when a + # backend appears in the collection, the singleton pattern is disabled for + # that backend, because if the singleton pattern is used, then the object + # returned will be the backend before overwriting + _overridden_backends = set() + _prefix_to_backends = { + 's3': PetrelBackend, + 'http': HTTPBackend, + 'https': HTTPBackend, + } + _overridden_prefixes = set() + + _instances = {} + + def __new__(cls, backend=None, prefix=None, **kwargs): + if backend is None and prefix is None: + backend = 'disk' + if backend is not None and backend not in cls._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(cls._backends.keys())}') + if prefix is not None and prefix not in cls._prefix_to_backends: + raise ValueError( + f'prefix {prefix} is not supported. 
Currently supported ones ' + f'are {list(cls._prefix_to_backends.keys())}') + + # concatenate the arguments to a unique key for determining whether + # objects with the same arguments were created + arg_key = f'{backend}:{prefix}' + for key, value in kwargs.items(): + arg_key += f':{key}:{value}' + + # if a backend was overridden, it will create a new object + if (arg_key in cls._instances + and backend not in cls._overridden_backends + and prefix not in cls._overridden_prefixes): + _instance = cls._instances[arg_key] + else: + # create a new object and put it to _instance + _instance = super().__new__(cls) + if backend is not None: + _instance.client = cls._backends[backend](**kwargs) + else: + _instance.client = cls._prefix_to_backends[prefix](**kwargs) + + cls._instances[arg_key] = _instance + + return _instance + + @property + def name(self): + return self.client.name + + @property + def allow_symlink(self): + return self.client.allow_symlink + + @staticmethod + def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]: + """Parse the prefix of a uri. + + Args: + uri (str | Path): Uri to be parsed that contains the file prefix. + + Examples: + >>> FileClient.parse_uri_prefix('s3://path/of/your/file') + 's3' + + Returns: + str | None: Return the prefix of uri if the uri contains '://' + else ``None``. + """ + assert is_filepath(uri) + uri = str(uri) + if '://' not in uri: + return None + else: + prefix, _ = uri.split('://') + # In the case of PetrelBackend, the prefix may contains the cluster + # name like clusterName:s3 + if ':' in prefix: + _, prefix = prefix.split(':') + return prefix + + @classmethod + def infer_client(cls, + file_client_args: Optional[dict] = None, + uri: Optional[Union[str, Path]] = None) -> 'FileClient': + """Infer a suitable file client based on the URI and arguments. + + Args: + file_client_args (dict, optional): Arguments to instantiate a + FileClient. Default: None. + uri (str | Path, optional): Uri to be parsed that contains the file + prefix. Default: None. + + Examples: + >>> uri = 's3://path/of/your/file' + >>> file_client = FileClient.infer_client(uri=uri) + >>> file_client_args = {'backend': 'petrel'} + >>> file_client = FileClient.infer_client(file_client_args) + + Returns: + FileClient: Instantiated FileClient object. 
+ """ + assert file_client_args is not None or uri is not None + if file_client_args is None: + file_prefix = cls.parse_uri_prefix(uri) # type: ignore + return cls(prefix=file_prefix) + else: + return cls(**file_client_args) + + @classmethod + def _register_backend(cls, name, backend, force=False, prefixes=None): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + if name in cls._backends and force: + cls._overridden_backends.add(name) + cls._backends[name] = backend + + if prefixes is not None: + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if prefix not in cls._prefix_to_backends: + cls._prefix_to_backends[prefix] = backend + elif (prefix in cls._prefix_to_backends) and force: + cls._overridden_prefixes.add(prefix) + cls._prefix_to_backends[prefix] = backend + else: + raise KeyError( + f'{prefix} is already registered as a storage backend,' + ' add "force=True" if you want to override it') + + @classmethod + def register_backend(cls, name, backend=None, force=False, prefixes=None): + """Register a backend to FileClient. + + This method can be used as a normal class method or a decorator. + + .. code-block:: python + + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + FileClient.register_backend('new', NewBackend) + + or + + .. code-block:: python + + @FileClient.register_backend('new') + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + Args: + name (str): The name of the registered backend. + backend (class, optional): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + When this method is used as a decorator, backend is None. + Defaults to None. + force (bool, optional): Whether to override the backend if the name + has already been registered. Defaults to False. + prefixes (str or list[str] or tuple[str], optional): The prefixes + of the registered storage backend. Default: None. + `New in version 1.3.15.` + """ + if backend is not None: + cls._register_backend( + name, backend, force=force, prefixes=prefixes) + return + + def _register(backend_cls): + cls._register_backend( + name, backend_cls, force=force, prefixes=prefixes) + return backend_cls + + return _register + + def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: + """Read data from a given ``filepath`` with 'rb' mode. + + Note: + There are two types of return values for ``get``, one is ``bytes`` + and the other is ``memoryview``. The advantage of using memoryview + is that you can avoid copying, and if you want to convert it to + ``bytes``, you can use ``.tobytes()``. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes | memoryview: Expected bytes object or a memory view of the + bytes object. 
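+
+        The example below is an illustrative sketch; the file path is a
+        placeholder.
+
+        Examples:
+            >>> file_client = FileClient(backend='disk')
+            >>> img_bytes = file_client.get('path/to/img.jpg')
+            >>> img_bytes = bytes(img_bytes)  # also works for a memoryview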
+ """ + return self.client.get(filepath) + + def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return self.client.get_text(filepath, encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` should create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + """ + self.client.put(obj, filepath) + + def put_text(self, obj: str, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` should create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str, optional): The encoding format used to open the + `filepath`. Default: 'utf-8'. + """ + self.client.put_text(obj, filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str, Path): Path to be removed. + """ + self.client.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return self.client.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return self.client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return self.client.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return self.client.join_path(filepath, *filepaths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download data from ``filepath`` and write the data to local path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Note: + If the ``filepath`` is a local path, just return itself. + + .. warning:: + ``get_local_path`` is an experimental interface that may change in + the future. + + Args: + filepath (str or Path): Path to be read data. + + Examples: + >>> file_client = FileClient(prefix='s3') + >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: + ... 
# do something here + + Yields: + Iterable[str]: Only yield one path. + """ + with self.client.get_local_path(str(filepath)) as local_path: + yield local_path + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, + suffix, recursive) diff --git a/mmcv/fileio/handlers/__init__.py b/mmcv/fileio/handlers/__init__.py new file mode 100644 index 0000000..4756674 --- /dev/null +++ b/mmcv/fileio/handlers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseFileHandler +from .json_handler import JsonHandler +from .pickle_handler import PickleHandler \ No newline at end of file diff --git a/mmcv/fileio/handlers/base.py b/mmcv/fileio/handlers/base.py new file mode 100644 index 0000000..288878b --- /dev/null +++ b/mmcv/fileio/handlers/base.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class BaseFileHandler(metaclass=ABCMeta): + # `str_like` is a flag to indicate whether the type of file object is + # str-like object or bytes-like object. Pickle only processes bytes-like + # objects but json only processes str-like object. If it is str-like + # object, `StringIO` will be used to process the buffer. + str_like = True + + @abstractmethod + def load_from_fileobj(self, file, **kwargs): + pass + + @abstractmethod + def dump_to_fileobj(self, obj, file, **kwargs): + pass + + @abstractmethod + def dump_to_str(self, obj, **kwargs): + pass + + def load_from_path(self, filepath, mode='r', **kwargs): + with open(filepath, mode) as f: + return self.load_from_fileobj(f, **kwargs) + + def dump_to_path(self, obj, filepath, mode='w', **kwargs): + with open(filepath, mode) as f: + self.dump_to_fileobj(obj, f, **kwargs) diff --git a/mmcv/fileio/handlers/json_handler.py b/mmcv/fileio/handlers/json_handler.py new file mode 100644 index 0000000..18d4f15 --- /dev/null +++ b/mmcv/fileio/handlers/json_handler.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +import numpy as np + +from .base import BaseFileHandler + + +def set_default(obj): + """Set default json values for non-serializable values. + + It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. + It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, + etc.) into plain numbers of plain python built-in types. 
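+
+    The example below is an illustrative sketch of using ``set_default`` as
+    the ``default`` hook of the standard ``json`` module.
+
+    Examples:
+        >>> import json
+        >>> json.dumps({'scale': np.float32(0.5), 'ids': range(3)},
+        ...            default=set_default)
+        '{"scale": 0.5, "ids": [0, 1, 2]}'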
+ """ + if isinstance(obj, (set, range)): + return list(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.generic): + return obj.item() + raise TypeError(f'{type(obj)} is unsupported for json dump') + + +class JsonHandler(BaseFileHandler): + + def load_from_fileobj(self, file): + return json.load(file) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('default', set_default) + json.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('default', set_default) + return json.dumps(obj, **kwargs) diff --git a/mmcv/fileio/handlers/pickle_handler.py b/mmcv/fileio/handlers/pickle_handler.py new file mode 100644 index 0000000..b37c79b --- /dev/null +++ b/mmcv/fileio/handlers/pickle_handler.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pickle + +from .base import BaseFileHandler + + +class PickleHandler(BaseFileHandler): + + str_like = False + + def load_from_fileobj(self, file, **kwargs): + return pickle.load(file, **kwargs) + + def load_from_path(self, filepath, **kwargs): + return super(PickleHandler, self).load_from_path( + filepath, mode='rb', **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('protocol', 2) + return pickle.dumps(obj, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + def dump_to_path(self, obj, filepath, **kwargs): + super(PickleHandler, self).dump_to_path( + obj, filepath, mode='wb', **kwargs) diff --git a/mmcv/fileio/io.py b/mmcv/fileio/io.py new file mode 100644 index 0000000..6155a5d --- /dev/null +++ b/mmcv/fileio/io.py @@ -0,0 +1,154 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from io import BytesIO, StringIO +from pathlib import Path + +from ..utils.misc import is_list_of, is_str +from .handlers import BaseFileHandler, JsonHandler, PickleHandler + +file_handlers = { + 'json': JsonHandler(), + # 'yaml': YamlHandler(), + # 'yml': YamlHandler(), + 'pickle': PickleHandler(), + 'pkl': PickleHandler() +} + + +def load(file, file_format=None, file_client_args=None, **kwargs): + """Load data from json/yaml/pickle files. + + This method provides a unified api for loading data from serialized files. + + Note: + In v1.3.16 and later, ``load`` supports loading data from serialized + files those can be storaged in different backends. + + Args: + file (str or :obj:`Path` or file-like object): Filename or a file-like + object. + file_format (str, optional): If not specified, the file format will be + inferred from the file extension, otherwise use the specified one. + Currently supported formats include "json", "yaml/yml" and + "pickle/pkl". + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> load('/path/of/your/file') # file is storaged in disk + >>> load('https://path/of/your/file') # file is storaged in Internet + >>> load('s3://path/of/your/file') # file is storaged in petrel + + Returns: + The content from the file. 
+ """ + #TODO(JIAZI) + from .file_client import FileClient + if isinstance(file, Path): + file = str(file) + if file_format is None and is_str(file): + file_format = file.split('.')[-1] + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + handler = file_handlers[file_format] + if is_str(file): + file_client = FileClient.infer_client(file_client_args, file) + if handler.str_like: + with StringIO(file_client.get_text(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + else: + with BytesIO(file_client.get(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + elif hasattr(file, 'read'): + obj = handler.load_from_fileobj(file, **kwargs) + else: + raise TypeError('"file" must be a filepath str or a file-object') + return obj + + +def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): + """Dump data to json/yaml/pickle strings or files. + + This method provides a unified api for dumping data as strings or to files, + and also supports custom arguments for each file format. + + Note: + In v1.3.16 and later, ``dump`` supports dumping data as strings or to + files which is saved to different backends. + + Args: + obj (any): The python object to be dumped. + file (str or :obj:`Path` or file-like object, optional): If not + specified, then the object is dumped to a str, otherwise to a file + specified by the filename or file-like object. + file_format (str, optional): Same as :func:`load`. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dump('hello world', '/path/of/your/file') # disk + >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel + + Returns: + bool: True for success, False otherwise. + """ + #TODO(JIAZI) + from .file_client import FileClient + if isinstance(file, Path): + file = str(file) + if file_format is None: + if is_str(file): + file_format = file.split('.')[-1] + elif file is None: + raise ValueError( + 'file_format must be specified since file is None') + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + handler = file_handlers[file_format] + if file is None: + return handler.dump_to_str(obj, **kwargs) + elif is_str(file): + file_client = FileClient.infer_client(file_client_args, file) + if handler.str_like: + with StringIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_client.put_text(f.getvalue(), file) + else: + with BytesIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_client.put(f.getvalue(), file) + elif hasattr(file, 'write'): + handler.dump_to_fileobj(obj, file, **kwargs) + else: + raise TypeError('"file" must be a filename str or a file-object') + + +def _register_handler(handler, file_formats): + """Register a handler for some file extensions. + + Args: + handler (:obj:`BaseFileHandler`): Handler to be registered. + file_formats (str or list[str]): File formats to be handled by this + handler. 
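+
+    The example below is an illustrative sketch; ``TxtHandler`` is a
+    hypothetical handler, not one shipped with this module.
+
+    Examples:
+        >>> class TxtHandler(BaseFileHandler):
+        ...     def load_from_fileobj(self, file):
+        ...         return file.read()
+        ...     def dump_to_fileobj(self, obj, file):
+        ...         file.write(str(obj))
+        ...     def dump_to_str(self, obj):
+        ...         return str(obj)
+        >>> _register_handler(TxtHandler(), 'txt')
+        >>> # 'load' and 'dump' now dispatch '.txt' files to TxtHandler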
+ """ + if not isinstance(handler, BaseFileHandler): + raise TypeError( + f'handler must be a child of BaseFileHandler, not {type(handler)}') + if isinstance(file_formats, str): + file_formats = [file_formats] + if not is_list_of(file_formats, str): + raise TypeError('file_formats must be a str or a list of str') + for ext in file_formats: + file_handlers[ext] = handler + + +def register_handler(file_formats, **kwargs): + + def wrap(cls): + _register_handler(cls(**kwargs), file_formats) + return cls + + return wrap diff --git a/mmcv/fileio/parse.py b/mmcv/fileio/parse.py new file mode 100644 index 0000000..f60f0d6 --- /dev/null +++ b/mmcv/fileio/parse.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from io import StringIO + +from .file_client import FileClient + + +def list_from_file(filename, + prefix='', + offset=0, + max_num=0, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a list of strings. + + Note: + In v1.3.16 and later, ``list_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a list for strings. + + Args: + filename (str): Filename. + prefix (str): The prefix to be inserted to the beginning of each item. + offset (int): The offset of lines. + max_num (int): The maximum number of lines to be read, + zeros and negatives mean no limitation. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> list_from_file('/path/of/your/file') # disk + ['hello', 'world'] + >>> list_from_file('s3://path/of/your/file') # ceph or petrel + ['hello', 'world'] + + Returns: + list[str]: A list of strings. + """ + cnt = 0 + item_list = [] + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for _ in range(offset): + f.readline() + for line in f: + if 0 < max_num <= cnt: + break + item_list.append(prefix + line.rstrip('\n\r')) + cnt += 1 + return item_list + + +def dict_from_file(filename, + key_type=str, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a dict. + + Each line of the text file will be two or more columns split by + whitespaces or tabs. The first column will be parsed as dict keys, and + the following columns will be parsed as dict values. + + Note: + In v1.3.16 and later, ``dict_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a dict. + + Args: + filename(str): Filename. + key_type(type): Type of the dict keys. str is user by default and + type conversion will be performed if specified. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dict_from_file('/path/of/your/file') # disk + {'key1': 'value1', 'key2': 'value2'} + >>> dict_from_file('s3://path/of/your/file') # ceph or petrel + {'key1': 'value1', 'key2': 'value2'} + + Returns: + dict: The parsed contents. 
+ """ + mapping = {} + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for line in f: + items = line.rstrip('\n').split() + assert len(items) >= 2 + key = key_type(items[0]) + val = items[1:] if len(items) > 2 else items[1] + mapping[key] = val + return mapping diff --git a/mmcv/image/__init__.py b/mmcv/image/__init__.py new file mode 100644 index 0000000..5d7edb2 --- /dev/null +++ b/mmcv/image/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .geometric import (cutout, imcrop, imflip, imflip_, impad, + impad_to_multiple, imrescale, imresize, imresize_like, + imresize_to_multiple, imrotate, imshear, imtranslate, + rescale_size) +from .io import imfrombytes, imread, imwrite, supported_backends, use_backend +from .photometric import (adjust_brightness, adjust_color, adjust_contrast, + adjust_lighting, adjust_sharpness, auto_contrast, + clahe, imdenormalize, imequalize, iminvert, + imnormalize, imnormalize_, lut_transform, posterize, + solarize) +from .misc import tensor2imgs +from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, + gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, + rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) +# __all__ = [ +# 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', +# 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', +# 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', +# 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', +# 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', +# 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', +# 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', +# 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', +# 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', +# 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' +# ] diff --git a/mmcv/image/colorspace.py b/mmcv/image/colorspace.py new file mode 100644 index 0000000..8145339 --- /dev/null +++ b/mmcv/image/colorspace.py @@ -0,0 +1,306 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + + +def imconvert(img, src, dst): + """Convert an image from the src colorspace to dst colorspace. + + Args: + img (ndarray): The input image. + src (str): The source colorspace, e.g., 'rgb', 'hsv'. + dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. + + Returns: + ndarray: The converted image. + """ + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + out_img = cv2.cvtColor(img, code) + return out_img + + +def bgr2gray(img, keepdim=False): + """Convert a BGR image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def rgb2gray(img, keepdim=False): + """Convert a RGB image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. 
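+
+    The example below is an illustrative sketch.
+
+    Examples:
+        >>> img = np.zeros((4, 4, 3), dtype=np.uint8)
+        >>> rgb2gray(img).shape
+        (4, 4)
+        >>> rgb2gray(img, keepdim=True).shape
+        (4, 4, 1)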
+ """ + out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def gray2bgr(img): + """Convert a grayscale image to BGR image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted BGR image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + return out_img + + +def gray2rgb(img): + """Convert a grayscale image to RGB image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted RGB image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) + return out_img + + +def _convert_input_type_range(img): + """Convert the type and range of the input image. + + It converts the input image to np.float32 type and range of [0, 1]. + It is mainly used for pre-processing the input image in colorspace + conversion functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + (ndarray): The converted image with type of np.float32 and range of + [0, 1]. + """ + img_type = img.dtype + img = img.astype(np.float32) + if img_type == np.float32: + pass + elif img_type == np.uint8: + img /= 255. + else: + raise TypeError('The img type should be np.float32 or np.uint8, ' + f'but got {img_type}') + return img + + +def _convert_output_type_range(img, dst_type): + """Convert the type and range of the image according to dst_type. + + It converts the image to desired type and range. If `dst_type` is np.uint8, + images will be converted to np.uint8 type with range [0, 255]. If + `dst_type` is np.float32, it converts the image to np.float32 type with + range [0, 1]. + It is mainly used for post-processing images in colorspace conversion + functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The image to be converted with np.float32 type and + range [0, 255]. + dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it + converts the image to np.uint8 type with range [0, 255]. If + dst_type is np.float32, it converts the image to np.float32 type + with range [0, 1]. + + Returns: + (ndarray): The converted image with desired type and range. + """ + if dst_type not in (np.uint8, np.float32): + raise TypeError('The dst_type should be np.float32 or np.uint8, ' + f'but got {dst_type}') + if dst_type == np.uint8: + img = img.round() + else: + img /= 255. + return img.astype(dst_type) + + +def rgb2ycbcr(img, y_only=False): + """Convert a RGB image to YCbCr image. + + This function produces the same results as Matlab's `rgb2ycbcr` function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. 
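+
+    The example below is an illustrative sketch showing that the input dtype
+    is preserved.
+
+    Examples:
+        >>> img = np.zeros((2, 2, 3), dtype=np.uint8)
+        >>> rgb2ycbcr(img).dtype
+        dtype('uint8')
+        >>> rgb2ycbcr(img.astype(np.float32) / 255., y_only=True).shape
+        (2, 2)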
+ """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 + else: + out_img = np.matmul( + img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], + [24.966, 112.0, -18.214]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def bgr2ycbcr(img, y_only=False): + """Convert a BGR image to YCbCr image. + + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2rgb(img): + """Convert a YCbCr image to RGB image. + + This function produces the same results as Matlab's ycbcr2rgb function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted RGB image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0, -0.00153632, 0.00791071], + [0.00625893, -0.00318811, 0]]) * 255.0 + [ + -222.921, 135.576, -276.836 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2bgr(img): + """Convert a YCbCr image to BGR image. + + The bgr version of ycbcr2rgb. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted BGR image. The output image has the same type + and range as input image. 
+ """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0.00791071, -0.00153632, 0], + [0, -0.00318811, 0.00625893]]) * 255.0 + [ + -276.836, 135.576, -222.921 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def convert_color_factory(src, dst): + + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + + def convert_color(img): + out_img = cv2.cvtColor(img, code) + return out_img + + convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} + image. + + Args: + img (ndarray or str): The input image. + + Returns: + ndarray: The converted {dst.upper()} image. + """ + + return convert_color + + +bgr2rgb = convert_color_factory('bgr', 'rgb') + +rgb2bgr = convert_color_factory('rgb', 'bgr') + +bgr2hsv = convert_color_factory('bgr', 'hsv') + +hsv2bgr = convert_color_factory('hsv', 'bgr') + +bgr2hls = convert_color_factory('bgr', 'hls') + +hls2bgr = convert_color_factory('hls', 'bgr') diff --git a/mmcv/image/geometric.py b/mmcv/image/geometric.py new file mode 100644 index 0000000..cf97c20 --- /dev/null +++ b/mmcv/image/geometric.py @@ -0,0 +1,728 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers + +import cv2 +import numpy as np + +from ..utils import to_2tuple +from .io import imread_backend + +try: + from PIL import Image +except ImportError: + Image = None + + +def _scale_size(size, scale): + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | tuple(float)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) + + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + +if Image is not None: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image to a given size. + + Args: + img (ndarray): The input image. + size (tuple[int]): Target size (w, h). + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' 
+ f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +def imresize_to_multiple(img, + divisor, + size=None, + scale_factor=None, + keep_ratio=False, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image according to a given size or scale factor and then rounds + up the the resized or rescaled image size to the nearest value that can be + divided by the divisor. + + Args: + img (ndarray): The input image. + divisor (int | tuple): Resized image size will be a multiple of + divisor. If divisor is a tuple, divisor should be + (w_divisor, h_divisor). + size (None | int | tuple[int]): Target size (w, h). Default: None. + scale_factor (None | float | tuple[float]): Multiplier for spatial + size. Should match input size if it is a tuple and the 2D style is + (w_scale_factor, h_scale_factor). Default: None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Default: False. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if size is not None and scale_factor is not None: + raise ValueError('only one of size or scale_factor should be defined') + elif size is None and scale_factor is None: + raise ValueError('one of size or scale_factor should be defined') + elif size is not None: + size = to_2tuple(size) + if keep_ratio: + size = rescale_size((w, h), size, return_scale=False) + else: + size = _scale_size((w, h), scale_factor) + + divisor = to_2tuple(divisor) + size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) + resized_img, w_scale, h_scale = imresize( + img, + size, + return_scale=True, + interpolation=interpolation, + out=out, + backend=backend) + if return_scale: + return resized_img, w_scale, h_scale + else: + return resized_img + + +def imresize_like(img, + dst_img, + return_scale=False, + interpolation='bilinear', + backend=None): + """Resize image to the same size of a given image. + + Args: + img (ndarray): The input image. + dst_img (ndarray): The target image. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = dst_img.shape[:2] + return imresize(img, (w, h), return_scale, interpolation, backend=backend) + + +def rescale_size(old_size, scale, return_scale=False): + """Calculate the new size to be rescaled to. 
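+
+    For example (illustrative), ``rescale_size((1000, 800), 0.5)`` returns
+    ``(500, 400)``, while ``rescale_size((1000, 800), (800, 600))`` returns
+    ``(750, 600)``, i.e. the largest size with the same aspect ratio that
+    fits within the given maximum size.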
+ + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. + """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale(img, + scale, + return_scale=False, + interpolation='bilinear', + backend=None): + """Resize image while keeping the aspect ratio. + + Args: + img (ndarray): The input image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +def imflip(img, direction='horizontal'): + """Flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image. + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return np.flip(img, axis=1) + elif direction == 'vertical': + return np.flip(img, axis=0) + else: + return np.flip(img, axis=(0, 1)) + + +def imflip_(img, direction='horizontal'): + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def imrotate(img, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear', + auto_bound=False): + """Rotate an image. + + Args: + img (ndarray): Image to be rotated. + angle (float): Rotation angle in degrees, positive values mean + clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. + scale (float): Isotropic scale factor. + border_value (int): Border value. 
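`imrescale` differs from `imresize` in that it always preserves the aspect ratio, either by a plain ratio or by fitting the image inside a maximum `(long_edge, short_edge)` box. A short sketch under the same assumed `mmcv.image` import path:

```python
import numpy as np
from mmcv.image import imrescale, imflip  # assumed export path, as in upstream mmcv

img = (np.random.rand(600, 800, 3) * 255).astype(np.uint8)

# Scale by a ratio: the aspect ratio is always preserved.
half = imrescale(img, 0.5)                      # -> (300, 400, 3)

# Scale into a max-size box: the image is made as large as possible
# while the long edge stays <= 1333 and the short edge stays <= 800.
fitted, scale = imrescale(img, (1333, 800), return_scale=True)

# imflip returns a flipped view via np.flip; use imflip_ for an in-place cv2 flip.
flipped = imflip(img, direction='horizontal')
print(half.shape, fitted.shape, scale, flipped.shape)
```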
+ interpolation (str): Same as :func:`resize`. + auto_bound (bool): Whether to adjust the image size to cover the whole + rotated image. + + Returns: + ndarray: The rotated image. + """ + if center is not None and auto_bound: + raise ValueError('`auto_bound` conflicts with `center`') + h, w = img.shape[:2] + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + assert isinstance(center, tuple) + + matrix = cv2.getRotationMatrix2D(center, -angle, scale) + if auto_bound: + cos = np.abs(matrix[0, 0]) + sin = np.abs(matrix[0, 1]) + new_w = h * sin + w * cos + new_h = h * cos + w * sin + matrix[0, 2] += (new_w - w) * 0.5 + matrix[1, 2] += (new_h - h) * 0.5 + w = int(np.round(new_w)) + h = int(np.round(new_h)) + rotated = cv2.warpAffine( + img, + matrix, (w, h), + flags=cv2_interp_codes[interpolation], + borderValue=border_value) + return rotated + + +def bbox_clip(bboxes, img_shape): + """Clip bboxes to fit the image shape. + + Args: + bboxes (ndarray): Shape (..., 4*k) + img_shape (tuple[int]): (height, width) of the image. + + Returns: + ndarray: Clipped bboxes. + """ + assert bboxes.shape[-1] % 4 == 0 + cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) + cmin[0::2] = img_shape[1] - 1 + cmin[1::2] = img_shape[0] - 1 + clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) + return clipped_bboxes + + +def bbox_scaling(bboxes, scale, clip_shape=None): + """Scaling bboxes w.r.t the box center. + + Args: + bboxes (ndarray): Shape(..., 4). + scale (float): Scaling factor. + clip_shape (tuple[int], optional): If specified, bboxes that exceed the + boundary will be clipped according to the given shape (h, w). + + Returns: + ndarray: Scaled bboxes. + """ + if float(scale) == 1.0: + scaled_bboxes = bboxes.copy() + else: + w = bboxes[..., 2] - bboxes[..., 0] + 1 + h = bboxes[..., 3] - bboxes[..., 1] + 1 + dw = (w * (scale - 1)) * 0.5 + dh = (h * (scale - 1)) * 0.5 + scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) + if clip_shape is not None: + return bbox_clip(scaled_bboxes, clip_shape) + else: + return scaled_bboxes + + +def imcrop(img, bboxes, scale=1.0, pad_fill=None): + """Crop image patches. + + 3 steps: scale the bboxes -> clip bboxes -> crop and pad. + + Args: + img (ndarray): Image to be cropped. + bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. + scale (float, optional): Scale ratio of bboxes, the default value + 1.0 means no padding. + pad_fill (Number | list[Number]): Value to be filled for padding. + Default: None, which means no padding. + + Returns: + list[ndarray] | ndarray: The cropped image patches. + """ + chn = 1 if img.ndim == 2 else img.shape[2] + if pad_fill is not None: + if isinstance(pad_fill, (int, float)): + pad_fill = [pad_fill for _ in range(chn)] + assert len(pad_fill) == chn + + _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes + scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) + clipped_bbox = bbox_clip(scaled_bboxes, img.shape) + + patches = [] + for i in range(clipped_bbox.shape[0]): + x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) + if pad_fill is None: + patch = img[y1:y2 + 1, x1:x2 + 1, ...] 
+ else: + _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) + if chn == 1: + patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1) + else: + patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn) + patch = np.array( + pad_fill, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + x_start = 0 if _x1 >= 0 else -_x1 + y_start = 0 if _y1 >= 0 else -_y1 + w = x2 - x1 + 1 + h = y2 - y1 + 1 + patch[y_start:y_start + h, x_start:x_start + w, + ...] = img[y1:y1 + h, x1:x1 + w, ...] + patches.append(patch) + + if bboxes.ndim == 1: + return patches[0] + else: + return patches + + +def impad(img, + *, + shape=None, + padding=None, + pad_val=0, + padding_mode='constant'): + """Pad the given image to a certain shape or pad on all sides with + specified padding mode and padding value. + + Args: + img (ndarray): Image to be padded. + shape (tuple[int]): Expected padding shape (h, w). Default: None. + padding (int or tuple[int]): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 is + provided this is the padding on left/right and top/bottom + respectively. If a tuple of length 4 is provided this is the + padding for the left, top, right and bottom borders respectively. + Default: None. Note that `shape` and `padding` can not be both + set. + pad_val (Number | Sequence[Number]): Values to be filled in padding + areas when padding_mode is 'constant'. Default: 0. + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Default: constant. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the + last value on the edge. For example, padding [1, 2, 3, 4] + with 2 elements on both sides in reflect mode will result + in [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with + 2 elements on both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + ndarray: The padded image. + """ + + assert (shape is not None) ^ (padding is not None) + if shape is not None: + padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) + + # check pad_val + if isinstance(pad_val, tuple): + assert len(pad_val) == img.shape[-1] + elif not isinstance(pad_val, numbers.Number): + raise TypeError('pad_val must be a int or a tuple. ' + f'But received {type(pad_val)}') + + # check padding + if isinstance(padding, tuple) and len(padding) in [2, 4]: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + elif isinstance(padding, numbers.Number): + padding = (padding, padding, padding, padding) + else: + raise ValueError('Padding must be a int or a 2, or 4 element tuple.' + f'But received {padding}') + + # check padding mode + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + + border_type = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT + } + img = cv2.copyMakeBorder( + img, + padding[1], + padding[3], + padding[0], + padding[2], + border_type[padding_mode], + value=pad_val) + + return img + + +def impad_to_multiple(img, divisor, pad_val=0): + """Pad an image to ensure each edge to be multiple to some number. + + Args: + img (ndarray): Image to be padded. + divisor (int): Padded image edges will be multiple to divisor. 
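`imrotate`, `impad` and `imcrop` together handle rotation, fixed-shape or per-border padding, and bbox-based cropping with optional out-of-image fill. A sketch, again assuming the helpers are exposed under `mmcv.image`:

```python
import numpy as np
from mmcv.image import imrotate, impad, imcrop  # assumed export path

img = (np.random.rand(100, 200, 3) * 255).astype(np.uint8)

# Rotate by 30 degrees and enlarge the canvas so nothing is cut off.
rotated = imrotate(img, 30, auto_bound=True)

# Pad to a fixed (h, w) shape, or pad each border explicitly.
padded = impad(img, shape=(128, 256), pad_val=0)
bordered = impad(img, padding=(4, 4, 4, 4), padding_mode='reflect')

# Crop two boxes given as (x1, y1, x2, y2); area outside the image is filled with 127.
bboxes = np.array([[10, 10, 59, 59], [150, 40, 220, 90]])
patches = imcrop(img, bboxes, scale=1.0, pad_fill=127)
print(rotated.shape, padded.shape, bordered.shape, [p.shape for p in patches])
```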
+ pad_val (Number | Sequence[Number]): Same as :func:`impad`. + + Returns: + ndarray: The padded image. + """ + pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor + pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor + return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) + + +def cutout(img, shape, pad_val=0): + """Randomly cut out a rectangle from the original img. + + Args: + img (ndarray): Image to be cutout. + shape (int | tuple[int]): Expected cutout shape (h, w). If given as a + int, the value will be used for both h and w. + pad_val (int | float | tuple[int | float]): Values to be filled in the + cut area. Defaults to 0. + + Returns: + ndarray: The cutout image. + """ + + channels = 1 if img.ndim == 2 else img.shape[2] + if isinstance(shape, int): + cut_h, cut_w = shape, shape + else: + assert isinstance(shape, tuple) and len(shape) == 2, \ + f'shape must be a int or a tuple with length 2, but got type ' \ + f'{type(shape)} instead.' + cut_h, cut_w = shape + if isinstance(pad_val, (int, float)): + pad_val = tuple([pad_val] * channels) + elif isinstance(pad_val, tuple): + assert len(pad_val) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(pad_val), channels) + else: + raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') + + img_h, img_w = img.shape[:2] + y0 = np.random.uniform(img_h) + x0 = np.random.uniform(img_w) + + y1 = int(max(0, y0 - cut_h / 2.)) + x1 = int(max(0, x0 - cut_w / 2.)) + y2 = min(img_h, y1 + cut_h) + x2 = min(img_w, x1 + cut_w) + + if img.ndim == 2: + patch_shape = (y2 - y1, x2 - x1) + else: + patch_shape = (y2 - y1, x2 - x1, channels) + + img_cutout = img.copy() + patch = np.array( + pad_val, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + img_cutout[y1:y2, x1:x2, ...] = patch + + return img_cutout + + +def _get_shear_matrix(magnitude, direction='horizontal'): + """Generate the shear matrix for transformation. + + Args: + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + + Returns: + ndarray: The shear matrix with dtype float32. + """ + if direction == 'horizontal': + shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) + elif direction == 'vertical': + shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) + return shear_matrix + + +def imshear(img, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear an image. + + Args: + img (ndarray): Image to be sheared with format (h, w) + or (h, w, c). + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The sheared image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. 
Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`') + shear_matrix = _get_shear_matrix(magnitude, direction) + sheared = cv2.warpAffine( + img, + shear_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. shearing masks whose channels large + # than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return sheared + + +def _get_translate_matrix(offset, direction='horizontal'): + """Generate the translate matrix. + + Args: + offset (int | float): The offset used for translate. + direction (str): The translate direction, either + "horizontal" or "vertical". + + Returns: + ndarray: The translate matrix with dtype float32. + """ + if direction == 'horizontal': + translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) + elif direction == 'vertical': + translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) + return translate_matrix + + +def imtranslate(img, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Translate an image. + + Args: + img (ndarray): Image to be translated with format + (h, w) or (h, w, c). + offset (int | float): The offset used for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The translated image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`.') + translate_matrix = _get_translate_matrix(offset, direction) + translated = cv2.warpAffine( + img, + translate_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. translating masks whose channels + # large than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return translated diff --git a/mmcv/image/io.py b/mmcv/image/io.py new file mode 100644 index 0000000..69369f0 --- /dev/null +++ b/mmcv/image/io.py @@ -0,0 +1,262 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
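Before the I/O helpers below, a short sketch of the remaining geometric augmentations defined above (`cutout`, `imshear`, `imtranslate`); the `mmcv.image` import path is assumed as before:

```python
import numpy as np
from mmcv.image import cutout, imshear, imtranslate  # assumed export path

img = (np.random.rand(128, 128, 3) * 255).astype(np.uint8)

# Cut a random 32x32 rectangle and fill it with a fixed per-channel value.
cut = cutout(img, 32, pad_val=(124, 117, 104))

# Horizontal shear with magnitude 0.3; border pixels are filled with 0.
sheared = imshear(img, 0.3, direction='horizontal', border_value=0)

# Shift the image 10 pixels along the vertical axis.
shifted = imtranslate(img, 10, direction='vertical')
print(cut.shape, sheared.shape, shifted.shape)
```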
+import io +import os.path as osp +from pathlib import Path + +import cv2 +import numpy as np +from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, + IMREAD_UNCHANGED) + +from mmcv.utils import check_file_exist, is_str, mkdir_or_exist + +try: + from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG +except ImportError: + TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None + +try: + from PIL import Image, ImageOps +except ImportError: + Image = None + +try: + import tifffile +except ImportError: + tifffile = None + +jpeg = None +supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] + +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED, + 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, + 'grayscale_ignore_orientation': + IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE +} + +imread_backend = 'cv2' + + +def use_backend(backend): + """Select a backend for image decoding. + + Args: + backend (str): The image decoding backend type. Options are `cv2`, + `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) + and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` + file format. + """ + assert backend in supported_backends + global imread_backend + imread_backend = backend + if imread_backend == 'turbojpeg': + if TurboJPEG is None: + raise ImportError('`PyTurboJPEG` is not installed') + global jpeg + if jpeg is None: + jpeg = TurboJPEG() + elif imread_backend == 'pillow': + if Image is None: + raise ImportError('`Pillow` is not installed') + elif imread_backend == 'tifffile': + if tifffile is None: + raise ImportError('`tifffile` is not installed') + + +def _jpegflag(flag='color', channel_order='bgr'): + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'color': + if channel_order == 'bgr': + return TJPF_BGR + elif channel_order == 'rgb': + return TJCS_RGB + elif flag == 'grayscale': + return TJPF_GRAY + else: + raise ValueError('flag must be "color" or "grayscale"') + + +def _pillow2array(img, flag='color', channel_order='bgr'): + """Convert a pillow image to numpy array. + + Args: + img (:obj:`PIL.Image.Image`): The image loaded using PIL + flag (str): Flags specifying the color type of a loaded image, + candidates are 'color', 'grayscale' and 'unchanged'. + Default to 'color'. + channel_order (str): The channel order of the output image array, + candidates are 'bgr' and 'rgb'. Default to 'bgr'. + + Returns: + np.ndarray: The converted numpy array + """ + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # Handle exif orientation tag + if flag in ['color', 'grayscale']: + img = ImageOps.exif_transpose(img) + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. 
+ # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag in ['color', 'color_ignore_orientation']: + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag in ['grayscale', 'grayscale_ignore_orientation']: + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale", "unchanged", ' + f'"color_ignore_orientation" or "grayscale_ignore_orientation"' + f' but got {flag}') + return array + + +def imread(img_or_path, flag='color', channel_order='bgr', backend=None): + """Read an image. + + Args: + img_or_path (ndarray or str or Path): Either a numpy array or str or + pathlib.Path. If it is a numpy array (loaded image), then + it will be returned as is. + flag (str): Flags specifying the color type of a loaded image, + candidates are `color`, `grayscale`, `unchanged`, + `color_ignore_orientation` and `grayscale_ignore_orientation`. + By default, `cv2` and `pillow` backend would rotate the image + according to its EXIF info unless called with `unchanged` or + `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend + always ignore image's EXIF info regardless of the flag. + The `turbojpeg` backend only supports `color` and `grayscale`. + channel_order (str): Order of channel, candidates are `bgr` and `rgb`. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. + If backend is None, the global imread_backend specified by + ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + ndarray: Loaded image array. + """ + + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError(f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow'") + if isinstance(img_or_path, Path): + img_or_path = str(img_or_path) + + if isinstance(img_or_path, np.ndarray): + return img_or_path + elif is_str(img_or_path): + check_file_exist(img_or_path, + f'img file does not exist: {img_or_path}') + if backend == 'turbojpeg': + with open(img_or_path, 'rb') as in_file: + img = jpeg.decode(in_file.read(), + _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + img = Image.open(img_or_path) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + img = tifffile.imread(img_or_path) + return img + else: + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imread(img_or_path, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + else: + raise TypeError('"img" must be a numpy array or a str or ' + 'a pathlib.Path object') + + +def imfrombytes(content, flag='color', channel_order='bgr', backend=None): + """Read an image from bytes. + + Args: + content (bytes): Image bytes got from files or other streams. + flag (str): Same as :func:`imread`. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the + global imread_backend specified by ``mmcv.use_backend()`` will be + used. Default: None. + + Returns: + ndarray: Loaded image array. 
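`imread` selects the decoding backend and channel order per call, while `use_backend` switches the global default. A usage sketch; `demo.jpg` is a placeholder path, and the top-level `mmcv` re-export of these functions is an assumption carried over from upstream mmcv:

```python
import mmcv  # assumes the top-level package re-exports the io helpers, as upstream mmcv does

# Decode with the default cv2 backend, BGR channel order.
img_bgr = mmcv.imread('demo.jpg')                       # placeholder path

# Ask for RGB directly, or a grayscale decode.
img_rgb = mmcv.imread('demo.jpg', channel_order='rgb')
img_gray = mmcv.imread('demo.jpg', flag='grayscale')

# Switch the global decoding backend (e.g. to Pillow) for subsequent imread calls.
mmcv.use_backend('pillow')
img_pil = mmcv.imread('demo.jpg')
mmcv.use_backend('cv2')
```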
+ """ + + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError(f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow'") + if backend == 'turbojpeg': + img = jpeg.decode(content, _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + buff = io.BytesIO(content) + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) + return img + else: + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + +def imwrite(img, file_path, params=None, auto_mkdir=True): + """Write image to file. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. + + Returns: + bool: Successful or not. + """ + if auto_mkdir: + dir_name = osp.abspath(osp.dirname(file_path)) + mkdir_or_exist(dir_name) + return cv2.imwrite(file_path, img, params) + + + + diff --git a/mmcv/image/misc.py b/mmcv/image/misc.py new file mode 100644 index 0000000..a52304a --- /dev/null +++ b/mmcv/image/misc.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.image import imdenormalize + +try: + import torch +except ImportError: + torch = None + + +def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): + """Convert tensor to 3-channel images. + + Args: + tensor (torch.Tensor): Tensor that contains multiple images, shape ( + N, C, H, W). + mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). + std (tuple[float], optional): Standard deviation of images. + Defaults to (1, 1, 1). + to_rgb (bool, optional): Whether the tensor was converted to RGB + format in the first place. If so, convert it back to BGR. + Defaults to True. + + Returns: + list[np.ndarray]: A list that contains multiple images. + """ + + if torch is None: + raise RuntimeError('pytorch is not installed') + assert torch.is_tensor(tensor) and tensor.ndim == 4 + assert len(mean) == 3 + assert len(std) == 3 + + num_imgs = tensor.size(0) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + imgs = [] + for img_id in range(num_imgs): + img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) + img = imdenormalize( + img, mean, std, to_bgr=to_rgb).astype(np.uint8) + imgs.append(np.ascontiguousarray(img)) + return imgs diff --git a/mmcv/image/photometric.py b/mmcv/image/photometric.py new file mode 100644 index 0000000..5085d01 --- /dev/null +++ b/mmcv/image/photometric.py @@ -0,0 +1,428 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + +from ..utils import is_tuple_of +from .colorspace import bgr2gray, gray2bgr + + +def imnormalize(img, mean, std, to_rgb=True): + """Normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. 
+ """ + img = img.copy().astype(np.float32) + return imnormalize_(img, mean, std, to_rgb) + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +def imdenormalize(img, mean, std, to_bgr=True): + assert img.dtype != np.uint8 + mean = mean.reshape(1, -1).astype(np.float64) + std = std.reshape(1, -1).astype(np.float64) + img = cv2.multiply(img, std) # make a copy + cv2.add(img, mean, img) # inplace + if to_bgr: + cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace + return img + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +def solarize(img, thr=128): + """Solarize an image (invert all pixel values above a threshold) + + Args: + img (ndarray): Image to be solarized. + thr (int): Threshold for solarizing (0 - 255). + + Returns: + ndarray: The solarized image. + """ + img = np.where(img < thr, img, 255 - img) + return img + + +def posterize(img, bits): + """Posterize an image (reduce the number of bits for each color channel) + + Args: + img (ndarray): Image to be posterized. + bits (int): Number of bits (1 to 8) to use for posterizing. + + Returns: + ndarray: The posterized image. + """ + shift = 8 - bits + img = np.left_shift(np.right_shift(img, shift), shift) + return img + + +def adjust_color(img, alpha=1, beta=None, gamma=0): + r"""It blends the source image and its gray image: + + .. math:: + output = img * alpha + gray\_img * beta + gamma + + Args: + img (ndarray): The input source image. + alpha (int | float): Weight for the source image. Default 1. + beta (int | float): Weight for the converted gray image. + If None, it's assigned the value (1 - `alpha`). + gamma (int | float): Scalar added to each sum. + Same as :func:`cv2.addWeighted`. Default 0. + + Returns: + ndarray: Colored image which has the same size and dtype as input. + """ + gray_img = bgr2gray(img) + gray_img = np.tile(gray_img[..., None], [1, 1, 3]) + if beta is None: + beta = 1 - alpha + colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) + if not colored_img.dtype == np.uint8: + # Note when the dtype of `img` is not the default `np.uint8` + # (e.g. np.float32), the value in `colored_img` got from cv2 + # is not guaranteed to be in range [0, 255], so here clip + # is needed. + colored_img = np.clip(colored_img, 0, 255) + return colored_img + + +def imequalize(img): + """Equalize the image histogram. + + This function applies a non-linear mapping to the input image, + in order to create a uniform distribution of grayscale values + in the output image. + + Args: + img (ndarray): Image to be equalized. + + Returns: + ndarray: The equalized image. + """ + + def _scale_channel(im, c): + """Scale the data in the corresponding channel.""" + im = im[:, :, c] + # Compute the histogram of the image channel. 
+ histo = np.histogram(im, 256, (0, 255))[0] + # For computing the step, filter out the nonzeros. + nonzero_histo = histo[histo > 0] + step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 + if not step: + lut = np.array(range(256)) + else: + # Compute the cumulative sum, shifted by step // 2 + # and then normalized by step. + lut = (np.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = np.concatenate([[0], lut[:-1]], 0) + # handle potential integer overflow + lut[lut > 255] = 255 + # If step is zero, return the original image. + # Otherwise, index from lut. + return np.where(np.equal(step, 0), im, lut[im]) + + # Scales each channel independently and then stacks + # the result. + s1 = _scale_channel(img, 0) + s2 = _scale_channel(img, 1) + s3 = _scale_channel(img, 2) + equalized_img = np.stack([s1, s2, s3], axis=-1) + return equalized_img.astype(img.dtype) + + +def adjust_brightness(img, factor=1.): + """Adjust image brightness. + + This function controls the brightness of an image. An + enhancement factor of 0.0 gives a black image. + A factor of 1.0 gives the original image. This function + blends the source image and the degenerated black image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be brightened. + factor (float): A value controls the enhancement. + Factor 1.0 returns the original image, lower + factors mean less color (brightness, contrast, + etc), and higher values more. Default 1. + + Returns: + ndarray: The brightened image. + """ + degenerated = np.zeros_like(img) + # Note manually convert the dtype to np.float32, to + # achieve as close results as PIL.ImageEnhance.Brightness. + # Set beta=1-factor, and gamma=0 + brightened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + brightened_img = np.clip(brightened_img, 0, 255) + return brightened_img.astype(img.dtype) + + +def adjust_contrast(img, factor=1.): + """Adjust image contrast. + + This function controls the contrast of an image. An + enhancement factor of 0.0 gives a solid grey + image. A factor of 1.0 gives the original image. It + blends the source image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be contrasted. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + + Returns: + ndarray: The contrasted image. + """ + gray_img = bgr2gray(img) + hist = np.histogram(gray_img, 256, (0, 255))[0] + mean = round(np.sum(gray_img) / np.sum(hist)) + degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) + degenerated = gray2bgr(degenerated) + contrasted_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + contrasted_img = np.clip(contrasted_img, 0, 255) + return contrasted_img.astype(img.dtype) + + +def auto_contrast(img, cutoff=0): + """Auto adjust image contrast. + + This function maximize (normalize) image contrast by first removing cutoff + percent of the lightest and darkest pixels from the histogram and remapping + the image so that the darkest pixel becomes black (0), and the lightest + becomes white (255). + + Args: + img (ndarray): Image to be contrasted. BGR order. + cutoff (int | float | tuple): The cutoff percent of the lightest and + darkest pixels to be removed. If given as tuple, it shall be + (low, high). Otherwise, the single value will be used for both. + Defaults to 0. 
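A quick sketch of the histogram- and blending-based adjustments above (`imequalize`, `adjust_brightness`, `adjust_contrast`); import path assumed as before:

```python
import numpy as np
from mmcv.image import imequalize, adjust_brightness, adjust_contrast  # assumed exports

img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)    # BGR order

eq = imequalize(img)                        # per-channel histogram equalization
brighter = adjust_brightness(img, 1.5)      # factor > 1 brightens, 0 gives a black image
lower_contrast = adjust_contrast(img, 0.5)  # factor < 1 pulls pixels toward the mean
print(eq.dtype, brighter.dtype, lower_contrast.dtype)
```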
+ + Returns: + ndarray: The contrasted image. + """ + + def _auto_contrast_channel(im, c, cutoff): + im = im[:, :, c] + # Compute the histogram of the image channel. + histo = np.histogram(im, 256, (0, 255))[0] + # Remove cut-off percent pixels from histo + histo_sum = np.cumsum(histo) + cut_low = histo_sum[-1] * cutoff[0] // 100 + cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 + histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low + histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) + + # Compute mapping + low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] + # If all the values have been cut off, return the origin img + if low >= high: + return im + scale = 255.0 / (high - low) + offset = -low * scale + lut = np.array(range(256)) + lut = lut * scale + offset + lut = np.clip(lut, 0, 255) + return lut[im] + + if isinstance(cutoff, (int, float)): + cutoff = (cutoff, cutoff) + else: + assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ + f'float or tuple, but got {type(cutoff)} instead.' + # Auto adjusts contrast for each channel independently and then stacks + # the result. + s1 = _auto_contrast_channel(img, 0, cutoff) + s2 = _auto_contrast_channel(img, 1, cutoff) + s3 = _auto_contrast_channel(img, 2, cutoff) + contrasted_img = np.stack([s1, s2, s3], axis=-1) + return contrasted_img.astype(img.dtype) + + +def adjust_sharpness(img, factor=1., kernel=None): + """Adjust image sharpness. + + This function controls the sharpness of an image. An + enhancement factor of 0.0 gives a blurred image. A + factor of 1.0 gives the original image. And a factor + of 2.0 gives a sharpened image. It blends the source + image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be sharpened. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + kernel (np.ndarray, optional): Filter kernel to be applied on the img + to obtain the degenerated img. Defaults to None. + + Note: + No value sanity check is enforced on the kernel set by users. So with + an inappropriate kernel, the ``adjust_sharpness`` may fail to perform + the function its name indicates but end up performing whatever + transform determined by the kernel. + + Returns: + ndarray: The sharpened image. + """ + + if kernel is None: + # adopted from PIL.ImageFilter.SMOOTH + kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 + assert isinstance(kernel, np.ndarray), \ + f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' + assert kernel.ndim == 2, \ + f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' + + degenerated = cv2.filter2D(img, -1, kernel) + sharpened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + sharpened_img = np.clip(sharpened_img, 0, 255) + return sharpened_img.astype(img.dtype) + + +def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): + """AlexNet-style PCA jitter. + + This data augmentation is proposed in `ImageNet Classification with Deep + Convolutional Neural Networks + `_. + + Args: + img (ndarray): Image to be adjusted lighting. BGR order. + eigval (ndarray): the eigenvalue of the convariance matrix of pixel + values, respectively. + eigvec (ndarray): the eigenvector of the convariance matrix of pixel + values, respectively. + alphastd (float): The standard deviation for distribution of alpha. 
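`auto_contrast` and `adjust_sharpness` follow the same per-channel / blending pattern. A short sketch under the same assumed import path:

```python
import numpy as np
from mmcv.image import auto_contrast, adjust_sharpness  # assumed exports

img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)    # BGR order

# Stretch the histogram after clipping 1% of the darkest and lightest pixels.
stretched = auto_contrast(img, cutoff=1)

# factor > 1 sharpens, factor < 1 blends toward the smoothed (degenerated) image.
sharpened = adjust_sharpness(img, factor=2.0)
print(stretched.dtype, sharpened.shape)
```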
+ Defaults to 0.1 + to_rgb (bool): Whether to convert img to rgb. + + Returns: + ndarray: The adjusted image. + """ + assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ + f'eigval and eigvec should both be of type np.ndarray, got ' \ + f'{type(eigval)} and {type(eigvec)} instead.' + + assert eigval.ndim == 1 and eigvec.ndim == 2 + assert eigvec.shape == (3, eigval.shape[0]) + n_eigval = eigval.shape[0] + assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ + f'got {type(alphastd)} instead.' + + img = img.copy().astype(np.float32) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + + alpha = np.random.normal(0, alphastd, n_eigval) + alter = eigvec \ + * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ + * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) + alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) + img_adjusted = img + alter + return img_adjusted + + +def lut_transform(img, lut_table): + """Transform array by look-up table. + + The function lut_transform fills the output array with values from the + look-up table. Indices of the entries are taken from the input array. + + Args: + img (ndarray): Image to be transformed. + lut_table (ndarray): look-up table of 256 elements; in case of + multi-channel input array, the table should either have a single + channel (in this case the same table is used for all channels) or + the same number of channels as in the input array. + + Returns: + ndarray: The transformed image. + """ + assert isinstance(img, np.ndarray) + assert 0 <= np.min(img) and np.max(img) <= 255 + assert isinstance(lut_table, np.ndarray) + assert lut_table.shape == (256, ) + + return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) + + +def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): + """Use CLAHE method to process the image. + + See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. + Graphics Gems, 1994:474-485.` for more information. + + Args: + img (ndarray): Image to be processed. + clip_limit (float): Threshold for contrast limiting. Default: 40.0. + tile_grid_size (tuple[int]): Size of grid for histogram equalization. + Input image will be divided into equally sized rectangular tiles. + It defines the number of tiles in row and column. Default: (8, 8). + + Returns: + ndarray: The processed image. + """ + assert isinstance(img, np.ndarray) + assert img.ndim == 2 + assert isinstance(clip_limit, (float, int)) + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + + clahe = cv2.createCLAHE(clip_limit, tile_grid_size) + return clahe.apply(np.array(img, dtype=np.uint8)) diff --git a/mmcv/layers/__init__.py b/mmcv/layers/__init__.py new file mode 100644 index 0000000..53f735c --- /dev/null +++ b/mmcv/layers/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from .batch_norm import get_norm +from .nms import batched_nms +from .shape_spec import ShapeSpec +from .wrappers import cat, Conv2d +from .roi_align import ROIAlign \ No newline at end of file diff --git a/mmcv/layers/aspp.py b/mmcv/layers/aspp.py new file mode 100644 index 0000000..14861aa --- /dev/null +++ b/mmcv/layers/aspp.py @@ -0,0 +1,144 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
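`clahe` expects a single-channel image, and `lut_transform` applies any 256-entry table, e.g. gamma correction. A sketch; the import path is assumed, and `bgr2gray` is the colorspace helper already imported at the top of `photometric.py`:

```python
import numpy as np
from mmcv.image import clahe, lut_transform, bgr2gray  # assumed exports

img = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)

# CLAHE operates on a single-channel (grayscale) image.
gray = bgr2gray(img)
enhanced = clahe(gray, clip_limit=40.0, tile_grid_size=(8, 8))

# A 256-entry look-up table, here implementing simple gamma correction.
gamma = 2.2
lut = ((np.arange(256) / 255.0) ** (1.0 / gamma) * 255).astype(np.uint8)
corrected = lut_transform(img, lut)
print(enhanced.shape, corrected.shape)
```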
+ +from copy import deepcopy +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from .batch_norm import get_norm +from .blocks import DepthwiseSeparableConv2d +from .wrappers import Conv2d + + +class ASPP(nn.Module): + """ + Atrous Spatial Pyramid Pooling (ASPP). + """ + + def __init__( + self, + in_channels, + out_channels, + dilations, + *, + norm, + activation, + pool_kernel_size=None, + dropout: float = 0.0, + use_depthwise_separable_conv=False, + ): + """ + Args: + in_channels (int): number of input channels for ASPP. + out_channels (int): number of output channels. + dilations (list): a list of 3 dilations in ASPP. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. norm is + applied to all conv layers except the conv following + global average pooling. + activation (callable): activation function. + pool_kernel_size (tuple, list): the average pooling size (kh, kw) + for image pooling layer in ASPP. If set to None, it always + performs global average pooling. If not None, it must be + divisible by the shape of inputs in forward(). It is recommended + to use a fixed input feature size in training, and set this + option to match this size, so that it performs global average + pooling in training, and the size of the pooling window stays + consistent in inference. + dropout (float): apply dropout on the output of ASPP. It is used in + the official DeepLab implementation with a rate of 0.1: + https://github.com/tensorflow/models/blob/21b73d22f3ed05b650e85ac50849408dd36de32e/research/deeplab/model.py#L532 # noqa + use_depthwise_separable_conv (bool): use DepthwiseSeparableConv2d + for 3x3 convs in ASPP, proposed in :paper:`DeepLabV3+`. + """ + super(ASPP, self).__init__() + assert len(dilations) == 3, "ASPP expects 3 dilations, got {}".format(len(dilations)) + self.pool_kernel_size = pool_kernel_size + self.dropout = dropout + use_bias = norm == "" + self.convs = nn.ModuleList() + # conv 1x1 + self.convs.append( + Conv2d( + in_channels, + out_channels, + kernel_size=1, + bias=use_bias, + norm=get_norm(norm, out_channels), + activation=deepcopy(activation), + ) + ) + weight_init.c2_xavier_fill(self.convs[-1]) + # atrous convs + for dilation in dilations: + if use_depthwise_separable_conv: + self.convs.append( + DepthwiseSeparableConv2d( + in_channels, + out_channels, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm1=norm, + activation1=deepcopy(activation), + norm2=norm, + activation2=deepcopy(activation), + ) + ) + else: + self.convs.append( + Conv2d( + in_channels, + out_channels, + kernel_size=3, + padding=dilation, + dilation=dilation, + bias=use_bias, + norm=get_norm(norm, out_channels), + activation=deepcopy(activation), + ) + ) + weight_init.c2_xavier_fill(self.convs[-1]) + # image pooling + # We do not add BatchNorm because the spatial resolution is 1x1, + # the original TF implementation has BatchNorm. 
+ if pool_kernel_size is None: + image_pooling = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)), + ) + else: + image_pooling = nn.Sequential( + nn.AvgPool2d(kernel_size=pool_kernel_size, stride=1), + Conv2d(in_channels, out_channels, 1, bias=True, activation=deepcopy(activation)), + ) + weight_init.c2_xavier_fill(image_pooling[1]) + self.convs.append(image_pooling) + + self.project = Conv2d( + 5 * out_channels, + out_channels, + kernel_size=1, + bias=use_bias, + norm=get_norm(norm, out_channels), + activation=deepcopy(activation), + ) + weight_init.c2_xavier_fill(self.project) + + def forward(self, x): + size = x.shape[-2:] + if self.pool_kernel_size is not None: + if size[0] % self.pool_kernel_size[0] or size[1] % self.pool_kernel_size[1]: + raise ValueError( + "`pool_kernel_size` must be divisible by the shape of inputs. " + "Input size: {} `pool_kernel_size`: {}".format(size, self.pool_kernel_size) + ) + res = [] + for conv in self.convs: + res.append(conv(x)) + res[-1] = F.interpolate(res[-1], size=size, mode="bilinear", align_corners=False) + res = torch.cat(res, dim=1) + res = self.project(res) + res = F.dropout(res, self.dropout, training=self.training) if self.dropout > 0 else res + return res diff --git a/mmcv/layers/batch_norm.py b/mmcv/layers/batch_norm.py new file mode 100644 index 0000000..9c9d19f --- /dev/null +++ b/mmcv/layers/batch_norm.py @@ -0,0 +1,384 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import torch +import torch.distributed as dist +from torch import nn +from torch.nn import functional as F +from torch.autograd.function import Function +from .wrappers import BatchNorm2d +TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + +class _AllReduce(Function): + @staticmethod + def forward(ctx, input: torch.Tensor) -> torch.Tensor: + input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())] + # Use allgather instead of allreduce since I don't trust in-place operations .. + dist.all_gather(input_list, input, async_op=False) + inputs = torch.stack(input_list, dim=0) + return torch.sum(inputs, dim=0) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + dist.all_reduce(grad_output, async_op=False) + return grad_output + +def differentiable_all_reduce(input: torch.Tensor) -> torch.Tensor: + """ + Differentiable counterpart of `dist.all_reduce`. + """ + if ( + not dist.is_available() + or not dist.is_initialized() + or dist.get_world_size() == 1 + ): + return input + return _AllReduce.apply(input) + + +class FrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + It contains non-trainable buffers called + "weight" and "bias", "running_mean", "running_var", + initialized to perform identity transformation. + + The pre-trained backbone models from Caffe2 only contain "weight" and "bias", + which are computed from the original four parameters of BN. + The affine transform `x * weight + bias` will perform the equivalent + computation of `(x - running_mean) / sqrt(running_var) * weight + bias`. + When loading a backbone model from Caffe2, "running_mean" and "running_var" + will be left unchanged as identity transformation. + + Other pre-trained backbone models may contain all 4 parameters. 
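A construction sketch for the `ASPP` block above; the argument names follow its `__init__` signature, the `mmcv.layers.aspp` module path is inferred from the file location, and `fvcore` must be installed since the module uses its weight initializers:

```python
import torch
from torch import nn
# Assumed module path inside this merged tree; ASPP is defined in mmcv/layers/aspp.py above.
from mmcv.layers.aspp import ASPP

aspp = ASPP(
    in_channels=256,
    out_channels=256,
    dilations=[6, 12, 18],        # exactly three dilation rates are expected
    norm='BN',
    activation=nn.ReLU(),
    pool_kernel_size=None,        # None -> global average pooling branch
    dropout=0.1,
)

x = torch.randn(2, 256, 32, 32)
out = aspp(x)                     # five branches are concatenated, then projected
print(out.shape)                  # torch.Size([2, 256, 32, 32])
```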
+ + The forward is implemented by `F.batch_norm(..., training=False)`. + """ + + _version = 3 + + def __init__(self, num_features, eps=1e-5): + super().__init__() + self.num_features = num_features + self.eps = eps + self.register_buffer("weight", torch.ones(num_features)) + self.register_buffer("bias", torch.zeros(num_features)) + self.register_buffer("running_mean", torch.zeros(num_features)) + self.register_buffer("running_var", torch.ones(num_features) - eps) + self.register_buffer("num_batches_tracked", None) + + def forward(self, x): + if x.requires_grad: + # When gradients are needed, F.batch_norm will use extra memory + # because its backward op computes gradients for weight/bias as well. + scale = self.weight * (self.running_var + self.eps).rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + out_dtype = x.dtype # may be half + return x * scale.to(out_dtype) + bias.to(out_dtype) + else: + # When gradients are not needed, F.batch_norm is a single fused op + # and provide more optimization opportunities. + return F.batch_norm( + x, + self.running_mean, + self.running_var, + self.weight, + self.bias, + training=False, + eps=self.eps, + ) + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + + if version is None or version < 2: + # No running_mean/var in early versions + # This will silent the warnings + if prefix + "running_mean" not in state_dict: + state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean) + if prefix + "running_var" not in state_dict: + state_dict[prefix + "running_var"] = torch.ones_like(self.running_var) + + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) + + def __repr__(self): + return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps) + + @classmethod + def convert_frozen_batchnorm(cls, module): + """ + Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. + + Args: + module (torch.nn.Module): + + Returns: + If module is BatchNorm/SyncBatchNorm, returns a new module. + Otherwise, in-place convert module and return it. + + Similar to convert_sync_batchnorm in + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py + """ + bn_module = nn.modules.batchnorm + bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) + res = module + if isinstance(module, bn_module): + res = cls(module.num_features) + if module.affine: + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data + res.running_var.data = module.running_var.data + res.eps = module.eps + res.num_batches_tracked = module.num_batches_tracked + else: + for name, child in module.named_children(): + new_child = cls.convert_frozen_batchnorm(child) + if new_child is not child: + res.add_module(name, new_child) + return res + + @classmethod + def convert_frozenbatchnorm2d_to_batchnorm2d(cls, module: nn.Module) -> nn.Module: + """ + Convert all FrozenBatchNorm2d to BatchNorm2d + + Args: + module (torch.nn.Module): + + Returns: + If module is FrozenBatchNorm2d, returns a new module. + Otherwise, in-place convert module and return it. 
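`FrozenBatchNorm2d.convert_frozen_batchnorm` is the usual way to freeze the statistics of a pretrained backbone in place. A sketch (the `mmcv.layers.batch_norm` module path is inferred from the file location):

```python
import torch
from torch import nn
# Assumed module path; FrozenBatchNorm2d lives in mmcv/layers/batch_norm.py above.
from mmcv.layers.batch_norm import FrozenBatchNorm2d

backbone = nn.Sequential(
    nn.Conv2d(3, 16, 3, padding=1),
    nn.BatchNorm2d(16),
    nn.ReLU(),
)

# Replace every BatchNorm2d/SyncBatchNorm with a frozen copy of its statistics.
frozen = FrozenBatchNorm2d.convert_frozen_batchnorm(backbone)
print(frozen[1])               # FrozenBatchNorm2d(num_features=16, eps=1e-05)

x = torch.randn(2, 3, 8, 8)
y = frozen(x)                  # running stats are used even in train() mode
```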
+ + This is needed for quantization: + https://fb.workplace.com/groups/1043663463248667/permalink/1296330057982005/ + """ + + res = module + if isinstance(module, FrozenBatchNorm2d): + res = torch.nn.BatchNorm2d(module.num_features, module.eps) + + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data.clone().detach() + res.running_var.data = module.running_var.data.clone().detach() + res.eps = module.eps + res.num_batches_tracked = module.num_batches_tracked + else: + for name, child in module.named_children(): + new_child = cls.convert_frozenbatchnorm2d_to_batchnorm2d(child) + if new_child is not child: + res.add_module(name, new_child) + return res + + +def get_norm(norm, out_channels): + """ + Args: + norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; + or a callable that takes a channel number and returns + the normalization layer as a nn.Module. + + Returns: + nn.Module or None: the normalization layer + """ + if norm is None: + return None + if isinstance(norm, str): + if len(norm) == 0: + return None + norm = { + "BN": BatchNorm2d, + # Fixed in https://github.com/pytorch/pytorch/pull/36382 + "SyncBN": NaiveSyncBatchNorm if TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm, + "FrozenBN": FrozenBatchNorm2d, + "GN": lambda channels: nn.GroupNorm(32, channels), + # for debugging: + "nnSyncBN": nn.SyncBatchNorm, + "naiveSyncBN": NaiveSyncBatchNorm, + # expose stats_mode N as an option to caller, required for zero-len inputs + "naiveSyncBN_N": lambda channels: NaiveSyncBatchNorm(channels, stats_mode="N"), + "LN": lambda channels: LayerNorm(channels), + }[norm] + return norm(out_channels) + + +class NaiveSyncBatchNorm(BatchNorm2d): + """ + In PyTorch<=1.5, ``nn.SyncBatchNorm`` has incorrect gradient + when the batch size on each worker is different. + (e.g., when scale augmentation is used, or when it is applied to mask head). + + This is a slower but correct alternative to `nn.SyncBatchNorm`. + + Note: + There isn't a single definition of Sync BatchNorm. + + When ``stats_mode==""``, this module computes overall statistics by using + statistics of each worker with equal weight. The result is true statistics + of all samples (as if they are all on one worker) only when all workers + have the same (N, H, W). This mode does not support inputs with zero batch size. + + When ``stats_mode=="N"``, this module computes overall statistics by weighting + the statistics of each worker by their ``N``. The result is true statistics + of all samples (as if they are all on one worker) only when all workers + have the same (H, W). It is slower than ``stats_mode==""``. + + Even though the result of this module may not be the true statistics of all samples, + it may still be reasonable because it might be preferrable to assign equal weights + to all workers, regardless of their (H, W) dimension, instead of putting larger weight + on larger images. From preliminary experiments, little difference is found between such + a simplified implementation and an accurate computation of overall mean & variance. 
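`get_norm` is the small factory the layer modules use to turn a config string into a normalization layer. A sketch; `get_norm` is re-exported from `mmcv.layers` per the `__init__.py` above:

```python
from torch import nn
from mmcv.layers import get_norm  # re-exported in mmcv/layers/__init__.py above

bn = get_norm('BN', 64)         # -> BatchNorm2d(64)
gn = get_norm('GN', 64)         # -> GroupNorm(32, 64)
fbn = get_norm('FrozenBN', 64)  # -> FrozenBatchNorm2d(num_features=64, ...)
none = get_norm('', 64)         # empty string -> no normalization layer (None)

# A callable is also accepted: it receives the channel count.
ln = get_norm(lambda c: nn.GroupNorm(1, c), 64)
print(type(bn).__name__, type(gn).__name__, type(fbn).__name__, none, type(ln).__name__)
```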
+ """ + + def __init__(self, *args, stats_mode="", **kwargs): + super().__init__(*args, **kwargs) + assert stats_mode in ["", "N"] + self._stats_mode = stats_mode + + def forward(self, input): + if get_world_size() == 1 or not self.training: + return super().forward(input) + + B, C = input.shape[0], input.shape[1] + + half_input = input.dtype == torch.float16 + if half_input: + # fp16 does not have good enough numerics for the reduction here + input = input.float() + mean = torch.mean(input, dim=[0, 2, 3]) + meansqr = torch.mean(input * input, dim=[0, 2, 3]) + + if self._stats_mode == "": + assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.' + vec = torch.cat([mean, meansqr], dim=0) + vec = differentiable_all_reduce(vec) * (1.0 / dist.get_world_size()) + mean, meansqr = torch.split(vec, C) + momentum = self.momentum + else: + if B == 0: + vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype) + vec = vec + input.sum() # make sure there is gradient w.r.t input + else: + vec = torch.cat( + [ + mean, + meansqr, + torch.ones([1], device=mean.device, dtype=mean.dtype), + ], + dim=0, + ) + vec = differentiable_all_reduce(vec * B) + + total_batch = vec[-1].detach() + momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0 + mean, meansqr, _ = torch.split(vec / total_batch.clamp(min=1), C) # avoid div-by-zero + + var = meansqr - mean * mean + invstd = torch.rsqrt(var + self.eps) + scale = self.weight * invstd + bias = self.bias - mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + + self.running_mean += momentum * (mean.detach() - self.running_mean) + self.running_var += momentum * (var.detach() - self.running_var) + ret = input * scale + bias + if half_input: + ret = ret.half() + return ret + + +class CycleBatchNormList(nn.ModuleList): + """ + Implement domain-specific BatchNorm by cycling. + + When a BatchNorm layer is used for multiple input domains or input + features, it might need to maintain a separate test-time statistics + for each domain. See Sec 5.2 in :paper:`rethinking-batchnorm`. + + This module implements it by using N separate BN layers + and it cycles through them every time a forward() is called. + + NOTE: The caller of this module MUST guarantee to always call + this module by multiple of N times. Otherwise its test-time statistics + will be incorrect. + """ + + def __init__(self, length: int, bn_class=nn.BatchNorm2d, **kwargs): + """ + Args: + length: number of BatchNorm layers to cycle. + bn_class: the BatchNorm class to use + kwargs: arguments of the BatchNorm class, such as num_features. + """ + self._affine = kwargs.pop("affine", True) + super().__init__([bn_class(**kwargs, affine=False) for k in range(length)]) + if self._affine: + # shared affine, domain-specific BN + channels = self[0].num_features + self.weight = nn.Parameter(torch.ones(channels)) + self.bias = nn.Parameter(torch.zeros(channels)) + self._pos = 0 + + def forward(self, x): + ret = self[self._pos](x) + self._pos = (self._pos + 1) % len(self) + + if self._affine: + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + return ret * w + b + else: + return ret + + def extra_repr(self): + return f"affine={self._affine}" + + +class LayerNorm(nn.Module): + """ + A LayerNorm variant, popularized by Transformers, that performs point-wise mean and + variance normalization over the channel dimension for inputs that have shape + (batch_size, channels, height, width). 
+ https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950 + """ + + def __init__(self, normalized_shape, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.normalized_shape = (normalized_shape,) + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x diff --git a/mmcv/layers/blocks.py b/mmcv/layers/blocks.py new file mode 100644 index 0000000..1995a4b --- /dev/null +++ b/mmcv/layers/blocks.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. + +import fvcore.nn.weight_init as weight_init +from torch import nn + +from .batch_norm import FrozenBatchNorm2d, get_norm +from .wrappers import Conv2d + + +""" +CNN building blocks. +""" + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NCHW tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. + + Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + def freeze(self): + """ + Make this block not trainable. + This method sets all parameters to `requires_grad=False`, + and convert all BatchNorm layers to FrozenBatchNorm + + Returns: + the block itself + """ + for p in self.parameters(): + p.requires_grad = False + FrozenBatchNorm2d.convert_frozen_batchnorm(self) + return self + + +class DepthwiseSeparableConv2d(nn.Module): + """ + A kxk depthwise convolution + a 1x1 convolution. + + In :paper:`xception`, norm & activation are applied on the second conv. + :paper:`mobilenet` uses norm & activation on both convs. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + padding=1, + dilation=1, + *, + norm1=None, + activation1=None, + norm2=None, + activation2=None, + ): + """ + Args: + norm1, norm2 (str or callable): normalization for the two conv layers. + activation1, activation2 (callable(Tensor) -> Tensor): activation + function for the two conv layers. + """ + super().__init__() + self.depthwise = Conv2d( + in_channels, + in_channels, + kernel_size=kernel_size, + padding=padding, + dilation=dilation, + groups=in_channels, + bias=not norm1, + norm=get_norm(norm1, in_channels), + activation=activation1, + ) + self.pointwise = Conv2d( + in_channels, + out_channels, + kernel_size=1, + bias=not norm2, + norm=get_norm(norm2, out_channels), + activation=activation2, + ) + + # default initialization + weight_init.c2_msra_fill(self.depthwise) + weight_init.c2_msra_fill(self.pointwise) + + def forward(self, x): + return self.pointwise(self.depthwise(x)) diff --git a/mmcv/layers/csrc/README.md b/mmcv/layers/csrc/README.md new file mode 100644 index 0000000..778ed3d --- /dev/null +++ b/mmcv/layers/csrc/README.md @@ -0,0 +1,7 @@ + + +To add a new Op: + +1. Create a new directory +2. Implement new ops there +3. 
Delcare its Python interface in `vision.cpp`. diff --git a/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated.h new file mode 100644 index 0000000..03f4211 --- /dev/null +++ b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated.h @@ -0,0 +1,115 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#pragma once +#include + +namespace detectron2 { + +at::Tensor ROIAlignRotated_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlignRotated_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); + +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor ROIAlignRotated_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlignRotated_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); +#endif + +// Interface for Python +inline at::Tensor ROIAlignRotated_forward( + const at::Tensor& input, + const at::Tensor& rois, + const double spatial_scale, + const int64_t pooled_height, + const int64_t pooled_width, + const int64_t sampling_ratio) { + if (input.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + return ROIAlignRotated_forward_cuda( + input, + rois, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + return ROIAlignRotated_forward_cpu( + input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +} + +inline at::Tensor ROIAlignRotated_backward( + const at::Tensor& grad, + const at::Tensor& rois, + const double spatial_scale, + const int64_t pooled_height, + const int64_t pooled_width, + const int64_t batch_size, + const int64_t channels, + const int64_t height, + const int64_t width, + const int64_t sampling_ratio) { + if (grad.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + return ROIAlignRotated_backward_cuda( + grad, + rois, + spatial_scale, + pooled_height, + pooled_width, + batch_size, + channels, + height, + width, + sampling_ratio); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + return ROIAlignRotated_backward_cpu( + grad, + rois, + spatial_scale, + pooled_height, + pooled_width, + batch_size, + channels, + height, + width, + sampling_ratio); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp new file mode 100644 index 0000000..2a3d305 --- /dev/null +++ b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp @@ -0,0 +1,522 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#include +#include "ROIAlignRotated.h" + +// Note: this implementation originates from the Caffe2 ROIAlignRotated Op +// and PyTorch ROIAlign (non-rotated) Op implementations. 
+// The key difference between this implementation and those ones is +// we don't do "legacy offset" in this version, as there aren't many previous +// works, if any, using the "legacy" ROIAlignRotated Op. +// This would make the interface a bit cleaner. + +namespace detectron2 { + +namespace { +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + T roi_center_h, + T roi_center_w, + T cos_theta, + T sin_theta, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) { + y = 0; + } + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void bilinear_interpolate_gradient( + const int height, + const int width, + T y, + T x, + T& w1, + T& w2, + T& w3, + T& w4, + int& x_low, + int& x_high, + int& y_low, + int& y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +inline void add(T* address, const T& val) { + *address += val; +} + +} // namespace + +template +void ROIAlignRotatedForward( + const int nthreads, + const T* input, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* rois, + T* output) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + AT_ASSERTM( + roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlignRotated do not have non-negative size!"); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + roi_center_h, + roi_center_w, + cos_theta, + sin_theta, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template +void ROIAlignRotatedBackward( + const int nthreads, + // may not be contiguous. should index using n_stride, etc + const T* grad_output, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + T* grad_input, + const T* rois, + const int n_stride, + const int c_stride, + const int h_stride, + const int w_stride) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + AT_ASSERTM( + roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlignRotated do not have non-negative size!"); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T* offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient( + height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // for +} // ROIAlignRotatedBackward + +at::Tensor ROIAlignRotated_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); + AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); + + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlign_forward_cpu"; + at::checkAllSameType(c, {input_t, rois_t}); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + at::Tensor output = at::zeros( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) { + return output; + } + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "ROIAlignRotated_forward", [&] { + ROIAlignRotatedForward( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois_.data_ptr(), + output.data_ptr()); + }); + return output; +} + +at::Tensor ROIAlignRotated_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor"); + AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, 
rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlignRotated_backward_cpu"; + at::checkAllSameType(c, {grad_t, rois_t}); + + at::Tensor grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + // handle possibly empty gradients + if (grad.numel() == 0) { + return grad_input; + } + + // get stride values to ensure indexing into gradients is correct. + int n_stride = grad.stride(0); + int c_stride = grad.stride(1); + int h_stride = grad.stride(2); + int w_stride = grad.stride(3); + + auto rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "ROIAlignRotated_forward", [&] { + ROIAlignRotatedBackward( + grad.numel(), + grad.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data_ptr(), + rois_.data_ptr(), + n_stride, + c_stride, + h_stride, + w_stride); + }); + return grad_input; +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu new file mode 100644 index 0000000..fca1865 --- /dev/null +++ b/mmcv/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu @@ -0,0 +1,443 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#include +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +// Note: this implementation originates from the Caffe2 ROIAlignRotated Op +// and PyTorch ROIAlign (non-rotated) Op implementations. +// The key difference between this implementation and those ones is +// we don't do "legacy offset" in this version, as there aren't many previous +// works, if any, using the "legacy" ROIAlignRotated Op. +// This would make the interface a bit cleaner. + +namespace detectron2 { + +namespace { + +template +__device__ T bilinear_interpolate( + const T* input, + const int height, + const int width, + T y, + T x) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + return 0; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + T v1 = input[y_low * width + x_low]; + T v2 = input[y_low * width + x_high]; + T v3 = input[y_high * width + x_low]; + T v4 = input[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__device__ void bilinear_interpolate_gradient( + const int height, + const int width, + T y, + T x, + T& w1, + T& w2, + T& w3, + T& w4, + int& x_low, + int& x_high, + int& y_low, + int& y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +} // namespace + +template +__global__ void RoIAlignRotatedForward( + const int nthreads, + const T* input, + const T spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* rois, + T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (inte gral) pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T val = bilinear_interpolate(offset_input, height, width, y, x); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +template +__global__ void RoIAlignRotatedBackwardFeature( + const int nthreads, + const T* top_diff, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient( + height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd( + offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd( + offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd( + offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd( + offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignRotatedBackward + +at::Tensor ROIAlignRotated_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlignRotated_forward_cuda"; + at::checkAllSameGPU(c, {input_t, rois_t}); + at::checkAllSameType(c, {input_t, rois_t}); + at::cuda::CUDAGuard device_guard(input.device()); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min( + at::cuda::ATenCeilDiv( + static_cast(output_size), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + if (output.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return output; + } + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "ROIAlignRotated_forward", [&] { + RoIAlignRotatedForward<<>>( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois_.data_ptr(), + output.data_ptr()); + }); + cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlignRotated_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, 
rois_t{rois, "rois", 2}; + at::CheckedFrom c = "ROIAlign_backward_cuda"; + at::checkAllSameGPU(c, {grad_t, rois_t}); + at::checkAllSameType(c, {grad_t, rois_t}); + at::cuda::CUDAGuard device_guard(grad.device()); + + auto num_rois = rois.size(0); + auto grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min( + at::cuda::ATenCeilDiv( + static_cast(grad.numel()), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; + } + + auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES( + grad.scalar_type(), "ROIAlignRotated_backward", [&] { + RoIAlignRotatedBackwardFeature<<>>( + grad.numel(), + grad_.data_ptr(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data_ptr(), + rois_.data_ptr()); + }); + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated.h b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated.h new file mode 100644 index 0000000..3bf383b --- /dev/null +++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated.h @@ -0,0 +1,35 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#pragma once +#include + +namespace detectron2 { + +at::Tensor box_iou_rotated_cpu( + const at::Tensor& boxes1, + const at::Tensor& boxes2); + +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor box_iou_rotated_cuda( + const at::Tensor& boxes1, + const at::Tensor& boxes2); +#endif + +// Interface for Python +// inline is needed to prevent multiple function definitions when this header is +// included by different cpps +inline at::Tensor box_iou_rotated( + const at::Tensor& boxes1, + const at::Tensor& boxes2) { + assert(boxes1.device().is_cuda() == boxes2.device().is_cuda()); + if (boxes1.device().is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous()); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + + return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous()); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp new file mode 100644 index 0000000..c843487 --- /dev/null +++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp @@ -0,0 +1,39 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
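+// Each box is encoded as 5 contiguous floats: (x_ctr, y_ctr, width, height,
+// angle in degrees), and the returned tensor is a [num_boxes1, num_boxes2]
+// IoU matrix. A minimal illustrative call (a sketch only; the random inputs
+// below are placeholders, not values used anywhere in this patch):
+//
+//   at::Tensor boxes1 = at::rand({8, 5});  // 8 rotated boxes
+//   at::Tensor boxes2 = at::rand({4, 5});  // 4 rotated boxes
+//   at::Tensor ious = box_iou_rotated_cpu(boxes1.contiguous(),
+//                                         boxes2.contiguous());  // [8, 4]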
+#include "box_iou_rotated.h" +#include "box_iou_rotated_utils.h" + +namespace detectron2 { + +template +void box_iou_rotated_cpu_kernel( + const at::Tensor& boxes1, + const at::Tensor& boxes2, + at::Tensor& ious) { + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + + for (int i = 0; i < num_boxes1; i++) { + for (int j = 0; j < num_boxes2; j++) { + ious[i * num_boxes2 + j] = single_box_iou_rotated( + boxes1[i].data_ptr(), boxes2[j].data_ptr()); + } + } +} + +at::Tensor box_iou_rotated_cpu( + // input must be contiguous: + const at::Tensor& boxes1, + const at::Tensor& boxes2) { + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + at::Tensor ious = + at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); + + box_iou_rotated_cpu_kernel(boxes1, boxes2, ious); + + // reshape from 1d array to 2d array + auto shape = std::vector{num_boxes1, num_boxes2}; + return ious.reshape(shape); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu new file mode 100644 index 0000000..952710e --- /dev/null +++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu @@ -0,0 +1,130 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#include +#include +#include +#include +#include "box_iou_rotated_utils.h" + +namespace detectron2 { + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +template +__global__ void box_iou_rotated_cuda_kernel( + const int n_boxes1, + const int n_boxes2, + const T* dev_boxes1, + const T* dev_boxes2, + T* dev_ious) { + const int row_start = blockIdx.x * blockDim.x; + const int col_start = blockIdx.y * blockDim.y; + + const int row_size = min(n_boxes1 - row_start, blockDim.x); + const int col_size = min(n_boxes2 - col_start, blockDim.y); + + __shared__ float block_boxes1[BLOCK_DIM_X * 5]; + __shared__ float block_boxes2[BLOCK_DIM_Y * 5]; + + // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y + if (threadIdx.x < row_size && threadIdx.y == 0) { + block_boxes1[threadIdx.x * 5 + 0] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 0]; + block_boxes1[threadIdx.x * 5 + 1] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 1]; + block_boxes1[threadIdx.x * 5 + 2] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 2]; + block_boxes1[threadIdx.x * 5 + 3] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 3]; + block_boxes1[threadIdx.x * 5 + 4] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 4]; + } + + if (threadIdx.x < col_size && threadIdx.y == 0) { + block_boxes2[threadIdx.x * 5 + 0] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 0]; + block_boxes2[threadIdx.x * 5 + 1] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 1]; + block_boxes2[threadIdx.x * 5 + 2] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 2]; + block_boxes2[threadIdx.x * 5 + 3] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 3]; + block_boxes2[threadIdx.x * 5 + 4] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size && threadIdx.y < col_size) { + int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y; + dev_ious[offset] = single_box_iou_rotated( + block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); + } +} + +at::Tensor box_iou_rotated_cuda( + // input must be contiguous + const at::Tensor& boxes1, + const at::Tensor& boxes2) { + using scalar_t = float; + AT_ASSERTM( + boxes1.scalar_type() == at::kFloat, "boxes1 
must be a float tensor"); + AT_ASSERTM( + boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor"); + AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor"); + AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(boxes1.device()); + + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + + at::Tensor ious = + at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); + + bool transpose = false; + if (num_boxes1 > 0 && num_boxes2 > 0) { + scalar_t *data1 = boxes1.data_ptr(), + *data2 = boxes2.data_ptr(); + + if (num_boxes2 > 65535 * BLOCK_DIM_Y) { + AT_ASSERTM( + num_boxes1 <= 65535 * BLOCK_DIM_Y, + "Too many boxes for box_iou_rotated_cuda!"); + // x dim is allowed to be large, but y dim cannot, + // so we transpose the two to avoid "invalid configuration argument" + // error. We assume one of them is small. Otherwise the result is hard to + // fit in memory anyway. + std::swap(num_boxes1, num_boxes2); + std::swap(data1, data2); + transpose = true; + } + + const int blocks_x = + at::cuda::ATenCeilDiv(static_cast(num_boxes1), BLOCK_DIM_X); + const int blocks_y = + at::cuda::ATenCeilDiv(static_cast(num_boxes2), BLOCK_DIM_Y); + + dim3 blocks(blocks_x, blocks_y); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + box_iou_rotated_cuda_kernel<<>>( + num_boxes1, + num_boxes2, + data1, + data2, + (scalar_t*)ious.data_ptr()); + + AT_CUDA_CHECK(cudaGetLastError()); + } + + // reshape from 1d array to 2d array + auto shape = std::vector{num_boxes1, num_boxes2}; + if (transpose) { + return ious.view(shape).t(); + } else { + return ious.view(shape); + } +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h new file mode 100644 index 0000000..bc6967a --- /dev/null +++ b/mmcv/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h @@ -0,0 +1,391 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#pragma once + +#include +#include + +#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1 +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +namespace detectron2 { + +namespace { + +template +struct RotatedBox { + T x_ctr, y_ctr, w, h, a; +}; + +template +struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point& p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point& operator+=(const Point& p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point& p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { + return A.x * B.x + A.y * B.y; +} + +// R: result type. can be different from input type +template +HOST_DEVICE_INLINE R cross_2d(const Point& A, const Point& B) { + return static_cast(A.x) * static_cast(B.y) - + static_cast(B.x) * static_cast(A.y); +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices( + const RotatedBox& box, + Point (&pts)[4]) { + // M_PI / 180. 
== 0.01745329251 + double theta = box.a * 0.01745329251; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template +HOST_DEVICE_INLINE int get_intersection_points( + const Point (&pts1)[4], + const Point (&pts2)[4], + Point (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // When computing the intersection area, it doesn't hurt if we have + // more (duplicated/approximate) intersections/vertices than needed, + // while it can cause drastic difference if we miss an intersection/vertex. + // Therefore, we add an epsilon to relax the comparisons between + // the float point numbers that decide the intersection points. + double EPS = 1e-5; + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 > -EPS && t1 < 1.0f + EPS && t2 > -EPS && t2 < 1.0f + EPS) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) && + (APdotAD < ADdotAD + EPS)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB > -EPS) && (APdotAD > -EPS) && (APdotAB < ABdotAB + EPS) && + (APdotAD < ADdotAD + EPS)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template +HOST_DEVICE_INLINE int convex_hull_graham( + const Point (&p)[24], + const int& num_in, + Point (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
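+  // e.g., for candidate points {(2, 1), (1, 0), (0, 0)}, both (1, 0) and
+  // (0, 0) share the minimum y = 0, so the tie is broken by the smaller x
+  // and (0, 0) is picked as the pivot.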
+ int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; +#if defined(__CUDACC__) || __HCC__ == 1 || __HIP__ == 1 + // compute distance to origin before sort, and sort them together with the + // points + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + // std::sort( + // q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { + // T temp = cross_2d(A, B); + + // if (fabs(temp) < 1e-6) { + // return dot_2d(A, A) < dot_2d(B, B); + // } else { + // return temp > 0; + // } + // }); + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } + + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1) { + auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2]; + // cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) - + // q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we + // compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means + // round to nearest floating point). + if (q1.x * q2.y >= q2.x * q1.y) + m--; + else + break; + } + // Using double also helps, but float can solve the issue for now. 
+ // while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) + // >= 0) { + // m--; + // } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template +HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T rotated_boxes_intersection( + const RotatedBox& box1, + const RotatedBox& box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +} // namespace + +template +HOST_DEVICE_INLINE T +single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) { + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + T area1 = box1.w * box1.h; + T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + T intersection = rotated_boxes_intersection(box1, box2); + T iou = intersection / (area1 + area2 - intersection); + return iou; +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/cocoeval/cocoeval.cpp b/mmcv/layers/csrc/cocoeval/cocoeval.cpp new file mode 100644 index 0000000..0a5b7b9 --- /dev/null +++ b/mmcv/layers/csrc/cocoeval/cocoeval.cpp @@ -0,0 +1,507 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#include "cocoeval.h" +#include +#include +#include +#include + +using namespace pybind11::literals; + +namespace detectron2 { + +namespace COCOeval { + +// Sort detections from highest score to lowest, such that +// detection_instances[detection_sorted_indices[t]] >= +// detection_instances[detection_sorted_indices[t+1]]. 
Use stable_sort to match +// original COCO API +void SortInstancesByDetectionScore( + const std::vector& detection_instances, + std::vector* detection_sorted_indices) { + detection_sorted_indices->resize(detection_instances.size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_instances](size_t j1, size_t j2) { + return detection_instances[j1].score > detection_instances[j2].score; + }); +} + +// Partition the ground truth objects based on whether or not to ignore them +// based on area +void SortInstancesByIgnore( + const std::array& area_range, + const std::vector& ground_truth_instances, + std::vector* ground_truth_sorted_indices, + std::vector* ignores) { + ignores->clear(); + ignores->reserve(ground_truth_instances.size()); + for (auto o : ground_truth_instances) { + ignores->push_back( + o.ignore || o.area < area_range[0] || o.area > area_range[1]); + } + + ground_truth_sorted_indices->resize(ground_truth_instances.size()); + std::iota( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + 0); + std::stable_sort( + ground_truth_sorted_indices->begin(), + ground_truth_sorted_indices->end(), + [&ignores](size_t j1, size_t j2) { + return (int)(*ignores)[j1] < (int)(*ignores)[j2]; + }); +} + +// For each IOU threshold, greedily match each detected instance to a ground +// truth instance (if possible) and store the results +void MatchDetectionsToGroundTruth( + const std::vector& detection_instances, + const std::vector& detection_sorted_indices, + const std::vector& ground_truth_instances, + const std::vector& ground_truth_sorted_indices, + const std::vector& ignores, + const std::vector>& ious, + const std::vector& iou_thresholds, + const std::array& area_range, + ImageEvaluation* results) { + // Initialize memory to store return data matches and ignore + const int num_iou_thresholds = iou_thresholds.size(); + const int num_ground_truth = ground_truth_sorted_indices.size(); + const int num_detections = detection_sorted_indices.size(); + std::vector ground_truth_matches( + num_iou_thresholds * num_ground_truth, 0); + std::vector& detection_matches = results->detection_matches; + std::vector& detection_ignores = results->detection_ignores; + std::vector& ground_truth_ignores = results->ground_truth_ignores; + detection_matches.resize(num_iou_thresholds * num_detections, 0); + detection_ignores.resize(num_iou_thresholds * num_detections, false); + ground_truth_ignores.resize(num_ground_truth); + for (auto g = 0; g < num_ground_truth; ++g) { + ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]]; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + for (auto d = 0; d < num_detections; ++d) { + // information about best match so far (match=-1 -> unmatched) + double best_iou = std::min(iou_thresholds[t], 1 - 1e-10); + int match = -1; + for (auto g = 0; g < num_ground_truth; ++g) { + // if this ground truth instance is already matched and not a + // crowd, it cannot be matched to another detection + if (ground_truth_matches[t * num_ground_truth + g] > 0 && + !ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) { + continue; + } + + // if detected instance matched to a regular ground truth + // instance, we can break on the first ground truth instance + // tagged as ignore (because they are sorted by the ignore tag) + if (match >= 0 && !ground_truth_ignores[match] && + ground_truth_ignores[g]) 
{ + break; + } + + // if IOU overlap is the best so far, store the match appropriately + if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) { + best_iou = ious[d][ground_truth_sorted_indices[g]]; + match = g; + } + } + // if match was made, store id of match for both detection and + // ground truth + if (match >= 0) { + detection_ignores[t * num_detections + d] = ground_truth_ignores[match]; + detection_matches[t * num_detections + d] = + ground_truth_instances[ground_truth_sorted_indices[match]].id; + ground_truth_matches[t * num_ground_truth + match] = + detection_instances[detection_sorted_indices[d]].id; + } + + // set unmatched detections outside of area range to ignore + const InstanceAnnotation& detection = + detection_instances[detection_sorted_indices[d]]; + detection_ignores[t * num_detections + d] = + detection_ignores[t * num_detections + d] || + (detection_matches[t * num_detections + d] == 0 && + (detection.area < area_range[0] || detection.area > area_range[1])); + } + } + + // store detection score results + results->detection_scores.resize(detection_sorted_indices.size()); + for (size_t d = 0; d < detection_sorted_indices.size(); ++d) { + results->detection_scores[d] = + detection_instances[detection_sorted_indices[d]].score; + } +} + +std::vector EvaluateImages( + const std::vector>& area_ranges, + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances) { + const int num_area_ranges = area_ranges.size(); + const int num_images = image_category_ground_truth_instances.size(); + const int num_categories = + image_category_ious.size() > 0 ? image_category_ious[0].size() : 0; + std::vector detection_sorted_indices; + std::vector ground_truth_sorted_indices; + std::vector ignores; + std::vector results_all( + num_images * num_area_ranges * num_categories); + + // Store results for each image, category, and area range combination. 
Results + // for each IOU threshold are packed into the same ImageEvaluation object + for (auto i = 0; i < num_images; ++i) { + for (auto c = 0; c < num_categories; ++c) { + const std::vector& ground_truth_instances = + image_category_ground_truth_instances[i][c]; + const std::vector& detection_instances = + image_category_detection_instances[i][c]; + + SortInstancesByDetectionScore( + detection_instances, &detection_sorted_indices); + if ((int)detection_sorted_indices.size() > max_detections) { + detection_sorted_indices.resize(max_detections); + } + + for (size_t a = 0; a < area_ranges.size(); ++a) { + SortInstancesByIgnore( + area_ranges[a], + ground_truth_instances, + &ground_truth_sorted_indices, + &ignores); + + MatchDetectionsToGroundTruth( + detection_instances, + detection_sorted_indices, + ground_truth_instances, + ground_truth_sorted_indices, + ignores, + image_category_ious[i][c], + iou_thresholds, + area_ranges[a], + &results_all + [c * num_area_ranges * num_images + a * num_images + i]); + } + } + } + + return results_all; +} + +// Convert a python list to a vector +template +std::vector list_to_vec(const py::list& l) { + std::vector v(py::len(l)); + for (int i = 0; i < (int)py::len(l); ++i) { + v[i] = l[i].cast(); + } + return v; +} + +// Helper function to Accumulate() +// Considers the evaluation results applicable to a particular category, area +// range, and max_detections parameter setting, which begin at +// evaluations[evaluation_index]. Extracts a sorted list of length n of all +// applicable detection instances concatenated across all images in the dataset, +// which are represented by the outputs evaluation_indices, detection_scores, +// image_detection_indices, and detection_sorted_indices--all of which are +// length n. evaluation_indices[i] stores the applicable index into +// evaluations[] for instance i, which has detection score detection_score[i], +// and is the image_detection_indices[i]'th of the list of detections +// for the image containing i. 
detection_sorted_indices[] defines a sorted +// permutation of the 3 other outputs +int BuildSortedDetectionList( + const std::vector& evaluations, + const int64_t evaluation_index, + const int64_t num_images, + const int max_detections, + std::vector* evaluation_indices, + std::vector* detection_scores, + std::vector* detection_sorted_indices, + std::vector* image_detection_indices) { + assert(evaluations.size() >= evaluation_index + num_images); + + // Extract a list of object instances of the applicable category, area + // range, and max detections requirements such that they can be sorted + image_detection_indices->clear(); + evaluation_indices->clear(); + detection_scores->clear(); + image_detection_indices->reserve(num_images * max_detections); + evaluation_indices->reserve(num_images * max_detections); + detection_scores->reserve(num_images * max_detections); + int num_valid_ground_truth = 0; + for (auto i = 0; i < num_images; ++i) { + const ImageEvaluation& evaluation = evaluations[evaluation_index + i]; + + for (int d = 0; + d < (int)evaluation.detection_scores.size() && d < max_detections; + ++d) { // detected instances + evaluation_indices->push_back(evaluation_index + i); + image_detection_indices->push_back(d); + detection_scores->push_back(evaluation.detection_scores[d]); + } + for (auto ground_truth_ignore : evaluation.ground_truth_ignores) { + if (!ground_truth_ignore) { + ++num_valid_ground_truth; + } + } + } + + // Sort detections by decreasing score, using stable sort to match + // python implementation + detection_sorted_indices->resize(detection_scores->size()); + std::iota( + detection_sorted_indices->begin(), detection_sorted_indices->end(), 0); + std::stable_sort( + detection_sorted_indices->begin(), + detection_sorted_indices->end(), + [&detection_scores](size_t j1, size_t j2) { + return (*detection_scores)[j1] > (*detection_scores)[j2]; + }); + + return num_valid_ground_truth; +} + +// Helper function to Accumulate() +// Compute a precision recall curve given a sorted list of detected instances +// encoded in evaluations, evaluation_indices, detection_scores, +// detection_sorted_indices, image_detection_indices (see +// BuildSortedDetectionList()). Using vectors precisions and recalls +// and temporary storage, output the results into precisions_out, recalls_out, +// and scores_out, which are large buffers containing many precion/recall curves +// for all possible parameter settings, with precisions_out_index and +// recalls_out_index defining the applicable indices to store results. 
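// --- Illustrative sketch (not part of the vendored detectron2 sources) ---
// The core of the COCO-style precision/recall interpolation performed by
// ComputePrecisionRecallCurve() below, written out on plain std::vector inputs
// and ignoring the bookkeeping for ignored detections, score outputs, and the
// flattened output indexing. It assumes <vector>, <algorithm> and <cstdint>
// are available (this translation unit already pulls them in upstream) and
// that num_valid_ground_truth > 0, as the caller guarantees.
static std::vector<double> SampleCocoPrecision(
    const std::vector<bool>& is_true_positive, // per score-sorted detection
    int64_t num_valid_ground_truth,
    const std::vector<double>& recall_thresholds) {
  std::vector<double> precision, recall;
  int64_t tp = 0, fp = 0;
  for (bool hit : is_true_positive) {
    if (hit) {
      ++tp;
    } else {
      ++fp;
    }
    recall.push_back(static_cast<double>(tp) / num_valid_ground_truth);
    precision.push_back(static_cast<double>(tp) / (tp + fp));
  }
  // Make precision monotonically non-increasing when read left to right
  // (the "precision envelope" used by the COCO API).
  for (int64_t i = static_cast<int64_t>(precision.size()) - 1; i > 0; --i) {
    precision[i - 1] = std::max(precision[i - 1], precision[i]);
  }
  // Sample the envelope at the first recall value >= each recall threshold;
  // thresholds beyond the highest achieved recall keep a precision of 0.
  std::vector<double> sampled(recall_thresholds.size(), 0.0);
  for (size_t r = 0; r < recall_thresholds.size(); ++r) {
    auto low =
        std::lower_bound(recall.begin(), recall.end(), recall_thresholds[r]);
    if (low != recall.end()) {
      sampled[r] = precision[low - recall.begin()];
    }
  }
  return sampled;
}
// --- End of illustrative sketch ---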
+void ComputePrecisionRecallCurve( + const int64_t precisions_out_index, + const int64_t precisions_out_stride, + const int64_t recalls_out_index, + const std::vector& recall_thresholds, + const int iou_threshold_index, + const int num_iou_thresholds, + const int num_valid_ground_truth, + const std::vector& evaluations, + const std::vector& evaluation_indices, + const std::vector& detection_scores, + const std::vector& detection_sorted_indices, + const std::vector& image_detection_indices, + std::vector* precisions, + std::vector* recalls, + std::vector* precisions_out, + std::vector* scores_out, + std::vector* recalls_out) { + assert(recalls_out->size() > recalls_out_index); + + // Compute precision/recall for each instance in the sorted list of detections + int64_t true_positives_sum = 0, false_positives_sum = 0; + precisions->clear(); + recalls->clear(); + precisions->reserve(detection_sorted_indices.size()); + recalls->reserve(detection_sorted_indices.size()); + assert(!evaluations.empty() || detection_sorted_indices.empty()); + for (auto detection_sorted_index : detection_sorted_indices) { + const ImageEvaluation& evaluation = + evaluations[evaluation_indices[detection_sorted_index]]; + const auto num_detections = + evaluation.detection_matches.size() / num_iou_thresholds; + const auto detection_index = iou_threshold_index * num_detections + + image_detection_indices[detection_sorted_index]; + assert(evaluation.detection_matches.size() > detection_index); + assert(evaluation.detection_ignores.size() > detection_index); + const int64_t detection_match = + evaluation.detection_matches[detection_index]; + const bool detection_ignores = + evaluation.detection_ignores[detection_index]; + const auto true_positive = detection_match > 0 && !detection_ignores; + const auto false_positive = detection_match == 0 && !detection_ignores; + if (true_positive) { + ++true_positives_sum; + } + if (false_positive) { + ++false_positives_sum; + } + + const double recall = + static_cast(true_positives_sum) / num_valid_ground_truth; + recalls->push_back(recall); + const int64_t num_valid_detections = + true_positives_sum + false_positives_sum; + const double precision = num_valid_detections > 0 + ? static_cast(true_positives_sum) / num_valid_detections + : 0.0; + precisions->push_back(precision); + } + + (*recalls_out)[recalls_out_index] = !recalls->empty() ? 
recalls->back() : 0; + + for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) { + if ((*precisions)[i] > (*precisions)[i - 1]) { + (*precisions)[i - 1] = (*precisions)[i]; + } + } + + // Sample the per instance precision/recall list at each recall threshold + for (size_t r = 0; r < recall_thresholds.size(); ++r) { + // first index in recalls >= recall_thresholds[r] + std::vector::iterator low = std::lower_bound( + recalls->begin(), recalls->end(), recall_thresholds[r]); + size_t precisions_index = low - recalls->begin(); + + const auto results_ind = precisions_out_index + r * precisions_out_stride; + assert(results_ind < precisions_out->size()); + assert(results_ind < scores_out->size()); + if (precisions_index < precisions->size()) { + (*precisions_out)[results_ind] = (*precisions)[precisions_index]; + (*scores_out)[results_ind] = + detection_scores[detection_sorted_indices[precisions_index]]; + } else { + (*precisions_out)[results_ind] = 0; + (*scores_out)[results_ind] = 0; + } + } +} +py::dict Accumulate( + const py::object& params, + const std::vector& evaluations) { + const std::vector recall_thresholds = + list_to_vec(params.attr("recThrs")); + const std::vector max_detections = + list_to_vec(params.attr("maxDets")); + const int num_iou_thresholds = py::len(params.attr("iouThrs")); + const int num_recall_thresholds = py::len(params.attr("recThrs")); + const int num_categories = params.attr("useCats").cast() == 1 + ? py::len(params.attr("catIds")) + : 1; + const int num_area_ranges = py::len(params.attr("areaRng")); + const int num_max_detections = py::len(params.attr("maxDets")); + const int num_images = py::len(params.attr("imgIds")); + + std::vector precisions_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + std::vector recalls_out( + num_iou_thresholds * num_categories * num_area_ranges * + num_max_detections, + -1); + std::vector scores_out( + num_iou_thresholds * num_recall_thresholds * num_categories * + num_area_ranges * num_max_detections, + -1); + + // Consider the list of all detected instances in the entire dataset in one + // large list. evaluation_indices, detection_scores, + // image_detection_indices, and detection_sorted_indices all have the same + // length as this list, such that each entry corresponds to one detected + // instance + std::vector evaluation_indices; // indices into evaluations[] + std::vector detection_scores; // detection scores of each instance + std::vector detection_sorted_indices; // sorted indices of all + // instances in the dataset + std::vector + image_detection_indices; // indices into the list of detected instances in + // the same image as each instance + std::vector precisions, recalls; + + for (auto c = 0; c < num_categories; ++c) { + for (auto a = 0; a < num_area_ranges; ++a) { + for (auto m = 0; m < num_max_detections; ++m) { + // The COCO PythonAPI assumes evaluations[] (the return value of + // COCOeval::EvaluateImages() is one long list storing results for each + // combination of category, area range, and image id, with categories in + // the outermost loop and images in the innermost loop. 
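        // Worked example of this flattened layout (illustration only): with
        // num_categories = 2, num_area_ranges = 4 and num_images = 3, the
        // block for category c = 1 and area range a = 2 starts at
        // 1 * 4 * 3 + 2 * 3 = 18 and spans indices 18..20, one
        // ImageEvaluation per image.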
+ const int64_t evaluations_index = + c * num_area_ranges * num_images + a * num_images; + int num_valid_ground_truth = BuildSortedDetectionList( + evaluations, + evaluations_index, + num_images, + max_detections[m], + &evaluation_indices, + &detection_scores, + &detection_sorted_indices, + &image_detection_indices); + + if (num_valid_ground_truth == 0) { + continue; + } + + for (auto t = 0; t < num_iou_thresholds; ++t) { + // recalls_out is a flattened vectors representing a + // num_iou_thresholds X num_categories X num_area_ranges X + // num_max_detections matrix + const int64_t recalls_out_index = + t * num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + // precisions_out and scores_out are flattened vectors + // representing a num_iou_thresholds X num_recall_thresholds X + // num_categories X num_area_ranges X num_max_detections matrix + const int64_t precisions_out_stride = + num_categories * num_area_ranges * num_max_detections; + const int64_t precisions_out_index = t * num_recall_thresholds * + num_categories * num_area_ranges * num_max_detections + + c * num_area_ranges * num_max_detections + + a * num_max_detections + m; + + ComputePrecisionRecallCurve( + precisions_out_index, + precisions_out_stride, + recalls_out_index, + recall_thresholds, + t, + num_iou_thresholds, + num_valid_ground_truth, + evaluations, + evaluation_indices, + detection_scores, + detection_sorted_indices, + image_detection_indices, + &precisions, + &recalls, + &precisions_out, + &scores_out, + &recalls_out); + } + } + } + } + + time_t rawtime; + struct tm local_time; + std::array buffer; + time(&rawtime); +#ifdef _WIN32 + localtime_s(&local_time, &rawtime); +#else + localtime_r(&rawtime, &local_time); +#endif + strftime( + buffer.data(), 200, "%Y-%m-%d %H:%num_max_detections:%S", &local_time); + return py::dict( + "params"_a = params, + "counts"_a = std::vector( + {num_iou_thresholds, + num_recall_thresholds, + num_categories, + num_area_ranges, + num_max_detections}), + "date"_a = buffer, + "precision"_a = precisions_out, + "recall"_a = recalls_out, + "scores"_a = scores_out); +} + +} // namespace COCOeval + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/cocoeval/cocoeval.h b/mmcv/layers/csrc/cocoeval/cocoeval.h new file mode 100644 index 0000000..db246e4 --- /dev/null +++ b/mmcv/layers/csrc/cocoeval/cocoeval.h @@ -0,0 +1,88 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#pragma once + +#include +#include +#include +#include +#include + +namespace py = pybind11; + +namespace detectron2 { + +namespace COCOeval { + +// Annotation data for a single object instance in an image +struct InstanceAnnotation { + InstanceAnnotation( + uint64_t id, + double score, + double area, + bool is_crowd, + bool ignore) + : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {} + uint64_t id; + double score = 0.; + double area = 0.; + bool is_crowd = false; + bool ignore = false; +}; + +// Stores intermediate results for evaluating detection results for a single +// image that has D detected instances and G ground truth instances. 
This stores +// matches between detected and ground truth instances +struct ImageEvaluation { + // For each of the D detected instances, the id of the matched ground truth + // instance, or 0 if unmatched + std::vector detection_matches; + + // The detection score of each of the D detected instances + std::vector detection_scores; + + // Marks whether or not each of G instances was ignored from evaluation (e.g., + // because it's outside area_range) + std::vector ground_truth_ignores; + + // Marks whether or not each of D instances was ignored from evaluation (e.g., + // because it's outside aRng) + std::vector detection_ignores; +}; + +template +using ImageCategoryInstances = std::vector>>; + +// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each +// combination of image, category, area range settings, and IOU thresholds to +// evaluate, it matches detected instances to ground truth instances and stores +// the results into a vector of ImageEvaluation results, which will be +// interpreted by the COCOeval::Accumulate() function to produce precion-recall +// curves. The parameters of nested vectors have the following semantics: +// image_category_ious[i][c][d][g] is the intersection over union of the d'th +// detected instance and g'th ground truth instance of +// category category_ids[c] in image image_ids[i] +// image_category_ground_truth_instances[i][c] is a vector of ground truth +// instances in image image_ids[i] of category category_ids[c] +// image_category_detection_instances[i][c] is a vector of detected +// instances in image image_ids[i] of category category_ids[c] +std::vector EvaluateImages( + const std::vector>& area_ranges, // vector of 2-tuples + int max_detections, + const std::vector& iou_thresholds, + const ImageCategoryInstances>& image_category_ious, + const ImageCategoryInstances& + image_category_ground_truth_instances, + const ImageCategoryInstances& + image_category_detection_instances); + +// C++ implementation of COCOeval.accumulate(), which generates precision +// recall curves for each set of category, IOU threshold, detection area range, +// and max number of detections parameters. It is assumed that the parameter +// evaluations is the return value of the functon COCOeval::EvaluateImages(), +// which was called with the same parameter settings params +py::dict Accumulate( + const py::object& params, + const std::vector& evalutations); + +} // namespace COCOeval +} // namespace detectron2 diff --git a/mmcv/layers/csrc/cuda_version.cu b/mmcv/layers/csrc/cuda_version.cu new file mode 100644 index 0000000..b74fdda --- /dev/null +++ b/mmcv/layers/csrc/cuda_version.cu @@ -0,0 +1,26 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +#include + +namespace detectron2 { +int get_cudart_version() { +// Not a ROCM platform: Either HIP is not used, or +// it is used, but platform is not ROCM (i.e. it is CUDA) +#if !defined(__HIP_PLATFORM_AMD__) + return CUDART_VERSION; +#else + int version = 0; + +#if HIP_VERSION_MAJOR != 0 + // Create a convention similar to that of CUDA, as assumed by other + // parts of the code. 
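  // For example, HIP 5.4 is reported as 5 * 100 + 4 = 504 under this packing.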
+ + version = HIP_VERSION_MINOR; + version += (HIP_VERSION_MAJOR * 100); +#else + hipRuntimeGetVersion(&version); +#endif + return version; +#endif +} +} // namespace detectron2 diff --git a/mmcv/layers/csrc/deformable/deform_conv.h b/mmcv/layers/csrc/deformable/deform_conv.h new file mode 100644 index 0000000..965c1bf --- /dev/null +++ b/mmcv/layers/csrc/deformable/deform_conv.h @@ -0,0 +1,377 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#pragma once +#include + +namespace detectron2 { + +#if defined(WITH_CUDA) || defined(WITH_HIP) +int deform_conv_forward_cuda( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step); + +int deform_conv_backward_input_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step); + +int deform_conv_backward_parameters_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step); + +void modulated_deform_conv_cuda_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + const bool with_bias); + +void modulated_deform_conv_cuda_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias); + +#endif + +inline int deform_conv_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + if (input.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return deform_conv_forward_cuda( + input, + weight, + offset, + output, + columns, + ones, + kW, + kH, + dW, + dH, + padW, + padH, + dilationW, + dilationH, + group, + deformable_group, + im2col_step); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + AT_ERROR("This operator is not implemented on CPU"); +} + +inline int deform_conv_backward_input( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + 
at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + if (gradOutput.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return deform_conv_backward_input_cuda( + input, + offset, + gradOutput, + gradInput, + gradOffset, + weight, + columns, + kW, + kH, + dW, + dH, + padW, + padH, + dilationW, + dilationH, + group, + deformable_group, + im2col_step); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + AT_ERROR("This operator is not implemented on CPU"); +} + +inline int deform_conv_backward_filter( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step) { + if (gradOutput.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return deform_conv_backward_parameters_cuda( + input, + offset, + gradOutput, + gradWeight, + columns, + ones, + kW, + kH, + dW, + dH, + padW, + padH, + dilationW, + dilationH, + group, + deformable_group, + scale, + im2col_step); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + AT_ERROR("This operator is not implemented on CPU"); +} + +inline void modulated_deform_conv_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + const bool with_bias) { + if (input.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return modulated_deform_conv_cuda_forward( + input, + weight, + bias, + ones, + offset, + mask, + output, + columns, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + group, + deformable_group, + with_bias); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + AT_ERROR("This operator is not implemented on CPU"); +} + +inline void modulated_deform_conv_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias) { + if (grad_output.is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + TORCH_CHECK(input.is_cuda(), "input tensor is 
not on GPU!"); + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return modulated_deform_conv_cuda_backward( + input, + weight, + bias, + ones, + offset, + mask, + columns, + grad_input, + grad_weight, + grad_bias, + grad_offset, + grad_mask, + grad_output, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + group, + deformable_group, + with_bias); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + AT_ERROR("This operator is not implemented on CPU"); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/deformable/deform_conv_cuda.cu b/mmcv/layers/csrc/deformable/deform_conv_cuda.cu new file mode 100644 index 0000000..2072bb8 --- /dev/null +++ b/mmcv/layers/csrc/deformable/deform_conv_cuda.cu @@ -0,0 +1,1223 @@ +// Copyright (c) Facebook, Inc. and its affiliates. + +// modified from +// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp +// Original license: Apache 2.0 + +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c +// Original license: Apache 2.0 + +#include + +#include "deform_conv.h" + +#include +#include + +namespace detectron2 { + +void deformable_im2col( + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor data_col); + +void deformable_col2im( + const at::Tensor data_col, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const 
int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check( + at::Tensor input, + at::Tensor offset, + at::Tensor* gradOutput, + at::Tensor weight, + int kH, + int kW, + int dH, + int dW, + int padH, + int padW, + int dilationH, + int dilationW, + int group, + int deformable_group) { + TORCH_CHECK( + weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + TORCH_CHECK( + kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", + kH, + kW); + + TORCH_CHECK( + (weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", + kH, + kW, + weight.size(2), + weight.size(3)); + + TORCH_CHECK( + dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", + dH, + dW); + + TORCH_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, + dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + TORCH_CHECK( + ndim == 3 || ndim == 4, + "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + TORCH_CHECK( + nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane, + inputHeight, + inputWidth, + nOutputPlane, + outputHeight, + outputWidth); + + TORCH_CHECK( + input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, + input.size(1)); + + TORCH_CHECK( + (inputHeight + 2 * padH >= kH && inputWidth + 2 * padW >= kW), + "input image is smaller than kernel"); + + TORCH_CHECK( + (offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, + outputWidth, + offset.size(2), + offset.size(3)); + + TORCH_CHECK( + (offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + TORCH_CHECK( + gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, + gradOutput->size(dimf)); + + TORCH_CHECK( + (gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, + outputWidth, + gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check( + input, + offset, + NULL, + weight, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + group, + deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view( + {batchSize / im2col_step, + im2col_step, + nOutputPlane, + outputHeight, + outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view( + {batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + offset = offset.view( + {batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + + at::Tensor output_buffer = at::zeros( + {batchSize / im2col_step, + nOutputPlane, + im2col_step * outputHeight, + outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), + group, + 
output_buffer.size(1) / group, + output_buffer.size(2), + output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col( + input[elt], + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view( + {group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), + output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), + output_buffer.size(4)}); + + output_buffer = output_buffer.view( + {batchSize / im2col_step, + nOutputPlane, + im2col_step, + outputHeight, + outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + shape_check( + input, + offset, + &gradOutput, + weight, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + group, + deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view( + {batchSize / im2col_step, + im2col_step, + nOutputPlane, + outputHeight, + outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view( + {batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + input = input.view( + 
{batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + gradOffset = gradOffset.view( + {batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + offset = offset.view( + {batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view( + {group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), + group, + gradOutput.size(1) / group, + gradOutput.size(2), + gradOutput.size(3), + gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_( + weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), + 0.0f, + 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), + gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), + gradOutput.size(4), + gradOutput.size(5)}); + + deformable_col2im_coord( + columns, + input[elt], + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + gradOffset[elt]); + + deformable_col2im( + columns, + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check( + input, + offset, + &gradOutput, + gradWeight, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + group, + deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), 
gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view( + {batchSize / im2col_step, + im2col_step, + nOutputPlane, + outputHeight, + outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = gradOutputBuffer.view( + {batchSize / im2col_step, + nOutputPlane, + im2col_step, + outputHeight, + outputWidth}); + gradOutputBuffer.copy_(gradOutput); + // gradOutput is not contiguous, so we do reshape (instead of view) next + gradOutputBuffer = gradOutputBuffer.reshape( + {batchSize / im2col_step, + nOutputPlane, + im2col_step * outputHeight, + outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view( + {batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + offset = offset.view( + {batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col( + input[elt], + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + group, + gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), + gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = gradWeight.view( + {group, + gradWeight.size(0) / group, + gradWeight.size(1), + gradWeight.size(2), + gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_( + gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), + 1.0, + scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), + gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view( + {gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), + gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const 
int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + const bool with_bias) { + shape_check( + input, + offset, + NULL, + weight, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + group, + deformable_group); + + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR( + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, + kernel_w, + kernel_h_, + kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR( + "Input shape and kernel channels wont match: (%d vs %d).", + channels, + channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + // mask shape check + TORCH_CHECK( + (mask.size(2) == height_out && mask.size(3) == width_out), + "invalid spatial size of mask, expected height: %d width: %d, but " + "got height: %d width: %d", + height_out, + width_out, + mask.size(2), + mask.size(3)); + + TORCH_CHECK( + (mask.size(1) == deformable_group * kernel_h * kernel_w), + "invalid number of channels of mask"); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... 
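    // `ones` only needs to cover one output plane (height_out x width_out);
    // the forward pass adds the bias by broadcasting further below, while the
    // backward pass reuses this buffer to sum grad_output over spatial
    // positions when accumulating the bias gradient.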
+ ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = at::zeros( + {channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view( + {output.size(0), + group, + output.size(1) / group, + output.size(2), + output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns); + + // divide into group + weight = weight.view( + {group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view( + {weight.size(0) * weight.size(1), + weight.size(2), + weight.size(3), + weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view( + {output.size(0), + output.size(1) * output.size(2), + output.size(3), + output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias) { + shape_check( + input, + offset, + &grad_output, + weight, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + group, + deformable_group); + + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR( + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, + kernel_w, + kernel_h_, + kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR( + "Input shape and kernel channels wont match: (%d vs %d).", + channels, + channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + // mask shape check + TORCH_CHECK( + (mask.size(2) == height_out && mask.size(3) == width_out), + "invalid spatial size of mask, expected height: %d width: %d, but " + "got height: %d width: %d", + height_out, + width_out, + mask.size(2), + mask.size(3)); + + TORCH_CHECK( + (mask.size(1) == deformable_group * kernel_h * kernel_w), + "invalid number of channels of mask"); + + if 
(ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros( + {channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = grad_output.view( + {grad_output.size(0), + group, + grad_output.size(1) / group, + grad_output.size(2), + grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view( + {group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_( + weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), + 0.0f, + 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view( + {weight.size(0) * weight.size(1), + weight.size(2), + weight.size(3), + weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, + input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view( + {group, + grad_weight.size(0) / group, + grad_weight.size(1), + grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view( + {grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), + grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view( + {grad_output.size(0) * grad_output.size(1), + grad_output.size(2), + grad_output.size(3), + grad_output.size(4)}); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/deformable/deform_conv_cuda_kernel.cu b/mmcv/layers/csrc/deformable/deform_conv_cuda_kernel.cu new file mode 100644 index 0000000..f299c7a --- /dev/null +++ b/mmcv/layers/csrc/deformable/deform_conv_cuda_kernel.cu @@ -0,0 +1,1288 @@ +// Copyright (c) Facebook, Inc. 
and its affiliates. + +// modified from +// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu +// Original license: Apache 2.0 +// clang-format off + +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +#include +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + + +namespace { + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) { + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +} + +template +__device__ scalar_t deformable_im2col_bilinear( + const scalar_t* bottom_data, + const int data_width, + const int height, + const int width, + scalar_t h, + scalar_t w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int h, + const int w, + const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int height, + const int width, + const scalar_t* im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } 
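  // bp_dir == 1: derivative of the bilinearly interpolated sample with respect
  // to the w coordinate (the branch above handles the h coordinate); these
  // weights backpropagate the loss into the learned sampling offsets.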
else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel( + const int n, + const scalar_t* data_im, + const scalar_t* data_offset, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t* data_col_ptr = data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + // const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * + // height + h_in) * width + w_in; + const scalar_t* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t* data_offset_ptr = data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + // const scalar_t map_h = i * dilation_h + offset_h; + // const scalar_t map_w = j * dilation_w + offset_w; + // const int cur_height = height - h_in; + // const int cur_width = width - w_in; + // val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, + // cur_width, map_h, map_w); + val = deformable_im2col_bilinear( + data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_offset, + const int channels, + const int height, + const int width, + const int kernel_h, 
+ const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_im) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = get_gradient_weight( + cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + + +template +__global__ void deformable_col2im_coord_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_im, + const scalar_t* data_offset, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int offset_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_offset) { + CUDA_KERNEL_LOOP(index, n) { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t* data_col_ptr = data_col + + deformable_group_index * channel_per_deformable_group * batch_size * + width_col * height_col; + const scalar_t* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / 
kernel_w * height * width; + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + + +namespace detectron2 { + +void deformable_im2col( + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor data_col) { + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + at::cuda::CUDAGuard device_guard(data_im.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "deformable_im2col_gpu", ([&] { + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + scalar_t* data_col_ = data_col.data_ptr(); + + deformable_im2col_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_im_, + data_offset_, + height, + width, + ksize_h, + ksize_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + parallel_imgs, + channels, + deformable_group, + height_col, + width_col, + data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + + +void deformable_col2im( + const at::Tensor data_col, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const 
int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_im) { + // todo: make sure parallel_imgs is passed in correctly + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = + channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + scalar_t* grad_im_ = grad_im.data_ptr(); + + deformable_col2im_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_offset_, + channels, + height, + width, + ksize_h, + ksize_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + parallel_imgs, + deformable_group, + height_col, + width_col, + grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + + +void deformable_col2im_coord( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_offset) { + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * + deformable_group * parallel_imgs; + int channel_per_deformable_group = + channels * ksize_h * ksize_w / deformable_group; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + scalar_t* grad_offset_ = grad_offset.data_ptr(); + + deformable_col2im_coord_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_im_, + data_offset_, + channels, + height, + width, + ksize_h, + ksize_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + parallel_imgs, + 2 * ksize_h * ksize_w * deformable_group, + deformable_group, + height_col, + width_col, + grad_offset_); + })); +} + +} // namespace detectron2 + + +template +__device__ scalar_t dmcn_im2col_bilinear( + const scalar_t* bottom_data, + const int data_width, + const int height, + const int width, + scalar_t h, + scalar_t w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + 
scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int h, + const int w, + const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int height, + const int width, + const scalar_t* im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel( + const int n, + const scalar_t* data_im, + const scalar_t* data_offset, + const scalar_t* data_mask, + const int height, + const int width, + const int 
kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t* data_col_ptr = data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + // const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * + // height + h_in) * width + w_in; + const scalar_t* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t* data_offset_ptr = data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + const scalar_t* data_mask_ptr = data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + // if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + // const float map_h = i * dilation_h + offset_h; + // const float map_w = j * dilation_w + offset_w; + // const int cur_height = height - h_in; + // const int cur_width = width - w_in; + // val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, + // cur_width, map_h, map_w); + val = dmcn_im2col_bilinear( + data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + // data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_offset, + const scalar_t* data_mask, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_im) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / 
batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const scalar_t* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * + height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight( + cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_im, + const scalar_t* data_offset, + const scalar_t* data_mask, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int offset_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_offset, + scalar_t* grad_mask) { + CUDA_KERNEL_LOOP(index, n) { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t* data_col_ptr = data_col + + deformable_group_index * channel_per_deformable_group * batch_size * + width_col * height_col; + const scalar_t* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height 
* width; + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const scalar_t* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * + height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + dmcn_im2col_bilinear( + data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + grad_mask + [(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + + +namespace detectron2 { + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + at::cuda::CUDAGuard device_guard(data_im.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); 
+ scalar_t* data_col_ = data_col.data_ptr(); + + modulated_deformable_im2col_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_im_, + data_offset_, + data_mask_, + height_im, + width_im, + kernel_h, + kenerl_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + channels, + deformable_group, + height_col, + width_col, + data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf( + "error in modulated_deformable_im2col_cuda: %s\n", + cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_im) { + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = + channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); + scalar_t* grad_im_ = grad_im.data_ptr(); + + modulated_deformable_col2im_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_offset_, + data_mask_, + channels, + height_im, + width_im, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + deformable_group, + height_col, + width_col, + grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf( + "error in modulated_deformable_col2im_cuda: %s\n", + cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, + at::Tensor grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * + kernel_w * deformable_group; + const int channel_per_deformable_group = + channels * kernel_h * kernel_w / deformable_group; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); + scalar_t* grad_offset_ = 
grad_offset.data_ptr<scalar_t>(); + scalar_t* grad_mask_ = grad_mask.data_ptr<scalar_t>(); + + modulated_deformable_col2im_coord_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_im_, + data_offset_, + data_mask_, + channels, + height_im, + width_im, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + 2 * kernel_h * kernel_w * deformable_group, + deformable_group, + height_col, + width_col, + grad_offset_, + grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf( + "error in modulated_deformable_col2im_coord_cuda: %s\n", + cudaGetErrorString(err)); + } +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/nms_rotated/nms_rotated.h b/mmcv/layers/csrc/nms_rotated/nms_rotated.h new file mode 100644 index 0000000..12aca38 --- /dev/null +++ b/mmcv/layers/csrc/nms_rotated/nms_rotated.h @@ -0,0 +1,39 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#pragma once +#include <torch/types.h> + +namespace detectron2 { + +at::Tensor nms_rotated_cpu( + const at::Tensor& dets, + const at::Tensor& scores, + const double iou_threshold); + +#if defined(WITH_CUDA) || defined(WITH_HIP) +at::Tensor nms_rotated_cuda( + const at::Tensor& dets, + const at::Tensor& scores, + const double iou_threshold); +#endif + +// Interface for Python +// inline is needed to prevent multiple function definitions when this header is +// included by different cpps +inline at::Tensor nms_rotated( + const at::Tensor& dets, + const at::Tensor& scores, + const double iou_threshold) { + assert(dets.device().is_cuda() == scores.device().is_cuda()); + if (dets.device().is_cuda()) { +#if defined(WITH_CUDA) || defined(WITH_HIP) + return nms_rotated_cuda( + dets.contiguous(), scores.contiguous(), iou_threshold); +#else + AT_ERROR("Detectron2 is not compiled with GPU support!"); +#endif + } + + return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/nms_rotated/nms_rotated_cpu.cpp b/mmcv/layers/csrc/nms_rotated/nms_rotated_cpu.cpp new file mode 100644 index 0000000..d7556e6 --- /dev/null +++ b/mmcv/layers/csrc/nms_rotated/nms_rotated_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates.
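The CPU kernel that follows implements the usual greedy scheme: walk the boxes in descending score order and keep a box unless an already-kept box overlaps it at or above the threshold. A plain-Python restatement of that control flow (a sketch only; `pairwise_iou` stands in for `single_box_iou_rotated` and is not part of this file):

def greedy_nms(scores, pairwise_iou, iou_threshold):
    """scores: sequence of floats; pairwise_iou(i, j) -> IoU of boxes i and j."""
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    suppressed = [False] * len(scores)
    keep = []
    for pos, i in enumerate(order):
        if suppressed[i]:
            continue
        keep.append(i)
        for j in order[pos + 1:]:
            if not suppressed[j] and pairwise_iou(i, j) >= iou_threshold:
                suppressed[j] = True
    return keep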
+#include "../box_iou_rotated/box_iou_rotated_utils.h" +#include "nms_rotated.h" + +namespace detectron2 { + +template +at::Tensor nms_rotated_cpu_kernel( + const at::Tensor& dets, + const at::Tensor& scores, + const double iou_threshold) { + // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel, + // however, the code in this function is much shorter because + // we delegate the IoU computation for rotated boxes to + // the single_box_iou_rotated function in box_iou_rotated_utils.h + AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor"); + AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor"); + AT_ASSERTM( + dets.scalar_type() == scores.scalar_type(), + "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong)); + } + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); + at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); + + auto suppressed = suppressed_t.data_ptr(); + auto keep = keep_t.data_ptr(); + auto order = order_t.data_ptr(); + + int64_t num_to_keep = 0; + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) { + continue; + } + + keep[num_to_keep++] = i; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) { + continue; + } + + auto ovr = single_box_iou_rotated( + dets[i].data_ptr(), dets[j].data_ptr()); + if (ovr >= iou_threshold) { + suppressed[j] = 1; + } + } + } + return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +} + +at::Tensor nms_rotated_cpu( + // input must be contiguous + const at::Tensor& dets, + const at::Tensor& scores, + const double iou_threshold) { + auto result = at::empty({0}, dets.options()); + + AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] { + result = nms_rotated_cpu_kernel(dets, scores, iou_threshold); + }); + return result; +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/nms_rotated/nms_rotated_cuda.cu b/mmcv/layers/csrc/nms_rotated/nms_rotated_cuda.cu new file mode 100644 index 0000000..2a3db5c --- /dev/null +++ b/mmcv/layers/csrc/nms_rotated/nms_rotated_cuda.cu @@ -0,0 +1,145 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +#include +#include +#include +#include +#ifdef WITH_CUDA +#include "../box_iou_rotated/box_iou_rotated_utils.h" +#endif +// TODO avoid this when pytorch supports "same directory" hipification +#ifdef WITH_HIP +#include "box_iou_rotated/box_iou_rotated_utils.h" +#endif + +using namespace detectron2; + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_rotated_cuda_kernel( + const int n_boxes, + const double iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask) { + // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. 
+ __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +namespace detectron2 { + +at::Tensor nms_rotated_cuda( + // input must be contiguous + const at::Tensor& dets, + const at::Tensor& scores, + double iou_threshold) { + // using scalar_t = float; + AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor"); + AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(dets.device()); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + auto dets_sorted = dets.index_select(0, order_t); + + auto dets_num = dets.size(0); + + const int col_blocks = + at::cuda::ATenCeilDiv(static_cast(dets_num), threadsPerBlock); + + at::Tensor mask = + at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong)); + + dim3 blocks(col_blocks, col_blocks); + dim3 threads(threadsPerBlock); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES( + dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] { + nms_rotated_cuda_kernel<<>>( + dets_num, + iou_threshold, + dets_sorted.data_ptr(), + (unsigned long long*)mask.data_ptr()); + }); + + at::Tensor mask_cpu = mask.to(at::kCPU); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.data_ptr(); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep = + at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data_ptr(); + + int num_to_keep = 0; + for (int i = 0; i < dets_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long* p = mask_host + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + AT_CUDA_CHECK(cudaGetLastError()); + return order_t.index( + {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) + .to(order_t.device(), keep.scalar_type())}); +} + +} // namespace detectron2 diff --git a/mmcv/layers/csrc/vision.cpp b/mmcv/layers/csrc/vision.cpp new file mode 100644 index 0000000..c9a2cd4 --- /dev/null +++ b/mmcv/layers/csrc/vision.cpp @@ -0,0 +1,117 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
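The CUDA path above splits the work: the kernel fills `dev_mask` with one 64-bit word per (box, block-of-64-boxes) pair, where bit k means the row's box overlaps box `col_start * 64 + k` above the threshold, and the host loop then performs the greedy pass with cheap bit operations. A Python sketch of that host-side bookkeeping (assumes `mask` is the flattened per-box bitset array and boxes are already sorted by descending score):

def greedy_keep_from_mask(num_boxes, mask, threads_per_block=64):
    """mask[i * col_blocks + j] is a bitset: bit k set means box i suppresses
    box j * threads_per_block + k."""
    col_blocks = (num_boxes + threads_per_block - 1) // threads_per_block
    removed = [0] * col_blocks            # one bitset word per block of 64 boxes
    keep = []
    for i in range(num_boxes):
        nblock, inblock = divmod(i, threads_per_block)
        if not (removed[nblock] >> inblock) & 1:
            keep.append(i)
            row = mask[i * col_blocks:(i + 1) * col_blocks]
            for j in range(nblock, col_blocks):
                removed[j] |= row[j]      # fold in everything box i suppresses
    return keep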
+ +#include +#include "ROIAlignRotated/ROIAlignRotated.h" +#include "box_iou_rotated/box_iou_rotated.h" +#include "cocoeval/cocoeval.h" +#include "deformable/deform_conv.h" +#include "nms_rotated/nms_rotated.h" + +namespace detectron2 { + +#if defined(WITH_CUDA) || defined(WITH_HIP) +extern int get_cudart_version(); +#endif + +std::string get_cuda_version() { +#if defined(WITH_CUDA) || defined(WITH_HIP) + std::ostringstream oss; + +#if defined(WITH_CUDA) + oss << "CUDA "; +#else + oss << "HIP "; +#endif + + // copied from + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 + auto printCudaStyleVersion = [&](int v) { + oss << (v / 1000) << "." << (v / 10 % 100); + if (v % 10 != 0) { + oss << "." << (v % 10); + } + }; + printCudaStyleVersion(get_cudart_version()); + return oss.str(); +#else // neither CUDA nor HIP + return std::string("not available"); +#endif +} + +bool has_cuda() { +#if defined(WITH_CUDA) + return true; +#else + return false; +#endif +} + +// similar to +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp +std::string get_compiler_version() { + std::ostringstream ss; +#if defined(__GNUC__) +#ifndef __clang__ + +#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8)) +#error "GCC >= 4.9 is required!" +#endif + + { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } +#endif +#endif + +#if defined(__clang_major__) + { + ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." + << __clang_patchlevel__; + } +#endif + +#if defined(_MSC_VER) + { ss << "MSVC " << _MSC_FULL_VER; } +#endif + return ss.str(); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); + m.def("get_cuda_version", &get_cuda_version, "get_cuda_version"); + m.def("has_cuda", &has_cuda, "has_cuda"); + + m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); + m.def( + "deform_conv_backward_input", + &deform_conv_backward_input, + "deform_conv_backward_input"); + m.def( + "deform_conv_backward_filter", + &deform_conv_backward_filter, + "deform_conv_backward_filter"); + m.def( + "modulated_deform_conv_forward", + &modulated_deform_conv_forward, + "modulated_deform_conv_forward"); + m.def( + "modulated_deform_conv_backward", + &modulated_deform_conv_backward, + "modulated_deform_conv_backward"); + + m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate"); + m.def( + "COCOevalEvaluateImages", + &COCOeval::EvaluateImages, + "COCOeval::EvaluateImages"); + pybind11::class_(m, "InstanceAnnotation") + .def(pybind11::init()); + pybind11::class_(m, "ImageEvaluation") + .def(pybind11::init<>()); +} + +TORCH_LIBRARY(detectron2, m) { + m.def("nms_rotated", &nms_rotated); + m.def("box_iou_rotated", &box_iou_rotated); + m.def("roi_align_rotated_forward", &ROIAlignRotated_forward); + m.def("roi_align_rotated_backward", &ROIAlignRotated_backward); +} +} // namespace detectron2 diff --git a/mmcv/layers/deform_conv.py b/mmcv/layers/deform_conv.py new file mode 100644 index 0000000..dffb720 --- /dev/null +++ b/mmcv/layers/deform_conv.py @@ -0,0 +1,514 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
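Once the extension is compiled, the `TORCH_LIBRARY(detectron2, ...)` block above also exposes the rotated-box ops through `torch.ops`. A hedged usage sketch (tensor values are invented; boxes follow the `(x_center, y_center, width, height, angle_degrees)` layout expected by the kernels, and the compiled extension must already be imported so the registrations are loaded):

import torch
# import the built extension first, e.g. `from detectron2 import _C` as done
# at the bottom of deform_conv.py below (assumes a CUDA build)
boxes = torch.tensor([[50., 50., 20., 10., 30.],
                      [51., 49., 20., 10., 32.],
                      [150., 80., 30., 12., -5.]], device="cuda")  # (cx, cy, w, h, angle_deg)
scores = torch.tensor([0.9, 0.8, 0.7], device="cuda")
keep = torch.ops.detectron2.nms_rotated(boxes, scores, 0.5)  # indices of surviving boxes

The Python layer wrappers that follow take the other route and call the pybind bindings (`_C.deform_conv_forward` and friends) directly.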
+import math +from functools import lru_cache +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair +from torchvision.ops import deform_conv2d + +from detectron2.utils.develop import create_dummy_class, create_dummy_func + +from .wrappers import _NewEmptyTensorOp + + +class _DeformConv(Function): + @staticmethod + def forward( + ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64, + ): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim()) + ) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride) + ) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + # TODO: let torchvision support full features of our deformconv. + if deformable_groups != 1: + raise NotImplementedError( + "Deformable Conv with deformable_groups != 1 is not supported on CPUs!" + ) + return deform_conv2d( + input, offset, weight, stride=stride, padding=padding, dilation=dilation + ) + else: + cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step) + assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize" + + _C.deform_conv_forward( + input, + weight, + offset, + output, + ctx.bufs_[0], + ctx.bufs_[1], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + cur_im2col_step, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + raise NotImplementedError("Deformable Conv is not supported on CPUs!") + else: + cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step) + assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize" + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + _C.deform_conv_backward_input( + input, + offset, + grad_output, + grad_input, + grad_offset, + weight, + ctx.bufs_[0], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + cur_im2col_step, + ) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + _C.deform_conv_backward_filter( + input, + offset, + grad_output, + grad_weight, + ctx.bufs_[0], + ctx.bufs_[1], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + 1, + cur_im2col_step, + ) + + return grad_input, grad_offset, grad_weight, None, None, None, None, None, None + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in 
range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + "x".join(map(str, output_size)) + ) + ) + return output_size + + @staticmethod + @lru_cache(maxsize=128) + def _cal_im2col_step(input_size, default_size): + """ + Calculate proper im2col step size, which should be divisible by input_size and not larger + than prefer_size. Meanwhile the step size should be as large as possible to be more + efficient. So we choose the largest one among all divisors of input_size which are smaller + than prefer_size. + :param input_size: input batch size . + :param default_size: default preferred im2col step size. + :return: the largest proper step size. + """ + if input_size <= default_size: + return input_size + best_step = 1 + for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)): + if input_size % step == 0: + if input_size // step <= default_size: + return input_size // step + best_step = step + + return best_step + + +class _ModulatedDeformConv(Function): + @staticmethod + def forward( + ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + ): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError("Deformable Conv is not supported on CPUs!") + if ( + weight.requires_grad + or mask.requires_grad + or offset.requires_grad + or input.requires_grad + ): + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + _C.modulated_deform_conv_forward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + output, + ctx._bufs[1], + weight.shape[2], + weight.shape[3], + ctx.stride, + ctx.stride, + ctx.padding, + ctx.padding, + ctx.dilation, + ctx.dilation, + ctx.groups, + ctx.deformable_groups, + ctx.with_bias, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError("Deformable Conv is not supported on CPUs!") + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + _C.modulated_deform_conv_backward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + ctx._bufs[1], + grad_input, + grad_weight, + grad_bias, + grad_offset, + grad_mask, + grad_output, + weight.shape[2], + weight.shape[3], + ctx.stride, + ctx.stride, + ctx.padding, + ctx.padding, + ctx.dilation, + ctx.dilation, + ctx.groups, + ctx.deformable_groups, + ctx.with_bias, + ) + if not ctx.with_bias: + grad_bias = None + + return ( + grad_input, + grad_offset, + grad_mask, + grad_weight, + grad_bias, + None, + None, + None, + None, + None, + ) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, 
kernel_w = weight.shape[2:4] + height_out = ( + height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1) + ) // ctx.stride + 1 + width_out = ( + width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1) + ) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = _DeformConv.apply +modulated_deform_conv = _ModulatedDeformConv.apply + + +class DeformConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False, + norm=None, + activation=None, + ): + """ + Deformable convolution from :paper:`deformconv`. + + Arguments are similar to :class:`Conv2D`. Extra arguments: + + Args: + deformable_groups (int): number of groups used in deformable convolution. + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + """ + super(DeformConv, self).__init__() + + assert not bias + assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format( + in_channels, groups + ) + assert ( + out_channels % groups == 0 + ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + self.norm = norm + self.activation = activation + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size) + ) + self.bias = None + + nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") + + def forward(self, x, offset): + if x.numel() == 0: + # When input is empty, we want to return a empty tensor with "correct" shape, + # So that the following operations will not panic + # if they check for the shape of the tensor. + # This computes the height and width of the output tensor + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // s + 1 + for i, p, di, k, s in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + x = deform_conv( + x, + offset, + self.weight, + self.stride, + self.padding, + self.dilation, + self.groups, + self.deformable_groups, + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + def extra_repr(self): + tmpstr = "in_channels=" + str(self.in_channels) + tmpstr += ", out_channels=" + str(self.out_channels) + tmpstr += ", kernel_size=" + str(self.kernel_size) + tmpstr += ", stride=" + str(self.stride) + tmpstr += ", padding=" + str(self.padding) + tmpstr += ", dilation=" + str(self.dilation) + tmpstr += ", groups=" + str(self.groups) + tmpstr += ", deformable_groups=" + str(self.deformable_groups) + tmpstr += ", bias=False" + return tmpstr + + +class ModulatedDeformConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True, + norm=None, + activation=None, + ): + """ + Modulated deformable convolution from :paper:`deformconv2`. + + Arguments are similar to :class:`Conv2D`. Extra arguments: + + Args: + deformable_groups (int): number of groups used in deformable convolution. 
+ norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + """ + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + self.norm = norm + self.activation = activation + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, *self.kernel_size) + ) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.bias = None + + nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") + if self.bias is not None: + nn.init.constant_(self.bias, 0) + + def forward(self, x, offset, mask): + if x.numel() == 0: + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // s + 1 + for i, p, di, k, s in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + x = modulated_deform_conv( + x, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + self.deformable_groups, + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + def extra_repr(self): + tmpstr = "in_channels=" + str(self.in_channels) + tmpstr += ", out_channels=" + str(self.out_channels) + tmpstr += ", kernel_size=" + str(self.kernel_size) + tmpstr += ", stride=" + str(self.stride) + tmpstr += ", padding=" + str(self.padding) + tmpstr += ", dilation=" + str(self.dilation) + tmpstr += ", groups=" + str(self.groups) + tmpstr += ", deformable_groups=" + str(self.deformable_groups) + tmpstr += ", bias=" + str(self.with_bias) + return tmpstr + + +try: + from detectron2 import _C +except ImportError: + # TODO: register ops natively so there is no need to import _C. + _msg = "detectron2 is not compiled successfully, please build following the instructions!" + _args = ("detectron2._C", _msg) + DeformConv = create_dummy_class("DeformConv", *_args) + ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args) + deform_conv = create_dummy_func("deform_conv", *_args) + modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args) diff --git a/mmcv/layers/losses.py b/mmcv/layers/losses.py new file mode 100644 index 0000000..850a852 --- /dev/null +++ b/mmcv/layers/losses.py @@ -0,0 +1,133 @@ +import math +import torch + + +def diou_loss( + boxes1: torch.Tensor, + boxes2: torch.Tensor, + reduction: str = "none", + eps: float = 1e-7, +) -> torch.Tensor: + """ + Distance Intersection over Union Loss (Zhaohui Zheng et. al) + https://arxiv.org/abs/1911.08287 + Args: + boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,). + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. 
+ eps (float): small number to prevent division by zero + """ + + x1, y1, x2, y2 = boxes1.unbind(dim=-1) + x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) + + # TODO: use torch._assert_async() when pytorch 1.8 support is dropped + assert (x2 >= x1).all(), "bad box: x1 larger than x2" + assert (y2 >= y1).all(), "bad box: y1 larger than y2" + + # Intersection keypoints + xkis1 = torch.max(x1, x1g) + ykis1 = torch.max(y1, y1g) + xkis2 = torch.min(x2, x2g) + ykis2 = torch.min(y2, y2g) + + intsct = torch.zeros_like(x1) + mask = (ykis2 > ykis1) & (xkis2 > xkis1) + intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) + union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps + iou = intsct / union + + # smallest enclosing box + xc1 = torch.min(x1, x1g) + yc1 = torch.min(y1, y1g) + xc2 = torch.max(x2, x2g) + yc2 = torch.max(y2, y2g) + diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps + + # centers of boxes + x_p = (x2 + x1) / 2 + y_p = (y2 + y1) / 2 + x_g = (x1g + x2g) / 2 + y_g = (y1g + y2g) / 2 + distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2) + + # Eqn. (7) + loss = 1 - iou + (distance / diag_len) + if reduction == "mean": + loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() + elif reduction == "sum": + loss = loss.sum() + + return loss + + +def ciou_loss( + boxes1: torch.Tensor, + boxes2: torch.Tensor, + reduction: str = "none", + eps: float = 1e-7, +) -> torch.Tensor: + """ + Complete Intersection over Union Loss (Zhaohui Zheng et. al) + https://arxiv.org/abs/1911.08287 + Args: + boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,). + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + eps (float): small number to prevent division by zero + """ + + x1, y1, x2, y2 = boxes1.unbind(dim=-1) + x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1) + + # TODO: use torch._assert_async() when pytorch 1.8 support is dropped + assert (x2 >= x1).all(), "bad box: x1 larger than x2" + assert (y2 >= y1).all(), "bad box: y1 larger than y2" + + # Intersection keypoints + xkis1 = torch.max(x1, x1g) + ykis1 = torch.max(y1, y1g) + xkis2 = torch.min(x2, x2g) + ykis2 = torch.min(y2, y2g) + + intsct = torch.zeros_like(x1) + mask = (ykis2 > ykis1) & (xkis2 > xkis1) + intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask]) + union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps + iou = intsct / union + + # smallest enclosing box + xc1 = torch.min(x1, x1g) + yc1 = torch.min(y1, y1g) + xc2 = torch.max(x2, x2g) + yc2 = torch.max(y2, y2g) + diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps + + # centers of boxes + x_p = (x2 + x1) / 2 + y_p = (y2 + y1) / 2 + x_g = (x1g + x2g) / 2 + y_g = (y1g + y2g) / 2 + distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2) + + # width and height of boxes + w_pred = x2 - x1 + h_pred = y2 - y1 + w_gt = x2g - x1g + h_gt = y2g - y1g + v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2) + with torch.no_grad(): + alpha = v / (1 - iou + v + eps) + + # Eqn. 
(10) + loss = 1 - iou + (distance / diag_len) + alpha * v + if reduction == "mean": + loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() + elif reduction == "sum": + loss = loss.sum() + + return loss diff --git a/mmcv/layers/mask_ops.py b/mmcv/layers/mask_ops.py new file mode 100644 index 0000000..990d04a --- /dev/null +++ b/mmcv/layers/mask_ops.py @@ -0,0 +1,275 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import numpy as np +from typing import Tuple +import torch +from PIL import Image +from torch.nn import functional as F + +__all__ = ["paste_masks_in_image"] + + +BYTES_PER_FLOAT = 4 +# TODO: This memory limit may be too much or too little. It would be better to +# determine it based on available resources. +GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit + + +def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True): + """ + Args: + masks: N, 1, H, W + boxes: N, 4 + img_h, img_w (int): + skip_empty (bool): only paste masks within the region that + tightly bound all boxes, and returns the results this region only. + An important optimization for CPU. + + Returns: + if skip_empty == False, a mask of shape (N, img_h, img_w) + if skip_empty == True, a mask of shape (N, h', w'), and the slice + object for the corresponding region. + """ + # On GPU, paste all masks together (up to chunk size) + # by using the entire image to sample the masks + # Compared to pasting them one by one, + # this has more operations but is faster on COCO-scale dataset. + device = masks.device + + if skip_empty and not torch.jit.is_scripting(): + x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to( + dtype=torch.int32 + ) + x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) + y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) + else: + x0_int, y0_int = 0, 0 + x1_int, y1_int = img_w, img_h + x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1 + + N = masks.shape[0] + + img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5 + img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5 + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 + img_x = (img_x - x0) / (x1 - x0) * 2 - 1 + # img_x, img_y have shapes (N, w), (N, h) + + gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) + gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) + grid = torch.stack([gx, gy], dim=3) + + if not torch.jit.is_scripting(): + if not masks.dtype.is_floating_point: + masks = masks.float() + img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False) + + if skip_empty and not torch.jit.is_scripting(): + return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) + else: + return img_masks[:, 0], () + + +# Annotate boxes as Tensor (but not Boxes) in order to use scripting +@torch.jit.script_if_tracing +def paste_masks_in_image( + masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5 +): + """ + Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image. + The location, height, and width for pasting each mask is determined by their + corresponding bounding boxes in boxes. + + Note: + This is a complicated but more accurate implementation. In actual deployment, it is + often enough to use a faster but less accurate implementation. + See :func:`paste_mask_in_image_old` in this file for an alternative implementation. 
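+
+    Example (illustrative usage sketch, not from the original source; shapes and values
+    are assumptions):
+
+        masks = torch.rand(3, 28, 28)                     # 3 instances, 28x28 soft masks
+        boxes = torch.tensor([[10., 20., 100., 120.],
+                              [50., 50., 200., 260.],
+                              [ 0.,  0.,  64.,  64.]])    # XYXY boxes in image coordinates
+        out = paste_masks_in_image(masks, boxes, (480, 640))   # -> (3, 480, 640) bool masks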
+ + Args: + masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of + detected object instances in the image and Hmask, Wmask are the mask width and mask + height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1]. + boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4). + boxes[i] and masks[i] correspond to the same object instance. + image_shape (tuple): height, width + threshold (float): A threshold in [0, 1] for converting the (soft) masks to + binary masks. + + Returns: + img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the + number of detected object instances and Himage, Wimage are the image width + and height. img_masks[i] is a binary mask for object instance i. + """ + + assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported" + N = len(masks) + if N == 0: + return masks.new_empty((0,) + image_shape, dtype=torch.uint8) + if not isinstance(boxes, torch.Tensor): + boxes = boxes.tensor + device = boxes.device + assert len(boxes) == N, boxes.shape + + img_h, img_w = image_shape + + # The actual implementation split the input into chunks, + # and paste them chunk by chunk. + if device.type == "cpu" or torch.jit.is_scripting(): + # CPU is most efficient when they are pasted one by one with skip_empty=True + # so that it performs minimal number of operations. + num_chunks = N + else: + # GPU benefits from parallelism for larger chunks, but may have memory issue + # int(img_h) because shape may be tensors in tracing + num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT)) + assert ( + num_chunks <= N + ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it" + chunks = torch.chunk(torch.arange(N, device=device), num_chunks) + + img_masks = torch.zeros( + N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8 + ) + for inds in chunks: + masks_chunk, spatial_inds = _do_paste_mask( + masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu" + ) + + if threshold >= 0: + masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) + else: + # for visualization and debugging + masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) + + if torch.jit.is_scripting(): # Scripting does not use the optimized codepath + img_masks[inds] = masks_chunk + else: + img_masks[(inds,) + spatial_inds] = masks_chunk + return img_masks + + +# The below are the original paste function (from Detectron1) which has +# larger quantization error. +# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample. + + +def paste_mask_in_image_old(mask, box, img_h, img_w, threshold): + """ + Paste a single mask in an image. + This is a per-box implementation of :func:`paste_masks_in_image`. + This function has larger quantization error due to incorrect pixel + modeling and is not used any more. + + Args: + mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single + object instance. Values are in [0, 1]. + box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners + of the object instance. + img_h, img_w (int): Image height and width. + threshold (float): Mask binarization threshold in [0, 1]. + + Returns: + im_mask (Tensor): + The resized and binarized object mask pasted into the original + image plane (a tensor of shape (img_h, img_w)). 
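+
+    Example (illustrative, not from the original source): given a 28x28 soft ``mask`` and
+    its XYXY ``box`` as described above,
+        im_mask = paste_mask_in_image_old(mask, box, 480, 640, 0.5)   # -> (480, 640) uint8 mask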
+ """ + # Conversion from continuous box coordinates to discrete pixel coordinates + # via truncation (cast to int32). This determines which pixels to paste the + # mask onto. + box = box.to(dtype=torch.int32) # Continuous to discrete coordinate conversion + # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to + # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1 + # pixels (not x1 - x0 pixels). + samples_w = box[2] - box[0] + 1 # Number of pixel samples, *not* geometric width + samples_h = box[3] - box[1] + 1 # Number of pixel samples, *not* geometric height + + # Resample the mask from it's original grid to the new samples_w x samples_h grid + mask = Image.fromarray(mask.cpu().numpy()) + mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR) + mask = np.array(mask, copy=False) + + if threshold >= 0: + mask = np.array(mask > threshold, dtype=np.uint8) + mask = torch.from_numpy(mask) + else: + # for visualization and debugging, we also + # allow it to return an unmodified mask + mask = torch.from_numpy(mask * 255).to(torch.uint8) + + im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, img_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, img_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + return im_mask + + +# Our pixel modeling requires extrapolation for any continuous +# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks, +# we would like this extrapolation to be an interpolation between boundary values and zero, +# instead of using absolute zero or boundary values. +# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this: +# masks, scale = pad_masks(masks[:, 0, :, :], 1) +# boxes = scale_boxes(boxes.tensor, scale) + + +def pad_masks(masks, padding): + """ + Args: + masks (tensor): A tensor of shape (B, M, M) representing B masks. + padding (int): Number of cells to pad on all sides. + + Returns: + The padded masks and the scale factor of the padding size / original size. + """ + B = masks.shape[0] + M = masks.shape[-1] + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_masks = masks.new_zeros((B, M + pad2, M + pad2)) + padded_masks[:, padding:-padding, padding:-padding] = masks + return padded_masks, scale + + +def scale_boxes(boxes, scale): + """ + Args: + boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4 + coords representing the corners x0, y0, x1, y1, + scale (float): The box scaling factor. + + Returns: + Scaled boxes. + """ + w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5 + h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5 + x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5 + y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5 + + w_half *= scale + h_half *= scale + + scaled_boxes = torch.zeros_like(boxes) + scaled_boxes[:, 0] = x_c - w_half + scaled_boxes[:, 2] = x_c + w_half + scaled_boxes[:, 1] = y_c - h_half + scaled_boxes[:, 3] = y_c + h_half + return scaled_boxes + + +@torch.jit.script_if_tracing +def _paste_masks_tensor_shape( + masks: torch.Tensor, + boxes: torch.Tensor, + image_shape: Tuple[torch.Tensor, torch.Tensor], + threshold: float = 0.5, +): + """ + A wrapper of paste_masks_in_image where image_shape is Tensor. + During tracing, shapes might be tensors instead of ints. The Tensor->int + conversion should be scripted rather than traced. 
+ """ + return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold) diff --git a/mmcv/layers/nms.py b/mmcv/layers/nms.py new file mode 100644 index 0000000..1019e7f --- /dev/null +++ b/mmcv/layers/nms.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. + +import torch +from torchvision.ops import boxes as box_ops +from torchvision.ops import nms # noqa . for compatibility + + +def batched_nms( + boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float +): + """ + Same as torchvision.ops.boxes.batched_nms, but with float(). + """ + assert boxes.shape[-1] == 4 + # Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311) + # to decide whether to use coordinate trick or for loop to implement batched_nms. So we + # just call it directly. + # Fp16 does not have enough range for batched NMS, so adding float(). + return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold) + + +# Note: this function (nms_rotated) might be moved into +# torchvision/ops/boxes.py in the future +def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float): + """ + Performs non-maximum suppression (NMS) on the rotated boxes according + to their intersection-over-union (IoU). + + Rotated NMS iteratively removes lower scoring rotated boxes which have an + IoU greater than iou_threshold with another (higher scoring) rotated box. + + Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as + RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they + can be representing completely different objects in certain tasks, e.g., OCR. + + As for the question of whether rotated-NMS should treat them as faraway boxes + even though their IOU is 1, it depends on the application and/or ground truth annotation. + + As an extreme example, consider a single character v and the square box around it. + + If the angle is 0 degree, the object (text) would be read as 'v'; + + If the angle is 90 degrees, the object (text) would become '>'; + + If the angle is 180 degrees, the object (text) would become '^'; + + If the angle is 270/-90 degrees, the object (text) would become '<' + + All of these cases have IoU of 1 to each other, and rotated NMS that only + uses IoU as criterion would only keep one of them with the highest score - + which, practically, still makes sense in most cases because typically + only one of theses orientations is the correct one. Also, it does not matter + as much if the box is only used to classify the object (instead of transcribing + them with a sequential OCR recognition model) later. + + On the other hand, when we use IoU to filter proposals that are close to the + ground truth during training, we should definitely take the angle into account if + we know the ground truth is labeled with the strictly correct orientation (as in, + upside-down words are annotated with -180 degrees even though they can be covered + with a 0/90/-90 degree box, etc.) + + The way the original dataset is annotated also matters. For example, if the dataset + is a 4-point polygon dataset that does not enforce ordering of vertices/orientation, + we can estimate a minimum rotated bounding box to this polygon, but there's no way + we can tell the correct angle with 100% confidence (as shown above, there could be 4 different + rotated boxes, with angles differed by 90 degrees to each other, covering the exactly + same region). 
In that case we have to just use IoU to determine the box + proximity (as many detection benchmarks (even for text) do) unless there're other + assumptions we can make (like width is always larger than height, or the object is not + rotated by more than 90 degrees CCW/CW, etc.) + + In summary, not considering angles in rotated NMS seems to be a good option for now, + but we should be aware of its implications. + + Args: + boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in + (x_center, y_center, width, height, angle_degrees) format. + scores (Tensor[N]): Scores for each one of the rotated boxes + iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold + + Returns: + keep (Tensor): int64 tensor with the indices of the elements that have been kept + by Rotated NMS, sorted in decreasing order of scores + """ + return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold) + + +# Note: this function (batched_nms_rotated) might be moved into +# torchvision/ops/boxes.py in the future + + +@torch.jit.script_if_tracing +def batched_nms_rotated( + boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float +): + """ + Performs non-maximum suppression in a batched fashion. + + Each index value correspond to a category, and NMS + will not be applied between elements of different categories. + + Args: + boxes (Tensor[N, 5]): + boxes where NMS will be performed. They + are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format + scores (Tensor[N]): + scores for each one of the boxes + idxs (Tensor[N]): + indices of the categories for each one of the boxes. + iou_threshold (float): + discards all overlapping boxes + with IoU < iou_threshold + + Returns: + Tensor: + int64 tensor with the indices of the elements that have been kept + by NMS, sorted in decreasing order of scores + """ + assert boxes.shape[-1] == 5 + + if boxes.numel() == 0: + return torch.empty((0,), dtype=torch.int64, device=boxes.device) + boxes = boxes.float() # fp16 does not have enough range for batched NMS + # Strategy: in order to perform NMS independently per class, + # we add an offset to all the boxes. The offset is dependent + # only on the class idx, and is large enough so that boxes + # from different classes do not overlap + + # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate, + # which won't handle negative coordinates correctly. + # Here by using min_coordinate we can make sure the negative coordinates are + # correctly handled. + max_coordinate = ( + torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2 + ).max() + min_coordinate = ( + torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2 + ).min() + offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1) + boxes_for_nms = boxes.clone() # avoid modifying the original values in boxes + boxes_for_nms[:, :2] += offsets[:, None] + keep = nms_rotated(boxes_for_nms, scores, iou_threshold) + return keep diff --git a/mmcv/layers/roi_align.py b/mmcv/layers/roi_align.py new file mode 100644 index 0000000..163462e --- /dev/null +++ b/mmcv/layers/roi_align.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
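+"""
+Usage sketch for the ROIAlign wrapper defined below (illustrative only, not part of the
+original source; the shapes, scale and box values are assumptions):
+
+    import torch
+    pooler = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)
+    feats = torch.randn(2, 256, 50, 50)                   # NCHW feature map
+    rois = torch.tensor([[0., 32., 32., 160., 160.],
+                         [1., 64., 64., 256., 256.]])     # (batch_idx, x1, y1, x2, y2)
+    pooled = pooler(feats, rois)                          # -> (2, 256, 7, 7)
+"""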
+from torch import nn +from torchvision.ops import roi_align + + +# NOTE: torchvision's RoIAlign has a different default aligned=False +class ROIAlign(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True): + """ + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each output + sample. 0 to take samples densely. + aligned (bool): if False, use the legacy implementation in + Detectron. If True, align the results more perfectly. + + Note: + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel indices (in our + pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, + c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled + from the underlying signal at continuous coordinates 0.5 and 1.5). But the original + roi_align (aligned=False) does not subtract the 0.5 when computing neighboring + pixel indices and therefore it uses pixels with a slightly incorrect alignment + (relative to our pixel model) when performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; see + detectron2/tests/test_roi_align.py for verification. + + The difference does not make a difference to the model's performance if + ROIAlign is used together with conv layers. + """ + super().__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + from torchvision import __version__ + + version = tuple(int(x) for x in __version__.split(".")[:2]) + # https://github.com/pytorch/vision/pull/2438 + assert version >= (0, 7), "Require torchvision >= 0.7" + + def forward(self, input, rois): + """ + Args: + input: NCHW images + rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy. + """ + assert rois.dim() == 2 and rois.size(1) == 5 + if input.is_quantized: + input = input.dequantize() + return roi_align( + input, + rois.to(dtype=input.dtype), + self.output_size, + self.spatial_scale, + self.sampling_ratio, + self.aligned, + ) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ", aligned=" + str(self.aligned) + tmpstr += ")" + return tmpstr diff --git a/mmcv/layers/roi_align_rotated.py b/mmcv/layers/roi_align_rotated.py new file mode 100644 index 0000000..2a52399 --- /dev/null +++ b/mmcv/layers/roi_align_rotated.py @@ -0,0 +1,100 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
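+"""
+Usage sketch for the ROIAlignRotated wrapper defined below (illustrative only, not part
+of the original source; it assumes the compiled detectron2 custom ops are available, and
+the shapes/values are made up):
+
+    import torch
+    pooler = ROIAlignRotated(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)
+    feats = torch.randn(1, 256, 50, 50)                      # NCHW feature map
+    rois = torch.tensor([[0., 100., 100., 64., 32., 30.]])   # (batch_idx, x_ctr, y_ctr, w, h, angle_deg)
+    pooled = pooler(feats, rois)                             # -> (1, 256, 7, 7)
+"""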
+import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + + +class _ROIAlignRotated(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): + ctx.save_for_backward(roi) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + ctx.input_shape = input.size() + output = torch.ops.detectron2.roi_align_rotated_forward( + input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + (rois,) = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + sampling_ratio = ctx.sampling_ratio + bs, ch, h, w = ctx.input_shape + grad_input = torch.ops.detectron2.roi_align_rotated_backward( + grad_output, + rois, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + sampling_ratio, + ) + return grad_input, None, None, None, None, None + + +roi_align_rotated = _ROIAlignRotated.apply + + +class ROIAlignRotated(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio): + """ + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each output + sample. 0 to take samples densely. + + Note: + ROIAlignRotated supports continuous coordinate by default: + Given a continuous coordinate c, its two neighboring pixel indices (in our + pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, + c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled + from the underlying signal at continuous coordinates 0.5 and 1.5). + """ + super(ROIAlignRotated, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + + def forward(self, input, rois): + """ + Args: + input: NCHW images + rois: Bx6 boxes. First column is the index into N. + The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees). + """ + assert rois.dim() == 2 and rois.size(1) == 6 + orig_dtype = input.dtype + if orig_dtype == torch.float16: + input = input.float() + rois = rois.float() + output_size = _pair(self.output_size) + + # Scripting for Autograd is currently unsupported. + # This is a quick fix without having to rewrite code on the C++ side + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return torch.ops.detectron2.roi_align_rotated_forward( + input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio + ).to(dtype=orig_dtype) + + return roi_align_rotated( + input, rois, self.output_size, self.spatial_scale, self.sampling_ratio + ).to(dtype=orig_dtype) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ")" + return tmpstr diff --git a/mmcv/layers/rotated_boxes.py b/mmcv/layers/rotated_boxes.py new file mode 100644 index 0000000..03f73b3 --- /dev/null +++ b/mmcv/layers/rotated_boxes.py @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
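+"""
+Usage sketch for pairwise_iou_rotated defined below (illustrative only, not part of the
+original source; it assumes the compiled detectron2 custom ops are available):
+
+    import torch
+    boxes1 = torch.tensor([[10., 10., 8., 4., 0.]])     # (x_ctr, y_ctr, w, h, angle_deg)
+    boxes2 = torch.tensor([[10., 10., 8., 4., 90.],
+                           [20., 20., 8., 4., 0.]])
+    iou = pairwise_iou_rotated(boxes1, boxes2)          # -> Tensor of shape (1, 2)
+"""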
+from __future__ import absolute_import, division, print_function, unicode_literals +import torch + + +def pairwise_iou_rotated(boxes1, boxes2): + """ + Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x_center, y_center, width, height, angle) format. + + Arguments: + boxes1 (Tensor[N, 5]) + boxes2 (Tensor[M, 5]) + + Returns: + iou (Tensor[N, M]): the NxM matrix containing the pairwise + IoU values for every element in boxes1 and boxes2 + """ + return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2) diff --git a/mmcv/layers/shape_spec.py b/mmcv/layers/shape_spec.py new file mode 100644 index 0000000..8dac3c5 --- /dev/null +++ b/mmcv/layers/shape_spec.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class ShapeSpec: + """ + A simple structure that contains basic shape specification about a tensor. + It is often used as the auxiliary inputs/outputs of models, + to complement the lack of shape inference ability among pytorch modules. + """ + + channels: Optional[int] = None + height: Optional[int] = None + width: Optional[int] = None + stride: Optional[int] = None diff --git a/mmcv/layers/wrappers.py b/mmcv/layers/wrappers.py new file mode 100644 index 0000000..c9d63f1 --- /dev/null +++ b/mmcv/layers/wrappers.py @@ -0,0 +1,162 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +""" +Wrappers around on some nn functions, mainly to support empty tensors. + +Ideally, add support directly in PyTorch to empty tensors in those functions. + +These can be removed once https://github.com/pytorch/pytorch/issues/12013 +is implemented +""" + +import warnings +from typing import List, Optional +import torch +from torch.nn import functional as F + +TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) + + +def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor: + """ + Turn a list of integer scalars or integer Tensor scalars into a vector, + in a way that's both traceable and scriptable. + + In tracing, `x` should be a list of scalar Tensor, so the output can trace to the inputs. + In scripting or eager, `x` should be a list of int. + """ + if torch.jit.is_scripting(): + return torch.as_tensor(x, device=device) + if torch.jit.is_tracing(): + assert all( + [isinstance(t, torch.Tensor) for t in x] + ), "Shape should be tensor during tracing!" + # as_tensor should not be used in tracing because it records a constant + ret = torch.stack(x) + if ret.device != device: # avoid recording a hard-coded device if not necessary + ret = ret.to(device=device) + return ret + return torch.as_tensor(x, device=device) + + +def check_if_dynamo_compiling(): + if TORCH_VERSION >= (1, 14): + from torch._dynamo import is_compiling + + return is_compiling() + else: + return False + + +def cat(tensors: List[torch.Tensor], dim: int = 0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +def empty_input_loss_func_wrapper(loss_func): + def wrapped_loss_func(input, target, *, reduction="mean", **kwargs): + """ + Same as `loss_func`, but returns 0 (instead of nan) for empty inputs. 
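+
+        For example, with an empty batch, ``cross_entropy(logits[:0], labels[:0])`` (the
+        wrapped version defined below) returns a graph-connected zero, whereas plain
+        ``F.cross_entropy`` with ``reduction="mean"`` would yield NaN.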
+ """ + if target.numel() == 0 and reduction == "mean": + return input.sum() * 0.0 # connect the gradient + return loss_func(input, target, reduction=reduction, **kwargs) + + return wrapped_loss_func + + +cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy) + + +class _NewEmptyTensorOp(torch.autograd.Function): + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return _NewEmptyTensorOp.apply(grad, shape), None + + +class Conv2d(torch.nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + + It assumes that norm layer is used before activation. + """ + norm = kwargs.pop("norm", None) + activation = kwargs.pop("activation", None) + super().__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + # torchscript does not support SyncBatchNorm yet + # https://github.com/pytorch/pytorch/issues/40507 + # and we skip these codes in torchscript since: + # 1. currently we only support torchscript in evaluation mode + # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or + # later version, `Conv2d` in these PyTorch versions has already supported empty inputs. + if not torch.jit.is_scripting(): + # Dynamo doesn't support context managers yet + is_dynamo_compiling = check_if_dynamo_compiling() + if not is_dynamo_compiling: + with warnings.catch_warnings(record=True): + if x.numel() == 0 and self.training: + # https://github.com/pytorch/pytorch/issues/12013 + assert not isinstance( + self.norm, torch.nn.SyncBatchNorm + ), "SyncBatchNorm does not support empty inputs!" + + x = F.conv2d( + x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +ConvTranspose2d = torch.nn.ConvTranspose2d +BatchNorm2d = torch.nn.BatchNorm2d +interpolate = F.interpolate +Linear = torch.nn.Linear + + +def nonzero_tuple(x): + """ + A 'as_tuple=True' version of torch.nonzero to support torchscript. + because of https://github.com/pytorch/pytorch/issues/38718 + """ + if torch.jit.is_scripting(): + if x.dim() == 0: + return x.unsqueeze(0).nonzero().unbind(1) + return x.nonzero().unbind(1) + else: + return x.nonzero(as_tuple=True) + + +@torch.jit.script_if_tracing +def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor: + """ + Tracing friendly way to cast tensor to another tensor's device. Device will be treated + as constant during tracing, scripting the casting process as whole can workaround this issue. 
+ """ + return src.to(dst.device) diff --git a/mmcv/losses/__init__.py b/mmcv/losses/__init__.py new file mode 100644 index 0000000..ed8a93b --- /dev/null +++ b/mmcv/losses/__init__.py @@ -0,0 +1,7 @@ +from .track_loss import ClipMatcher +from .dice_loss import DiceLoss +from .occflow_loss import * +from .traj_loss import TrajLoss +from .planning_loss import PlanningLoss, CollisionLoss +from .fvcore_smooth_l1_loss import smooth_l1_loss +from .focal_loss import sigmoid_focal_loss \ No newline at end of file diff --git a/mmcv/losses/dice_loss.py b/mmcv/losses/dice_loss.py new file mode 100644 index 0000000..39dd231 --- /dev/null +++ b/mmcv/losses/dice_loss.py @@ -0,0 +1,61 @@ +import torch +import torch +import torch.nn as nn + +from mmcv.models.losses.utils import weighted_loss +from mmcv.models.builder import LOSSES + +@weighted_loss +def dice_loss(input, target,mask=None,eps=0.001): + N,H,W = input.shape + + input = input.contiguous().view(N, H*W) + target = target.contiguous().view(N, H*W).float() + if mask is not None: + mask = mask.contiguous().view(N, H*W).float() + input = input * mask + target = target * mask + a = torch.sum(input * target, 1) + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + return 1 - d + +@LOSSES.register_module() +class DiceLoss(nn.Module): + + def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): + super(DiceLoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + self.count = 0 + def forward(self, + pred, + target, + weight=None, + mask=None, + avg_factor=None, + reduction_override=None, + **kwargs): + + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + #if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n,w,h) to (n,) to match the + # giou_loss of shape (n,) + #assert weight.shape == pred.shape + #weight = weight.mean((-2,-1)) + loss = self.loss_weight * dice_loss( + pred, + target, + weight, + mask=mask, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + #print('DiceLoss',loss, avg_factor) + return loss diff --git a/mmcv/losses/focal_loss.py b/mmcv/losses/focal_loss.py new file mode 100644 index 0000000..d4f357c --- /dev/null +++ b/mmcv/losses/focal_loss.py @@ -0,0 +1,105 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +# pyre-strict + +import torch +from torch.nn import functional as F + + +def sigmoid_focal_loss( + inputs: torch.Tensor, + targets: torch.Tensor, + alpha: float = -1, + gamma: float = 2, + reduction: str = "none", +) -> torch.Tensor: + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + Returns: + Loss tensor with the reduction option applied. 
+ """ + inputs = inputs.float() + targets = targets.float() + p = torch.sigmoid(inputs) + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = p * targets + (1 - p) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + + return loss + + +# pyre-fixme[9]: sigmoid_focal_loss_jit has type `ScriptModule`; used as +# `ScriptFunction[..., typing.Any]`. +sigmoid_focal_loss_jit: "torch.jit.ScriptModule" = torch.jit.script(sigmoid_focal_loss) + + +def sigmoid_focal_loss_star( + inputs: torch.Tensor, + targets: torch.Tensor, + alpha: float = -1, + gamma: float = 1, + reduction: str = "none", +) -> torch.Tensor: + """ + FL* described in RetinaNet paper Appendix: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Gamma parameter described in FL*. Default = 1 (no weighting). + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + Returns: + Loss tensor with the reduction option applied. + """ + inputs = inputs.float() + targets = targets.float() + shifted_inputs = gamma * (inputs * (2 * targets - 1)) + loss = -(F.logsigmoid(shifted_inputs)) / gamma + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss *= alpha_t + + if reduction == "mean": + loss = loss.mean() + elif reduction == "sum": + loss = loss.sum() + + return loss + + +# pyre-fixme[9]: sigmoid_focal_loss_star_jit has type `ScriptModule`; used as +# `ScriptFunction[..., typing.Any]`. +sigmoid_focal_loss_star_jit: "torch.jit.ScriptModule" = torch.jit.script( + sigmoid_focal_loss_star +) \ No newline at end of file diff --git a/mmcv/losses/fvcore_smooth_l1_loss.py b/mmcv/losses/fvcore_smooth_l1_loss.py new file mode 100644 index 0000000..df4f541 --- /dev/null +++ b/mmcv/losses/fvcore_smooth_l1_loss.py @@ -0,0 +1,76 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +# pyre-strict + +import torch + + +def smooth_l1_loss( + input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none" +) -> torch.Tensor: + """ + Smooth L1 loss defined in the Fast R-CNN paper as: + :: + | 0.5 * x ** 2 / beta if abs(x) < beta + smoothl1(x) = | + | abs(x) - 0.5 * beta otherwise, + + where x = input - target. + + Smooth L1 loss is related to Huber loss, which is defined as: + :: + | 0.5 * x ** 2 if abs(x) < beta + huber(x) = | + | beta * (abs(x) - 0.5 * beta) otherwise + + Smooth L1 loss is equal to huber(x) / beta. This leads to the following + differences: + + - As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss + converges to a constant 0 loss. + - As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss + converges to L2 loss. + - For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant + slope of 1. For Huber loss, the slope of the L1 segment is beta. 
+ + Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta + portion replaced with a quadratic function such that at abs(x) = beta, its + slope is 1. The quadratic segment smooths the L1 loss near x = 0. + + Args: + input (Tensor): input tensor of any shape + target (Tensor): target value tensor with the same shape as input + beta (float): L1 to L2 change point. + For beta values < 1e-5, L1 loss is computed. + reduction: 'none' | 'mean' | 'sum' + 'none': No reduction will be applied to the output. + 'mean': The output will be averaged. + 'sum': The output will be summed. + + Returns: + The loss with the reduction option applied. + + Note: + PyTorch's builtin "Smooth L1 loss" implementation does not actually + implement Smooth L1 loss, nor does it implement Huber loss. It implements + the special case of both in which they are equal (beta=1). + See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss. + """ + if beta < 1e-5: + # if beta == 0, then torch.where will result in nan gradients when + # the chain rule is applied due to pytorch implementation details + # (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of + # zeros, rather than "no gradient"). To avoid this issue, we define + # small values of beta to be exactly l1 loss. + loss = torch.abs(input - target) + else: + n = torch.abs(input - target) + cond = n < beta + # pyre-fixme[58]: `**` is not supported for operand types `Tensor` and `int`. + loss = torch.where(cond, 0.5 * n**2 / beta, n - 0.5 * beta) + + if reduction == "mean": + loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum() + elif reduction == "sum": + loss = loss.sum() + return loss \ No newline at end of file diff --git a/mmcv/losses/occflow_loss.py b/mmcv/losses/occflow_loss.py new file mode 100644 index 0000000..ff80306 --- /dev/null +++ b/mmcv/losses/occflow_loss.py @@ -0,0 +1,226 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. 
# +# Modified from Fiery (https://github.com/wayveai/fiery) # +#---------------------------------------------------------------------------------# +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from mmcv.models.builder import LOSSES +from mmcv.models.losses.utils import weight_reduce_loss + +@LOSSES.register_module() +class FieryBinarySegmentationLoss(nn.Module): + def __init__(self, use_top_k=False, top_k_ratio=1.0, future_discount=1.0, loss_weight=1.0, ignore_index=255): + super().__init__() + self.use_top_k = use_top_k + self.top_k_ratio = top_k_ratio + self.future_discount = future_discount + self.loss_weight = loss_weight + self.ignore_index = ignore_index + + def forward(self, prediction, target, frame_mask=None): + n_gt, s, h, w = prediction.size() + assert prediction.size() == target.size(), f"{prediction.size()}, {target.size()}" + + # Deal target > 1 (ignore_index) + keep_mask = (target.long() != self.ignore_index).float() + target = target * keep_mask + + loss = F.binary_cross_entropy_with_logits( + prediction, + target.float(), + reduction='none', + ) + assert loss.size() == prediction.size(), f"{loss.size()}, {prediction.size()}" + + # Deal ignore_index + if self.ignore_index is not None: + # keep_mask = (target.long() != self.ignore_index).float() + loss = loss * keep_mask + + # Filter out losses of invalid future sample + if frame_mask is not None: + assert frame_mask.size(0) == s, f"{frame_mask.size()}" + if frame_mask.sum().item() == 0: + return prediction.sum() * 0. + frame_mask = frame_mask.view(1, s, 1, 1) + loss = loss * frame_mask.float() + + future_discounts = self.future_discount ** torch.arange(s, device=loss.device, dtype=loss.dtype) + future_discounts = future_discounts.view(1, s, 1, 1) + loss = loss * future_discounts + + loss = loss.view(n_gt, s, -1) + if self.use_top_k: + # Penalises the top-k hardest pixels + k = int(self.top_k_ratio * loss.shape[2]) + loss, _ = torch.sort(loss, dim=2, descending=True) + loss = loss[:, :, :k] + + return self.loss_weight * torch.mean(loss) + +def dice_loss(pred, + target, + weight=None, + eps=1e-3, + reduction='mean', + naive_dice=False, + avg_factor=None, + ignore_index=None, + frame_mask=None): + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. + - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. 
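+        ignore_index (int, optional): Label value treated as invalid and masked out
+            before computing the loss. Defaults to None.
+        frame_mask (torch.Tensor, optional): Validity mask over the s future frames,
+            shape (s,). Defaults to None.
+
+    Note (added for clarity; the example shape is illustrative): despite the "(n, *)"
+    wording above, this implementation expects 4-D inputs, e.g. pred and target of
+    shape (n, s, h, w) such as (2, 5, 200, 200).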
+ """ + n, s, h, w = pred.size() + assert pred.size() == target.size(), \ + f"{pred.size()}, {target.size()}" + + # Ignore invalid index(255) + if ignore_index is not None: + keep_mask = (target.long() != ignore_index) + target = target * keep_mask.float() + pred = pred * keep_mask.float() + + # Ignore invalid frame + if frame_mask is not None: + assert frame_mask.size(0) == s, f"{frame_mask.size()}" + if frame_mask.sum().item() == 0: + return pred.sum() * 0. + frame_mask = frame_mask.view(1, s, 1, 1) + target = target * frame_mask.float() + pred = pred * frame_mask.float() + + input = pred.flatten(1) + target = target.flatten(1).float() + + a = torch.sum(input * target, 1) + if naive_dice: + b = torch.sum(input, 1) + c = torch.sum(target, 1) + d = (2 * a + eps) / (b + c + eps) + else: + b = torch.sum(input * input, 1) + eps + c = torch.sum(target * target, 1) + eps + d = (2 * a) / (b + c) + + loss = 1 - d + if weight is not None: + assert weight.ndim == loss.ndim + assert len(weight) == len(pred) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + +@LOSSES.register_module() +class DiceLossWithMasks(nn.Module): + def __init__(self, + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=False, + loss_weight=1.0, + ignore_index=255, + eps=1e-3): + """Compute dice loss. + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + activate (bool): Whether to activate the predictions inside, + this will disable the inside sigmoid operation. + Defaults to True. + reduction (str, optional): The method used + to reduce the loss. Options are "none", + "mean" and "sum". Defaults to 'mean'. + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power. Defaults to False. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + eps (float): Avoid dividing by zero. Defaults to 1e-3. + """ + + super(DiceLossWithMasks, self).__init__() + self.use_sigmoid = use_sigmoid + self.reduction = reduction + self.naive_dice = naive_dice + self.loss_weight = loss_weight + self.eps = eps + self.activate = activate + self.ignore_index = ignore_index + + def forward(self, + pred, + target, + weight=None, + reduction_override=None, + avg_factor=None, + frame_mask=None + ): + """Forward function. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *). + target (torch.Tensor): The label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". 
+ + Returns: + torch.Tensor: The calculated loss + """ + + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if self.activate: + if self.use_sigmoid: + pred = pred.sigmoid() + else: + raise NotImplementedError + + loss = self.loss_weight * dice_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + naive_dice=self.naive_dice, + avg_factor=avg_factor, + ignore_index=self.ignore_index, + frame_mask=frame_mask) + + return loss \ No newline at end of file diff --git a/mmcv/losses/planning_loss.py b/mmcv/losses/planning_loss.py new file mode 100644 index 0000000..6b6fbfc --- /dev/null +++ b/mmcv/losses/planning_loss.py @@ -0,0 +1,77 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple +import pickle +from mmcv.models import LOSSES + + +@LOSSES.register_module() +class PlanningLoss(nn.Module): + def __init__(self, loss_type='L2'): + super(PlanningLoss, self).__init__() + self.loss_type = loss_type + + def forward(self, sdc_traj, gt_sdc_fut_traj, mask): + err = sdc_traj[..., :2] - gt_sdc_fut_traj[..., :2] + err = torch.pow(err, exponent=2) + err = torch.sum(err, dim=-1) + err = torch.pow(err, exponent=0.5) + return torch.sum(err * mask)/(torch.sum(mask) + 1e-5) + + +@LOSSES.register_module() +class CollisionLoss(nn.Module): + def __init__(self, delta=0.5, weight=1.0): + super(CollisionLoss, self).__init__() + self.w = 1.85 + delta + self.h = 4.084 + delta + self.weight = weight + + def forward(self, sdc_traj_all, sdc_planning_gt, sdc_planning_gt_mask, future_gt_bbox): + # sdc_traj_all (1, 6, 2) + # sdc_planning_gt (1,6,3) + # sdc_planning_gt_mask (1, 6) + # future_gt_bbox 6x[lidarboxinstance] + n_futures = len(future_gt_bbox) + inter_sum = sdc_traj_all.new_zeros(1, ) + dump_sdc = [] + for i in range(n_futures): + if len(future_gt_bbox[i].tensor) > 0: + future_gt_bbox_corners = future_gt_bbox[i].corners[:, [0,3,4,7], :2] # (N, 8, 3) -> (N, 4, 2) only bev + # sdc_yaw = -sdc_planning_gt[0, i, 2].to(sdc_traj_all.dtype) - 1.5708 + sdc_yaw = sdc_planning_gt[0, i, 2].to(sdc_traj_all.dtype) + sdc_bev_box = self.to_corners([sdc_traj_all[0, i, 0], sdc_traj_all[0, i, 1], self.w, self.h, sdc_yaw]) + dump_sdc.append(sdc_bev_box.cpu().detach().numpy()) + for j in range(future_gt_bbox_corners.shape[0]): + inter_sum += self.inter_bbox(sdc_bev_box, future_gt_bbox_corners[j].to(sdc_traj_all.device)) + return inter_sum * self.weight + + def inter_bbox(self, corners_a, corners_b): + xa1, ya1 = torch.max(corners_a[:, 0]), torch.max(corners_a[:, 1]) + xa2, ya2 = torch.min(corners_a[:, 0]), torch.min(corners_a[:, 1]) + xb1, yb1 = torch.max(corners_b[:, 0]), torch.max(corners_b[:, 1]) + xb2, yb2 = torch.min(corners_b[:, 0]), torch.min(corners_b[:, 1]) + + xi1, yi1 = min(xa1, xb1), min(ya1, yb1) + xi2, yi2 = max(xa2, xb2), max(ya2, yb2) + intersect = max((xi1 - xi2), xi1.new_zeros(1, ).to(xi1.device)) * max((yi1 - yi2), xi1.new_zeros(1,).to(xi1.device)) + return intersect + + def to_corners(self, bbox): + x, y, w, l, theta = bbox + corners = torch.tensor([ + [w/2, -l/2], [w/2, l/2], [-w/2, l/2], 
[-w/2,-l/2] + ]).to(x.device) # 4,2 + rot_mat = torch.tensor( + [[torch.cos(theta), torch.sin(theta)], + [-torch.sin(theta), torch.cos(theta)]] + ).to(x.device) + new_corners = rot_mat @ corners.T + torch.tensor(bbox[:2])[:, None].to(x.device) + return new_corners.T \ No newline at end of file diff --git a/mmcv/losses/track_loss.py b/mmcv/losses/track_loss.py new file mode 100644 index 0000000..603116f --- /dev/null +++ b/mmcv/losses/track_loss.py @@ -0,0 +1,619 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +# Modified from MOTR (https://github.com/megvii-research/MOTR) # +#---------------------------------------------------------------------------------# + +import copy +from distutils.command.build import build +import math +from xmlrpc.client import Boolean +import numpy as np +import torch +import torch.nn.functional as F +import torch.distributed as dist +import torch.nn as nn +from typing import List +from mmcv.models.dense_heads.track_head_plugin import Instances +from mmcv.core import build_assigner +from mmcv.models import build_loss +from mmcv.models.builder import LOSSES +from mmcv.core import reduce_mean +from mmcv.core.bbox.iou_calculators.iou3d_calculator import ( + bbox_overlaps_nearest_3d as iou_3d, ) +from mmcv.core.bbox.util import denormalize_bbox + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +@torch.no_grad() +def accuracy(output, target, topk=(1, )): + """Computes the precision@k for the specified values of k""" + if target.numel() == 0: + return [torch.zeros([], device=output.device)] + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].view(-1).float().sum(0) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +@LOSSES.register_module() +class ClipMatcher(nn.Module): + def __init__( + self, + num_classes, + weight_dict, + code_weights=[ + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2, 0.2 + ], + loss_past_traj_weight=1.0, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + ), + loss_cls=dict(type="FocalLoss", + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=2.0), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + ): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + weight_dict: dict containing as key the names of the losses and as values their relative weight. 
+ eos_coef: relative classification weight applied to the no-object category + """ + super().__init__() + self.num_classes = num_classes + self.matcher = build_assigner(assigner) + self.loss_cls = build_loss(loss_cls) + self.loss_bboxes = build_loss(loss_bbox) + self.loss_predictions = nn.SmoothL1Loss(reduction="none", beta=1.0) + self.register_buffer("code_weights", + torch.tensor(code_weights, requires_grad=False)) + + self.weight_dict = weight_dict + self.loss_past_traj_weight = loss_past_traj_weight + # self.losses = ['labels', 'boxes', 'cardinality'] + self.losses = ["labels", "boxes", "past_trajs"] + self.focal_loss = True + self.losses_dict = {} + self._current_frame_idx = 0 + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat( + [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def initialize_for_single_clip(self, gt_instances: List[Instances]): + self.gt_instances = gt_instances + self.num_samples = 0 + self.sample_device = None + self._current_frame_idx = 0 + self.losses_dict = {} + + def _step(self): + self._current_frame_idx += 1 + + def calc_loss_for_track_scores(self, track_instances: Instances): + frame_id = self._current_frame_idx - 1 + gt_instances = self.gt_instances[frame_id] + outputs = { + "pred_logits": track_instances.track_scores[None], + } + device = track_instances.track_scores.device + + num_tracks = len(track_instances) + src_idx = torch.arange(num_tracks, dtype=torch.long, device=device) + tgt_idx = (track_instances.matched_gt_idxes + ) # -1 for FP tracks and disappeared tracks + + track_losses = self.get_loss( + "labels", + outputs=outputs, + gt_instances=[gt_instances], + indices=[(src_idx, tgt_idx)], + num_boxes=1, + ) + self.losses_dict.update({ + "frame_{}_track_{}".format(frame_id, key): value + for key, value in track_losses.items() + }) + + def get_num_boxes(self, num_samples): + num_boxes = torch.as_tensor(num_samples, + dtype=torch.float, + device=self.sample_device) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_boxes) + num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() + return num_boxes + + @torch.no_grad() + def loss_cardinality(self, outputs, targets, indices): + """Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients + """ + pred_logits = outputs["pred_logits"] + device = pred_logits.device + tgt_lengths = torch.as_tensor([len(v.labels) for v in targets], + device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (pred_logits.argmax(-1) != + pred_logits.shape[-1] - 1).sum(1) + card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + def get_loss(self, loss, outputs, gt_instances, indices, **kwargs): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + "past_trajs": self.loss_past_trajs, + } + assert loss in loss_map, f"do you really want to compute {loss} loss?" 
+ return loss_map[loss](outputs, gt_instances, indices, **kwargs) + + def loss_past_trajs(self, outputs, gt_instances: List[Instances], + indices: List[tuple]): + # We ignore the regression loss of the track-disappear slots. + # TODO: Make this filter process more elegant. + filtered_idx = [] + for src_per_img, tgt_per_img in indices: + keep = tgt_per_img != -1 + filtered_idx.append((src_per_img[keep], tgt_per_img[keep])) + indices = filtered_idx + idx = self._get_src_permutation_idx(indices) + src_trajs = outputs["pred_past_trajs"][idx] + target_trajs = torch.cat( + [ + gt_per_img.past_traj[i] + for gt_per_img, (_, i) in zip(gt_instances, indices) + ], + dim=0, + ) + target_trajs_mask = torch.cat( + [ + gt_per_img.past_traj_mask[i] + for gt_per_img, (_, i) in zip(gt_instances, indices) + ], + dim=0, + ) + + # for pad target, don't calculate regression loss, judged by whether obj_id=-1 + target_obj_ids = torch.cat( + [ + gt_per_img.obj_ids[i] + for gt_per_img, (_, i) in zip(gt_instances, indices) + ], + dim=0, + ) # size(16) + # [num_matched] + mask = target_obj_ids != -1 + loss_trajs = self.compute_past_traj_loss(src_trajs[mask], target_trajs[mask], target_trajs_mask[mask]) + losses = {} + losses["loss_past_trajs"] = loss_trajs * self.loss_past_traj_weight + return losses + + def compute_past_traj_loss(self, src, tgt, tgt_mask): + loss = torch.abs(src - tgt) * tgt_mask + return torch.sum(loss)/ (torch.sum(tgt_mask>0) + 1e-5) + + def loss_boxes(self, outputs, gt_instances: List[Instances], + indices: List[tuple]): + """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss + targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] + The target boxes are expected in format (center_x, center_y, h, w), normalized by the image size. + """ + # We ignore the regression loss of the track-disappear slots. + # TODO: Make this filter process more elegant. 
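+ # A matched_gt index of -1 marks a disappeared (or never matched) track;
+ # such pairs are dropped here so they contribute no box regression loss.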
+ filtered_idx = [] + for src_per_img, tgt_per_img in indices: + keep = tgt_per_img != -1 + filtered_idx.append((src_per_img[keep], tgt_per_img[keep])) + indices = filtered_idx + idx = self._get_src_permutation_idx(indices) + src_boxes = outputs["pred_boxes"][idx] + sdc_boxes = outputs["pred_sdc_boxes"][0, -1:] + target_sdc_boxes = gt_instances[0].sdc_boxes[:1] + target_boxes = torch.cat( + [ + gt_per_img.boxes[i] + for gt_per_img, (_, i) in zip(gt_instances, indices) + ], + dim=0, + ) + + src_boxes = torch.cat([src_boxes, sdc_boxes], dim=0) + target_boxes = torch.cat([target_boxes, target_sdc_boxes], dim=0) + + # for pad target, don't calculate regression loss, judged by whether obj_id=-1 + target_obj_ids = torch.cat( + [ + gt_per_img.obj_ids[i] + for gt_per_img, (_, i) in zip(gt_instances, indices) + ], + dim=0, + ) + # [num_matched] + + target_obj_ids = torch.cat([target_obj_ids, torch.zeros(1).to(target_obj_ids.device)], dim=0) + mask = target_obj_ids != -1 + bbox_weights = torch.ones_like(target_boxes) * self.code_weights + avg_factor = src_boxes[mask].size(0) + avg_factor = reduce_mean(target_boxes.new_tensor([avg_factor])) + loss_bbox = self.loss_bboxes( + src_boxes[mask], + target_boxes[mask], + bbox_weights[mask], + avg_factor=avg_factor.item(), + ) + + losses = {} + losses["loss_bbox"] = loss_bbox + + return losses + + def loss_labels(self, + outputs, + gt_instances: List[Instances], + indices, + log=False): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + + indices: [(src_idx, tgt_idx)] + """ + # [bs=1, num_query, num_classes] + src_logits = outputs["pred_logits"] + sdc_logits = outputs["pred_sdc_logits"] + # batch_idx, src_idx + idx = self._get_src_permutation_idx(indices) + # [bs, num_query] + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + # The matched gt for disappear track query is set -1. + labels = [] + for gt_per_img, (_, J) in zip(gt_instances, indices): + labels_per_img = torch.ones_like(J) * self.num_classes + # set labels of track-appear slots to num_classes + if len(gt_per_img) > 0: + labels_per_img[J != -1] = gt_per_img.labels[J[J != -1]] + labels.append(labels_per_img) + # [num_matched] + target_classes_o = torch.cat(labels) + # [bs, num_query] + target_classes[idx] = target_classes_o + target_sdc_classes = gt_instances[0].sdc_labels[0:1].unsqueeze(0) + if sdc_logits is not None: + src_logits = torch.cat([src_logits, sdc_logits], dim=1) + target_classes = torch.cat([target_classes, target_sdc_classes], dim=1) + label_weights = torch.ones_like(target_classes) + # float tensor + avg_factor = target_classes_o.numel( + ) # pos + mathced gt for disapper track + avg_factor += 1 # sdc + + avg_factor = reduce_mean(src_logits.new_tensor([avg_factor])) + loss_ce = self.loss_cls( + src_logits.flatten(0, 1), + target_classes.flatten(0), + label_weights.flatten(0), + avg_factor, + ) + + losses = {"loss_cls": loss_ce} + + if log: + # TODO this should probably be a separate loss, not hacked in this one here + losses["class_error"] = 100 - accuracy(src_logits[idx], + target_classes_o)[0] + + return losses + + def match_for_single_frame(self, + outputs: dict, + dec_lvl: int, + if_step=False, + ): + outputs_without_aux = { + k: v + for k, v in outputs.items() if k != "aux_outputs" + } + + gt_instances_i = self.gt_instances[ + self._current_frame_idx] # gt instances of i-th image. 
+ track_instances: Instances = outputs_without_aux["track_instances"] + pred_logits_i = track_instances.pred_logits + pred_boxes_i = track_instances.pred_boxes + # modified the hard code, 900:901, sdc query + pred_sdc_logits_i = track_instances.pred_logits[900:901].unsqueeze(0) + pred_sdc_boxes_i = track_instances.pred_boxes[900:901].unsqueeze(0) + # -2 means the sdc query in this code + track_instances.obj_idxes[900]=-2 + pred_past_trajs_i = track_instances.pred_past_trajs # predicted past trajs of i-th image. + + obj_idxes = gt_instances_i.obj_ids + obj_idxes_list = obj_idxes.detach().cpu().numpy().tolist() + obj_idx_to_gt_idx = { + obj_idx: gt_idx + for gt_idx, obj_idx in enumerate(obj_idxes_list) + } + outputs_i = { + "pred_logits": pred_logits_i.unsqueeze(0), + "pred_sdc_logits": pred_sdc_logits_i, + "pred_boxes": pred_boxes_i.unsqueeze(0), + "pred_sdc_boxes": pred_sdc_boxes_i, + "pred_past_trajs": pred_past_trajs_i.unsqueeze(0), + } + # step1. inherit and update the previous tracks. + num_disappear_track = 0 + for j in range(len(track_instances)): + obj_id = track_instances.obj_idxes[j].item() + # set new target idx. + if obj_id >= 0: + if obj_id in obj_idx_to_gt_idx: + track_instances.matched_gt_idxes[j] = obj_idx_to_gt_idx[ + obj_id] + else: + num_disappear_track += 1 + track_instances.matched_gt_idxes[ + j] = -1 # track-disappear case. + else: + track_instances.matched_gt_idxes[j] = -1 + + full_track_idxes = torch.arange( + len(track_instances), dtype=torch.long).to(pred_logits_i.device) + # previsouly tracked, which is matched by rule + matched_track_idxes = track_instances.obj_idxes >= 0 + prev_matched_indices = torch.stack( + [ + full_track_idxes[matched_track_idxes], + track_instances.matched_gt_idxes[matched_track_idxes], + ], + dim=1, + ).to(pred_logits_i.device) + + # step2. select the unmatched slots. + # note that the FP tracks whose obj_idxes are -2 will not be selected here. + unmatched_track_idxes = full_track_idxes[track_instances.obj_idxes == + -1] + + # step3. select the untracked gt instances (new tracks). + tgt_indexes = track_instances.matched_gt_idxes + tgt_indexes = tgt_indexes[tgt_indexes != -1] + + tgt_state = torch.zeros(len(gt_instances_i)).to(pred_logits_i.device) + tgt_state[tgt_indexes] = 1 + # new tgt indexes + untracked_tgt_indexes = torch.arange(len(gt_instances_i)).to( + pred_logits_i.device)[tgt_state == 0] + # untracked_tgt_indexes = select_unmatched_indexes(tgt_indexes, len(gt_instances_i)) + # [num_untracked] + untracked_gt_instances = gt_instances_i[untracked_tgt_indexes] + + def match_for_single_decoder_layer(unmatched_outputs, matcher): + bbox_preds, cls_preds = ( + unmatched_outputs["pred_boxes"], + unmatched_outputs["pred_logits"], + ) + bs, num_querys = bbox_preds.shape[:2] + # Also concat the target labels and boxes + targets = [untracked_gt_instances] + if isinstance(targets[0], Instances): + # [num_box], [num_box, 9] (un-normalized bboxes) + gt_labels = torch.cat( + [gt_per_img.labels for gt_per_img in targets]) + gt_bboxes = torch.cat( + [gt_per_img.boxes for gt_per_img in targets]) + else: + gt_labels = torch.cat([v["labels"] for v in targets]) + gt_bboxes = torch.cat([v["boxes"] for v in targets]) + + bbox_pred = bbox_preds[0] + cls_pred = cls_preds[0] + + src_idx, tgt_idx = matcher.assign(bbox_pred, cls_pred, gt_bboxes, + gt_labels) + if src_idx is None: + return None + # concat src and tgt. 
+ new_matched_indices = torch.stack([ + unmatched_track_idxes[src_idx], untracked_tgt_indexes[tgt_idx] + ], + dim=1).to(pred_logits_i.device) + return new_matched_indices + + # step4. do matching between the unmatched slots and GTs. + unmatched_outputs = { + # [bs, num_pred, num_classes] + "pred_logits": + track_instances.pred_logits[unmatched_track_idxes].unsqueeze(0), + # [bs, num_pred, box_dim] + "pred_boxes": + track_instances.pred_boxes[unmatched_track_idxes].unsqueeze(0), + } + # [num_new_matched, 2] + new_matched_indices = match_for_single_decoder_layer( + unmatched_outputs, self.matcher) + + # step5. update obj_idxes according to the new matching result. + if new_matched_indices is not None: + track_instances.obj_idxes[ + new_matched_indices[:, 0]] = gt_instances_i.obj_ids[ + new_matched_indices[:, 1]].long() + track_instances.matched_gt_idxes[ + new_matched_indices[:, 0]] = new_matched_indices[:, 1] + + # step6. calculate iou3d. + active_idxes = (track_instances.obj_idxes >= + 0) & (track_instances.matched_gt_idxes >= 0) + active_track_boxes = track_instances.pred_boxes[active_idxes] + with torch.no_grad(): + if len(active_track_boxes) > 0: + gt_boxes = gt_instances_i.boxes[ + track_instances.matched_gt_idxes[active_idxes]] + iou_3ds = iou_3d( + denormalize_bbox(gt_boxes, None)[..., :7], + denormalize_bbox(active_track_boxes, None)[..., :7], + ) + track_instances.iou[active_idxes] = torch.tensor([ + iou_3ds[i, i] for i in range(gt_boxes.shape[0]) + ]).to(gt_boxes.device) + + # step7. merge the unmatched pairs and the matched pairs. + # [num_new_macthed + num_prev_mathed, 2] + matched_indices = torch.cat( + [new_matched_indices, prev_matched_indices], dim=0) + else: + matched_indices = prev_matched_indices + # step8. calculate losses. + self.num_samples += len(gt_instances_i) + num_disappear_track + self.sample_device = pred_logits_i.device + + for loss in self.losses: + new_track_loss = self.get_loss( + loss, + outputs=outputs_i, + gt_instances=[gt_instances_i], + indices=[(matched_indices[:, 0], matched_indices[:, 1])], + ) + self.losses_dict.update({ + "frame_{}_{}_{}".format(self._current_frame_idx, key, dec_lvl): + value + for key, value in new_track_loss.items() + }) + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + unmatched_outputs_layer = { + "pred_logits": + aux_outputs["pred_logits"][ + 0, unmatched_track_idxes].unsqueeze(0), + "pred_boxes": + aux_outputs["pred_boxes"][ + 0, unmatched_track_idxes].unsqueeze(0), + } + new_matched_indices_layer = match_for_single_decoder_layer( + unmatched_outputs_layer, self.matcher) + matched_indices_layer = torch.cat( + [new_matched_indices_layer, prev_matched_indices], dim=0) + for loss in self.losses: + if loss == "masks": + # Intermediate masks losses are too costly to compute, we ignore them. + continue + l_dict = self.get_loss( + loss, + aux_outputs, + gt_instances=[gt_instances_i], + indices=[(matched_indices_layer[:, 0], + matched_indices_layer[:, 1])], + ) + self.losses_dict.update({ + "frame_{}_aux{}_{}".format(self._current_frame_idx, i, + key): value + for key, value in l_dict.items() + }) + if if_step: + self._step() + return track_instances, matched_indices + + def forward(self, outputs, input_data: dict): + # losses of each frame are calculated during the model's forwarding and are outputted by the model as outputs['losses_dict]. 
+ losses = outputs.pop("losses_dict") + num_samples = self.get_num_boxes(self.num_samples) + for loss_name, loss in losses.items(): + losses[loss_name] /= num_samples + return losses + + def prediction_loss(self, track_instances, predictions): + + decay_ratio = 1.0 + for i in range(self._current_frame_idx, len(self.gt_instances)): + gt_instances_i = self.gt_instances[ + i] # gt instances of i-th image. + + pred_boxes_i = predictions[i - self._current_frame_idx] + + obj_idxes = gt_instances_i.obj_ids + obj_idxes_list = obj_idxes.detach().cpu().numpy().tolist() + obj_idx_to_gt_idx = { + obj_idx: gt_idx + for gt_idx, obj_idx in enumerate(obj_idxes_list) + } + + num_paired = 0 + for j in range(len(track_instances)): + obj_id = track_instances.obj_idxes[j].item() + # set new target idx. + if obj_id >= 0: + if obj_id in obj_idx_to_gt_idx: + track_instances.matched_gt_idxes[ + j] = obj_idx_to_gt_idx[obj_id] + num_paired += 1 + else: + track_instances.matched_gt_idxes[ + j] = -1 # track-disappear case. + else: + track_instances.matched_gt_idxes[j] = -1 + + if num_paired > 0: + if_paired_i = track_instances.matched_gt_idxes >= 0 + + paired_pred_boxes_i = pred_boxes_i[if_paired_i] + + paired_gt_instances = gt_instances_i[ + track_instances.matched_gt_idxes[if_paired_i]] + normalized_bboxes = paired_gt_instances.boxes + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + gt_boxes_i = torch.cat([cx, cy, cz], dim=-1) + + pred_loss_i = (0.2 * decay_ratio * self.loss_predictions( + paired_pred_boxes_i, gt_boxes_i).sum(dim=-1).mean()) + + self.losses_dict["pred_loss_{}".format(i)] = pred_loss_i + else: + self.losses_dict["pred_loss_{}".format(i)] = torch.tensor( + [0.0]).cuda() + + decay_ratio = decay_ratio * 0.5 \ No newline at end of file diff --git a/mmcv/losses/traj_loss.py b/mmcv/losses/traj_loss.py new file mode 100644 index 0000000..727d291 --- /dev/null +++ b/mmcv/losses/traj_loss.py @@ -0,0 +1,233 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import math +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple + +from mmcv.models import LOSSES + +@LOSSES.register_module() +class TrajLoss(nn.Module): + """ + MTP loss modified to include variances. Uses MSE for mode selection. + Can also be used with + Multipath outputs, with residuals added to anchors. 
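+
+ Expected inputs (as consumed by forward):
+ traj_prob: mode log-probabilities, shape (batch, num_modes)
+ traj_preds: predicted trajectories, shape (batch, num_modes, seq, 5);
+ only the first two channels (x, y) are used unless use_variance is
+ True, in which case the remaining channels are interpreted as
+ (sig_x, sig_y, rho) of a bivariate Gaussian at each timestep.
+ gt_future_traj: ground-truth trajectory, shape (batch, seq, 2)
+ gt_future_traj_valid_mask: validity mask, shape (batch, seq)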
+ """ + + def __init__(self, use_variance=False, cls_loss_weight=1., nll_loss_weight=1., loss_weight_minade=0., loss_weight_minfde=1., loss_weight_mr=1.): + """ + Initialize MTP loss + :param args: Dictionary with the following (optional) keys + use_variance: bool, whether or not to use variances for computing + regression component of loss, + default: False + alpha: float, relative weight assigned to classification component, + compared to regression component + of loss, default: 1 + """ + super(TrajLoss, self).__init__() + self.use_variance = use_variance + self.cls_loss_weight = cls_loss_weight + self.nll_loss_weight = nll_loss_weight + self.loss_weight_minade = loss_weight_minade + self.loss_weight_minfde = loss_weight_minfde + + def forward(self, + traj_prob, + traj_preds, + gt_future_traj, + gt_future_traj_valid_mask): + """ + Compute MTP loss + :param predictions: Dictionary with 'traj': predicted trajectories + and 'probs': mode (log) probabilities + :param ground_truth: Either a tensor with ground truth trajectories + or a dictionary + :return: + """ + # Unpack arguments + traj = traj_preds # (b, nmodes, seq, 5) + log_probs = traj_prob + traj_gt = gt_future_traj + + # Useful variables + batch_size = traj.shape[0] + sequence_length = traj.shape[2] + pred_params = 5 if self.use_variance else 2 + + # Masks for variable length ground truth trajectories + masks = 1 - gt_future_traj_valid_mask.to(traj.dtype) + + l_minfde, inds = min_fde(traj, traj_gt, masks) + try: + l_mr = miss_rate(traj, traj_gt, masks) + except: + l_mr = torch.zeros_like(l_minfde) + l_minade, inds = min_ade(traj, traj_gt, masks) + inds_rep = inds.repeat( + sequence_length, + pred_params, 1, 1).permute(3, 2, 0, 1) + + # Calculate MSE or NLL loss for trajectories corresponding to selected + # outputs: + traj_best = traj.gather(1, inds_rep).squeeze(dim=1) + + if self.use_variance: + l_reg = traj_nll(traj_best, traj_gt, masks) + else: + l_reg = l_minade + + # Compute classification loss + l_class = - torch.squeeze(log_probs.gather(1, inds.unsqueeze(1))) + + l_reg = torch.sum(l_reg)/(batch_size + 1e-5) + l_class = torch.sum(l_class)/(batch_size + 1e-5) + l_minade = torch.sum(l_minade)/(batch_size + 1e-5) + l_minfde = torch.sum(l_minfde)/(batch_size + 1e-5) + + loss = l_class * self.cls_loss_weight + l_reg * self.nll_loss_weight + l_minade * self.loss_weight_minade + l_minfde * self.loss_weight_minfde + return loss, l_class, l_reg, l_minade, l_minfde, l_mr + +def min_ade(traj: torch.Tensor, traj_gt: torch.Tensor, + masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Computes average displacement error for the best trajectory is a set, + with respect to ground truth + :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] + :param traj_gt: ground truth trajectory, shape + [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, shape + [batch_size, sequence_length] + :return errs, inds: errors and indices for modes with min error, shape + [batch_size] + """ + num_modes = traj.shape[1] + traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) + masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) + err = traj_gt_rpt - traj[:, :, :, 0:2] + err = torch.pow(err, exponent=2) + err = torch.sum(err, dim=3) + err = torch.pow(err, exponent=0.5) + err = torch.sum(err * (1 - masks_rpt), dim=2) / \ + torch.clip(torch.sum((1 - masks_rpt), dim=2), min=1) + err, inds = torch.min(err, dim=1) + + return err, inds + +def traj_nll( + pred_dist: torch.Tensor, + traj_gt: 
torch.Tensor, + masks: torch.Tensor): + """ + Computes negative log likelihood of ground truth trajectory under a + predictive distribution with a single mode, + with a bivariate Gaussian distribution predicted at each time in the + prediction horizon + + :param pred_dist: parameters of a bivariate Gaussian distribution, + shape [batch_size, sequence_length, 5] + :param traj_gt: ground truth trajectory, + shape [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, + shape [batch_size, sequence_length] + :return: + """ + mu_x = pred_dist[:, :, 0] + mu_y = pred_dist[:, :, 1] + x = traj_gt[:, :, 0] + y = traj_gt[:, :, 1] + + sig_x = pred_dist[:, :, 2] + sig_y = pred_dist[:, :, 3] + rho = pred_dist[:, :, 4] + ohr = torch.pow(1 - torch.pow(rho, 2), -0.5) + + nll = 0.5 * torch.pow(ohr, 2) * \ + (torch.pow(sig_x, 2) * torch.pow(x - mu_x, 2) + torch.pow(sig_y, 2) * + torch.pow(y - mu_y, 2) - 2 * rho * torch.pow(sig_x, 1) * + torch.pow(sig_y, 1) * (x - mu_x) * (y - mu_y)) - \ + torch.log(sig_x * sig_y * ohr) + 1.8379 + + nll[nll.isnan()] = 0 + nll[nll.isinf()] = 0 + + nll = torch.sum(nll * (1 - masks), dim=1) / (torch.sum((1 - masks), dim=1) + 1e-5) + # Note: Normalizing with torch.sum((1 - masks), dim=1) makes values + # somewhat comparable for trajectories of + # different lengths + + return nll + +def min_fde(traj: torch.Tensor, traj_gt: torch.Tensor, + masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Computes final displacement error for the best trajectory is a set, + with respect to ground truth + :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] + :param traj_gt: ground truth trajectory, shape + [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, shape + [batch_size, sequence_length] + :return errs, inds: errors and indices for modes with min error, + shape [batch_size] + """ + num_modes = traj.shape[1] + lengths = torch.sum(1 - masks, dim=1).long() + valid_mask = lengths > 0 + traj = traj[valid_mask] + traj_gt = traj_gt[valid_mask] + masks = masks[valid_mask] + traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) + lengths = torch.sum(1 - masks, dim=1).long() + inds = lengths.unsqueeze(1).unsqueeze( + 2).unsqueeze(3).repeat(1, num_modes, 1, 2) - 1 + + traj_last = torch.gather(traj[..., :2], dim=2, index=inds).squeeze(2) + traj_gt_last = torch.gather(traj_gt_rpt, dim=2, index=inds).squeeze(2) + + err = traj_gt_last - traj_last[..., 0:2] + err = torch.pow(err, exponent=2) + err = torch.sum(err, dim=2) + err = torch.pow(err, exponent=0.5) + err, inds = torch.min(err, dim=1) + + return err, inds + + +def miss_rate( + traj: torch.Tensor, + traj_gt: torch.Tensor, + masks: torch.Tensor, + dist_thresh: float = 2) -> torch.Tensor: + """ + Computes miss rate for mini batch of trajectories, + with respect to ground truth and given distance threshold + :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] + :param traj_gt: ground truth trajectory, + shape [batch_size, sequence_length, 2] + :param masks: masks for varying length ground truth, + shape [batch_size, sequence_length] + :param dist_thresh: distance threshold for computing miss rate. 
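+ A sample counts as a miss when, for every mode, the maximum displacement
+ over its valid timesteps exceeds dist_thresh.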
+ :return errs, inds: errors and indices for modes with min error, + shape [batch_size] + """ + num_modes = traj.shape[1] + + traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) + masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) + dist = traj_gt_rpt - traj[:, :, :, 0:2] + dist = torch.pow(dist, exponent=2) + dist = torch.sum(dist, dim=3) + dist = torch.pow(dist, exponent=0.5) + dist[masks_rpt.bool()] = -math.inf + dist, _ = torch.max(dist, dim=2) + dist, _ = torch.min(dist, dim=1) + m_r = torch.sum(torch.as_tensor(dist > dist_thresh)) / len(dist) + + return m_r diff --git a/mmcv/metrics/classification.py b/mmcv/metrics/classification.py new file mode 100644 index 0000000..9818c2a --- /dev/null +++ b/mmcv/metrics/classification.py @@ -0,0 +1,178 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import wraps +from typing import Callable, Optional, Sequence, Tuple + +import torch +from .utils import get_num_classes as __gnc +from .utils import to_categorical as __tc +from .distributed import rank_zero_warn + + +def to_categorical(tensor: torch.Tensor, argmax_dim: int = 1) -> torch.Tensor: + """ + Converts a tensor of probabilities to a dense label tensor + + .. warning :: Deprecated in favor of :func:`~mmcv.pytorch_lightning.metrics.utils.to_categorical` + + """ + rank_zero_warn( + "This `to_categorical` was deprecated in v1.1.0 in favor of" + " `from mmcv.pytorch_lightning.metrics.utils import to_categorical`." + " It will be removed in v1.3.0", DeprecationWarning + ) + return __tc(tensor) + + +def get_num_classes( + pred: torch.Tensor, + target: torch.Tensor, + num_classes: Optional[int] = None, +) -> int: + """ + Calculates the number of classes for a given prediction and target tensor. + + .. warning :: Deprecated in favor of :func:`~mmcv.pytorch_lightning.metrics.utils.get_num_classes` + + """ + rank_zero_warn( + "This `get_num_classes` was deprecated in v1.1.0 in favor of" + " `from mmcv.pytorch_lightning.metrics.utils import get_num_classes`." 
+ " It will be removed in v1.3.0", DeprecationWarning + ) + return __gnc(pred, target, num_classes) + + +def stat_scores( + pred: torch.Tensor, + target: torch.Tensor, + class_index: int, + argmax_dim: int = 1, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Calculates the number of true positive, false positive, true negative + and false negative for a specific class + + Args: + pred: prediction tensor + target: target tensor + class_index: class to calculate over + argmax_dim: if pred is a tensor of probabilities, this indicates the + axis the argmax transformation will be applied over + + Return: + True Positive, False Positive, True Negative, False Negative, Support + + Example: + + >>> x = torch.tensor([1, 2, 3]) + >>> y = torch.tensor([0, 2, 3]) + >>> tp, fp, tn, fn, sup = stat_scores(x, y, class_index=1) + >>> tp, fp, tn, fn, sup + (tensor(0), tensor(1), tensor(2), tensor(0), tensor(0)) + + """ + if pred.ndim == target.ndim + 1: + pred = to_categorical(pred, argmax_dim=argmax_dim) + + tp = ((pred == class_index) * (target == class_index)).to(torch.long).sum() + fp = ((pred == class_index) * (target != class_index)).to(torch.long).sum() + tn = ((pred != class_index) * (target != class_index)).to(torch.long).sum() + fn = ((pred != class_index) * (target == class_index)).to(torch.long).sum() + sup = (target == class_index).to(torch.long).sum() + + return tp, fp, tn, fn, sup + + +# todo: remove in 1.4 +def stat_scores_multiple_classes( + pred: torch.Tensor, + target: torch.Tensor, + num_classes: Optional[int] = None, + argmax_dim: int = 1, + reduction: str = 'none', +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Calculates the number of true positive, false positive, true negative + and false negative for each class + + .. warning :: Deprecated in favor of :func:`~mmcv.pytorch_lightning.metrics.functional.stat_scores` + + Raises: + ValueError: + If ``reduction`` is not one of ``"none"``, ``"sum"`` or ``"elementwise_mean"``. + """ + + rank_zero_warn( + "This `stat_scores_multiple_classes` was deprecated in v1.2.0 in favor of" + " `from mmcv.pytorch_lightning.metrics.functional import stat_scores`." 
+ " It will be removed in v1.4.0", DeprecationWarning + ) + if pred.ndim == target.ndim + 1: + pred = to_categorical(pred, argmax_dim=argmax_dim) + + num_classes = get_num_classes(pred=pred, target=target, num_classes=num_classes) + + if pred.dtype != torch.bool: + pred = pred.clamp_max(max=num_classes) + if target.dtype != torch.bool: + target = target.clamp_max(max=num_classes) + + possible_reductions = ('none', 'sum', 'elementwise_mean') + if reduction not in possible_reductions: + raise ValueError("reduction type %s not supported" % reduction) + + if reduction == 'none': + pred = pred.view((-1, )).long() + target = target.view((-1, )).long() + + tps = torch.zeros((num_classes + 1, ), device=pred.device) + fps = torch.zeros((num_classes + 1, ), device=pred.device) + fns = torch.zeros((num_classes + 1, ), device=pred.device) + sups = torch.zeros((num_classes + 1, ), device=pred.device) + + match_true = (pred == target).float() + match_false = 1 - match_true + + tps.scatter_add_(0, pred, match_true) + fps.scatter_add_(0, pred, match_false) + fns.scatter_add_(0, target, match_false) + tns = pred.size(0) - (tps + fps + fns) + sups.scatter_add_(0, target, torch.ones_like(match_true)) + + tps = tps[:num_classes] + fps = fps[:num_classes] + tns = tns[:num_classes] + fns = fns[:num_classes] + sups = sups[:num_classes] + + elif reduction == 'sum' or reduction == 'elementwise_mean': + count_match_true = (pred == target).sum().float() + oob_tp, oob_fp, oob_tn, oob_fn, oob_sup = stat_scores(pred, target, num_classes, argmax_dim) + + tps = count_match_true - oob_tp + fps = pred.nelement() - count_match_true - oob_fp + fns = pred.nelement() - count_match_true - oob_fn + tns = pred.nelement() * (num_classes + 1) - (tps + fps + fns + oob_tn) + sups = pred.nelement() - oob_sup.float() + + if reduction == 'elementwise_mean': + tps /= num_classes + fps /= num_classes + fns /= num_classes + tns /= num_classes + sups /= num_classes + + return tps.float(), fps.float(), tns.float(), fns.float(), sups.float() + diff --git a/mmcv/metrics/compositional.py b/mmcv/metrics/compositional.py new file mode 100644 index 0000000..124e6a5 --- /dev/null +++ b/mmcv/metrics/compositional.py @@ -0,0 +1,40 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, Union + +import torch +from torchmetrics.metric import CompositionalMetric as _CompositionalMetric + +from .metric import Metric +from .distributed import rank_zero_warn + + +class CompositionalMetric(_CompositionalMetric): + r""" + This implementation refers to :class:`~torchmetrics.metric.CompositionalMetric`. + + .. warning:: This metric is deprecated, use ``torchmetrics.metric.CompositionalMetric``. Will be removed in v1.5.0. + """ + + def __init__( + self, + operator: Callable, + metric_a: Union[Metric, int, float, torch.Tensor], + metric_b: Union[Metric, int, float, torch.Tensor, None], + ): + rank_zero_warn( + "This `Metric` was deprecated since v1.3.0 in favor of `torchmetrics.Metric`." 
+ " It will be removed in v1.5.0", DeprecationWarning + ) + super().__init__(operator=operator, metric_a=metric_a, metric_b=metric_b) diff --git a/mmcv/metrics/distributed.py b/mmcv/metrics/distributed.py new file mode 100644 index 0000000..9e47af2 --- /dev/null +++ b/mmcv/metrics/distributed.py @@ -0,0 +1,214 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import warnings +from functools import wraps +from typing import Any, Optional, Union + +import torch + +log = logging.getLogger(__name__) + +if torch.distributed.is_available(): + from torch.distributed import group, ReduceOp + +else: + + class ReduceOp: + SUM = None + + class group: + WORLD = None + + +def rank_zero_only(fn): + + @wraps(fn) + def wrapped_fn(*args, **kwargs): + if rank_zero_only.rank == 0: + return fn(*args, **kwargs) + + return wrapped_fn + + +# add the attribute to the function but don't overwrite in case Trainer has already set it +rank_zero_only.rank = getattr(rank_zero_only, 'rank', int(os.environ.get('LOCAL_RANK', 0))) + + +def _warn(*args, **kwargs): + warnings.warn(*args, **kwargs) + + +def _info(*args, **kwargs): + log.info(*args, **kwargs) + + +def _debug(*args, **kwargs): + log.debug(*args, **kwargs) + + +rank_zero_debug = rank_zero_only(_debug) +rank_zero_info = rank_zero_only(_info) +rank_zero_warn = rank_zero_only(_warn) + + +def find_free_network_port() -> int: + """ + Finds a free port on localhost. + It is useful in single-node training when we don't want to connect to a real master node but + have to set the `MASTER_PORT` environment variable. + """ + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port + + +def gather_all_tensors(result: Union[torch.Tensor], group: Optional[Any] = None): + """ + Function to gather all tensors from several ddp processes onto a list that + is broadcasted to all processes + + Args: + result: the value to sync + group: the process group to gather results from. 
Defaults to all processes (world) + + Return: + gathered_result: list with size equal to the process group where + gathered_result[i] corresponds to result tensor from process i + """ + if group is None: + group = torch.distributed.group.WORLD + + # convert tensors to contiguous format + result = result.contiguous() + + world_size = torch.distributed.get_world_size(group) + + gathered_result = [torch.zeros_like(result) for _ in range(world_size)] + + # sync and broadcast all + torch.distributed.barrier(group=group) + torch.distributed.all_gather(gathered_result, result, group) + + return gathered_result + + +def sync_ddp_if_available( + result: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None +) -> torch.Tensor: + """ + Function to reduce a tensor across worker processes during distributed training + Args: + result: the value to sync and reduce (typically tensor or number) + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + return sync_ddp(result, group=group, reduce_op=reduce_op) + return result + + +def sync_ddp( + result: Union[torch.Tensor], + group: Optional[Any] = None, + reduce_op: Optional[Union[ReduceOp, str]] = None +) -> torch.Tensor: + """ + Function to reduce the tensors from several ddp processes to one master process + + Args: + result: the value to sync and reduce (typically tensor or number) + group: the process group to gather results from. Defaults to all processes (world) + reduce_op: the reduction operation. Defaults to sum. + Can also be a string of 'avg', 'mean' to calculate the mean during reduction. + + Return: + reduced value + """ + divide_by_world_size = False + + if group is None: + group = torch.distributed.group.WORLD + + op = reduce_op if isinstance(reduce_op, ReduceOp) else ReduceOp.SUM + + if isinstance(reduce_op, str) and reduce_op.lower() in ("avg", "mean"): + divide_by_world_size = True + + # sync all processes before reduction + torch.distributed.barrier(group=group) + torch.distributed.all_reduce(result, op=op, group=group, async_op=False) + + if divide_by_world_size: + result = result / torch.distributed.get_world_size(group) + + return result + + +class AllGatherGrad(torch.autograd.Function): + + @staticmethod + def forward(ctx, tensor, group=group.WORLD): + ctx.group = group + + gathered_tensor = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())] + + torch.distributed.all_gather(gathered_tensor, tensor, group=group) + gathered_tensor = torch.stack(gathered_tensor, dim=0) + + return gathered_tensor + + @staticmethod + def backward(ctx, *grad_output): + grad_output = torch.cat(grad_output) + + torch.distributed.all_reduce(grad_output, op=torch.distributed.ReduceOp.SUM, async_op=False, group=ctx.group) + + return grad_output[torch.distributed.get_rank()] + + +def all_gather_ddp_if_available( + tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False +) -> torch.Tensor: + """ + Function to gather a tensor from several distributed processes + + Args: + tensor: tensor of shape (batch, ...) + group: the process group to gather results from. 
Defaults to all processes (world) + sync_grads: flag that allows users to synchronize gradients for all_gather op + + Return: + A tensor of shape (world_size, batch, ...) + """ + group = group if group is not None else torch.distributed.group.WORLD + if torch.distributed.is_available() and torch.distributed.is_initialized(): + if sync_grads: + return AllGatherGrad.apply(tensor, group) + else: + with torch.no_grad(): + return AllGatherGrad.apply(tensor, group) + return tensor diff --git a/mmcv/metrics/metric.py b/mmcv/metrics/metric.py new file mode 100644 index 0000000..c306504 --- /dev/null +++ b/mmcv/metrics/metric.py @@ -0,0 +1,199 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from torchmetrics import Metric as _Metric +from torchmetrics import MetricCollection as _MetricCollection + +from mmcv.metrics.distributed import rank_zero_warn + + +class Metric(_Metric): + r""" + This implementation refers to :class:`~torchmetrics.Metric`. + + .. warning:: This metric is deprecated, use ``torchmetrics.Metric``. Will be removed in v1.5.0. + """ + + def __init__( + self, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + dist_sync_fn: Callable = None, + ): + rank_zero_warn( + "This `Metric` was deprecated since v1.3.0 in favor of `torchmetrics.Metric`." 
+ " It will be removed in v1.5.0", DeprecationWarning + ) + super().__init__( + dist_sync_on_step=dist_sync_on_step, + process_group=process_group, + dist_sync_fn=dist_sync_fn, + ) + + def __hash__(self): + return super().__hash__() + + def __add__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.add, self, other) + + def __and__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.bitwise_and, self, other) + + def __eq__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.eq, self, other) + + def __floordiv__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.floor_divide, self, other) + + def __ge__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.ge, self, other) + + def __gt__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.gt, self, other) + + def __le__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.le, self, other) + + def __lt__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.lt, self, other) + + def __matmul__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.matmul, self, other) + + def __mod__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.fmod, self, other) + + def __mul__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.mul, self, other) + + def __ne__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.ne, self, other) + + def __or__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.bitwise_or, self, other) + + def __pow__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.pow, self, other) + + def __radd__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.add, other, self) + + def __rand__(self, other: Any): + from .compositional import CompositionalMetric + + # swap them since bitwise_and only supports that way and it's commutative + return CompositionalMetric(torch.bitwise_and, self, other) + + def __rfloordiv__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.floor_divide, other, self) + + def __rmatmul__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.matmul, other, self) + + def __rmod__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.fmod, other, self) + + def __rmul__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.mul, other, self) + + def __ror__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.bitwise_or, other, self) + + def __rpow__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.pow, other, self) + + def __rsub__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.sub, other, self) + + def 
__rtruediv__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.true_divide, other, self) + + def __rxor__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.bitwise_xor, other, self) + + def __sub__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.sub, self, other) + + def __truediv__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.true_divide, self, other) + + def __xor__(self, other: Any): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.bitwise_xor, self, other) + + def __abs__(self): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.abs, self, None) + + def __inv__(self): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.bitwise_not, self, None) + + def __invert__(self): + return self.__inv__() + + def __neg__(self): + from .compositional import CompositionalMetric + return CompositionalMetric(_neg, self, None) + + def __pos__(self): + from .compositional import CompositionalMetric + return CompositionalMetric(torch.abs, self, None) + + +def _neg(tensor: torch.Tensor): + return -torch.abs(tensor) + + +class MetricCollection(_MetricCollection): + r""" + This implementation refers to :class:`~torchmetrics.MetricCollection`. + + .. warning:: This metric is deprecated, use ``torchmetrics.MetricCollection``. Will be removed in v1.5.0. + """ + + def __init__(self, metrics: Union[List[Metric], Tuple[Metric], Dict[str, Metric]]): + rank_zero_warn( + "This `MetricCollection` was deprecated since v1.3.0 in favor of `torchmetrics.MetricCollection`." + " It will be removed in v1.5.0", DeprecationWarning + ) + super().__init__(metrics=metrics) diff --git a/mmcv/metrics/reduction.py b/mmcv/metrics/reduction.py new file mode 100644 index 0000000..f0ab4c2 --- /dev/null +++ b/mmcv/metrics/reduction.py @@ -0,0 +1,26 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from .utils import reduce as __reduce +from .distributed import rank_zero_warn + +def reduce(to_reduce: torch.Tensor, reduction: str) -> torch.Tensor: + rank_zero_warn( + "This `reduce` was deprecated in v1.1.0 in favor of" + " `mmcv.pytorch_lightning.metrics.utils import reduce`." + " It will be removed in v1.3.0", DeprecationWarning + ) + return __reduce(to_reduce=to_reduce, reduction=reduction) + diff --git a/mmcv/metrics/utils.py b/mmcv/metrics/utils.py new file mode 100644 index 0000000..7d1bf40 --- /dev/null +++ b/mmcv/metrics/utils.py @@ -0,0 +1,292 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Tuple + +import torch + +from .distributed import rank_zero_warn + +METRIC_EPS = 1e-6 + + +def dim_zero_cat(x): + x = x if isinstance(x, (list, tuple)) else [x] + return torch.cat(x, dim=0) + + +def dim_zero_sum(x): + return torch.sum(x, dim=0) + + +def dim_zero_mean(x): + return torch.mean(x, dim=0) + + +def _flatten(x): + return [item for sublist in x for item in sublist] + + +def _check_same_shape(pred: torch.Tensor, target: torch.Tensor): + """ Check that predictions and target have the same shape, else raise error """ + if pred.shape != target.shape: + raise RuntimeError("Predictions and targets are expected to have the same shape") + + +def _input_format_classification_one_hot( + num_classes: int, + preds: torch.Tensor, + target: torch.Tensor, + threshold: float = 0.5, + multilabel: bool = False +) -> Tuple[torch.Tensor, torch.Tensor]: + """Convert preds and target tensors into one hot spare label tensors + + Args: + num_classes: number of classes + preds: either tensor with labels, tensor with probabilities/logits or + multilabel tensor + target: tensor with ground true labels + threshold: float used for thresholding multilabel input + multilabel: boolean flag indicating if input is multilabel + + Returns: + preds: one hot tensor of shape [num_classes, -1] with predicted labels + target: one hot tensors of shape [num_classes, -1] with true labels + """ + if not (preds.ndim == target.ndim or preds.ndim == target.ndim + 1): + raise ValueError("preds and target must have same number of dimensions, or one additional dimension for preds") + + if preds.ndim == target.ndim + 1: + # multi class probabilites + preds = torch.argmax(preds, dim=1) + + if preds.ndim == target.ndim and preds.dtype in (torch.long, torch.int) and num_classes > 1 and not multilabel: + # multi-class + preds = to_onehot(preds, num_classes=num_classes) + target = to_onehot(target, num_classes=num_classes) + + elif preds.ndim == target.ndim and preds.is_floating_point(): + # binary or multilabel probablities + preds = (preds >= threshold).long() + + # transpose class as first dim and reshape + if preds.ndim > 1: + preds = preds.transpose(1, 0) + target = target.transpose(1, 0) + + return preds.reshape(num_classes, -1), target.reshape(num_classes, -1) + + +def to_onehot( + label_tensor: torch.Tensor, + num_classes: Optional[int] = None, +) -> torch.Tensor: + """ + Converts a dense label tensor to one-hot format + + Args: + label_tensor: dense label tensor, with shape [N, d1, d2, ...] + num_classes: number of classes C + + Output: + A sparse label tensor with shape [N, C, d1, d2, ...] 
+ + Example: + + >>> x = torch.tensor([1, 2, 3]) + >>> to_onehot(x) + tensor([[0, 1, 0, 0], + [0, 0, 1, 0], + [0, 0, 0, 1]]) + + """ + if num_classes is None: + num_classes = int(label_tensor.max().detach().item() + 1) + + tensor_onehot = torch.zeros( + label_tensor.shape[0], + num_classes, + *label_tensor.shape[1:], + dtype=label_tensor.dtype, + device=label_tensor.device, + ) + index = label_tensor.long().unsqueeze(1).expand_as(tensor_onehot) + return tensor_onehot.scatter_(1, index, 1.0) + + +def select_topk(prob_tensor: torch.Tensor, topk: int = 1, dim: int = 1) -> torch.Tensor: + """ + Convert a probability tensor to binary by selecting top-k highest entries. + + Args: + prob_tensor: dense tensor of shape ``[..., C, ...]``, where ``C`` is in the + position defined by the ``dim`` argument + topk: number of highest entries to turn into 1s + dim: dimension on which to compare entries + + Output: + A binary tensor of the same shape as the input tensor of type torch.int32 + + Example: + >>> x = torch.tensor([[1.1, 2.0, 3.0], [2.0, 1.0, 0.5]]) + >>> select_topk(x, topk=2) + tensor([[0, 1, 1], + [1, 1, 0]], dtype=torch.int32) + """ + zeros = torch.zeros_like(prob_tensor) + topk_tensor = zeros.scatter(dim, prob_tensor.topk(k=topk, dim=dim).indices, 1.0) + return topk_tensor.int() + + +def to_categorical(tensor: torch.Tensor, argmax_dim: int = 1) -> torch.Tensor: + """ + Converts a tensor of probabilities to a dense label tensor + + Args: + tensor: probabilities to get the categorical label [N, d1, d2, ...] + argmax_dim: dimension to apply + + Return: + A tensor with categorical labels [N, d2, ...] + + Example: + + >>> x = torch.tensor([[0.2, 0.5], [0.9, 0.1]]) + >>> to_categorical(x) + tensor([1, 0]) + + """ + return torch.argmax(tensor, dim=argmax_dim) + + +def get_num_classes( + pred: torch.Tensor, + target: torch.Tensor, + num_classes: Optional[int] = None, +) -> int: + """ + Calculates the number of classes for a given prediction and target tensor. + + Args: + pred: predicted values + target: true labels + num_classes: number of classes if known + + Return: + An integer that represents the number of classes. + """ + num_target_classes = int(target.max().detach().item() + 1) + num_pred_classes = int(pred.max().detach().item() + 1) + num_all_classes = max(num_target_classes, num_pred_classes) + + if num_classes is None: + num_classes = num_all_classes + elif num_classes != num_all_classes: + rank_zero_warn( + f"You have set {num_classes} number of classes which is" + f" different from predicted ({num_pred_classes}) and" + f" target ({num_target_classes}) number of classes", + RuntimeWarning, + ) + return num_classes + + +def reduce(to_reduce: torch.Tensor, reduction: str) -> torch.Tensor: + """ + Reduces a given tensor by a given reduction method + + Args: + to_reduce : the tensor, which shall be reduced + reduction : a string specifying the reduction method ('elementwise_mean', 'none', 'sum') + + Return: + reduced Tensor + + Raise: + ValueError if an invalid reduction parameter was given + """ + if reduction == "elementwise_mean": + return torch.mean(to_reduce) + if reduction == "none": + return to_reduce + if reduction == "sum": + return torch.sum(to_reduce) + raise ValueError("Reduction parameter unknown.") + + +def class_reduce( + num: torch.Tensor, denom: torch.Tensor, weights: torch.Tensor, class_reduction: str = "none" +) -> torch.Tensor: + """ + Function used to reduce classification metrics of the form `num / denom * weights`. 
+ For example for calculating standard accuracy the num would be number of + true positives per class, denom would be the support per class, and weights + would be a tensor of 1s + + Args: + num: numerator tensor + denom: denominator tensor + weights: weights for each class + class_reduction: reduction method for multiclass problems + + - ``'micro'``: calculate metrics globally (default) + - ``'macro'``: calculate metrics for each label, and find their unweighted mean. + - ``'weighted'``: calculate metrics for each label, and find their weighted mean. + - ``'none'`` or ``None``: returns calculated metric per class + + Raises: + ValueError: + If ``class_reduction`` is none of ``"micro"``, ``"macro"``, ``"weighted"``, ``"none"`` or ``None``. + """ + valid_reduction = ("micro", "macro", "weighted", "none", None) + if class_reduction == "micro": + fraction = torch.sum(num) / torch.sum(denom) + else: + fraction = num / denom + + # We need to take care of instances where the denom can be 0 + # for some (or all) classes which will produce nans + fraction[fraction != fraction] = 0 + + if class_reduction == "micro": + return fraction + elif class_reduction == "macro": + return torch.mean(fraction) + elif class_reduction == "weighted": + return torch.sum(fraction * (weights.float() / torch.sum(weights))) + elif class_reduction == "none" or class_reduction is None: + return fraction + + raise ValueError( + f"Reduction parameter {class_reduction} unknown." + f" Choose between one of these: {valid_reduction}" + ) + + +def _stable_1d_sort(x: torch, N: int = 2049): + """ + Stable sort of 1d tensors. Pytorch defaults to a stable sorting algorithm + if number of elements are larger than 2048. This function pads the tensors, + makes the sort and returns the sorted array (with the padding removed) + See this discussion: https://discuss.pytorch.org/t/is-torch-sort-stable/20714 + """ + if x.ndim > 1: + raise ValueError('Stable sort only works on 1d tensors') + n = x.numel() + if N - n > 0: + x_max = x.max() + x = torch.cat([x, (x_max + 1) * torch.ones(N - n, dtype=x.dtype, device=x.device)], 0) + x_sort = x.sort() + i = min(N, n) + return x_sort.values[:i], x_sort.indices[:i] diff --git a/mmcv/modeling/postprocessing.py b/mmcv/modeling/postprocessing.py new file mode 100644 index 0000000..b893d37 --- /dev/null +++ b/mmcv/modeling/postprocessing.py @@ -0,0 +1,100 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import torch +from torch.nn import functional as F + +from mmcv.structures import Instances, ROIMasks + + +# perhaps should rename to "resize_instance" +def detector_postprocess( + results: Instances, output_height: int, output_width: int, mask_threshold: float = 0.5 +): + """ + Resize the output instances. + The input images are often resized when entering an object detector. + As a result, we often need the outputs of the detector in a different + resolution from its inputs. + + This function will resize the raw outputs of an R-CNN detector + to produce outputs according to the desired output resolution. + + Args: + results (Instances): the raw outputs from the detector. + `results.image_size` contains the input image resolution the detector sees. + This object might be modified in-place. + output_height, output_width: the desired output resolution. + Returns: + Instances: the resized output from the model, based on the output resolution + """ + if isinstance(output_width, torch.Tensor): + # This shape might (but not necessarily) be tensors during tracing. 
+ # Converts integer tensors to float temporaries to ensure true + # division is performed when computing scale_x and scale_y. + output_width_tmp = output_width.float() + output_height_tmp = output_height.float() + new_size = torch.stack([output_height, output_width]) + else: + new_size = (output_height, output_width) + output_width_tmp = output_width + output_height_tmp = output_height + + scale_x, scale_y = ( + output_width_tmp / results.image_size[1], + output_height_tmp / results.image_size[0], + ) + results = Instances(new_size, **results.get_fields()) + + if results.has("pred_boxes"): + output_boxes = results.pred_boxes + elif results.has("proposal_boxes"): + output_boxes = results.proposal_boxes + else: + output_boxes = None + assert output_boxes is not None, "Predictions must contain boxes!" + + output_boxes.scale(scale_x, scale_y) + output_boxes.clip(results.image_size) + + results = results[output_boxes.nonempty()] + + if results.has("pred_masks"): + if isinstance(results.pred_masks, ROIMasks): + roi_masks = results.pred_masks + else: + # pred_masks is a tensor of shape (N, 1, M, M) + roi_masks = ROIMasks(results.pred_masks[:, 0, :, :]) + results.pred_masks = roi_masks.to_bitmasks( + results.pred_boxes, output_height, output_width, mask_threshold + ).tensor # TODO return ROIMasks/BitMask object in the future + + if results.has("pred_keypoints"): + results.pred_keypoints[:, :, 0] *= scale_x + results.pred_keypoints[:, :, 1] *= scale_y + + return results + + +def sem_seg_postprocess(result, img_size, output_height, output_width): + """ + Return semantic segmentation predictions in the original resolution. + + The input images are often resized when entering semantic segmentor. Moreover, in same + cases, they also padded inside segmentor to be divisible by maximum network stride. + As a result, we often need the predictions of the segmentor in a different + resolution from its inputs. + + Args: + result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W), + where C is the number of classes, and H, W are the height and width of the prediction. + img_size (tuple): image size that segmentor is taking as input. + output_height, output_width: the desired output resolution. + + Returns: + semantic segmentation prediction (Tensor): A tensor of the shape + (C, output_height, output_width) that contains per-pixel soft predictions. 
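+ The logits are first cropped to the un-padded img_size region and then
+ bilinearly resized to (output_height, output_width).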
+ """ + result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1) + result = F.interpolate( + result, size=(output_height, output_width), mode="bilinear", align_corners=False + )[0] + return result \ No newline at end of file diff --git a/mmcv/models/__init__.py b/mmcv/models/__init__.py new file mode 100644 index 0000000..b493852 --- /dev/null +++ b/mmcv/models/__init__.py @@ -0,0 +1,14 @@ +from .backbones import * # noqa: F401,F403 +from .builder import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS, + ROI_EXTRACTORS, SHARED_HEADS, FUSION_LAYERS, + MIDDLE_ENCODERS, VOXEL_ENCODERS, SEGMENTORS, + build_backbone, build_detector, build_fusion_layer, + build_head, build_loss, build_middle_encoder, + build_model, build_neck, build_roi_extractor, + build_shared_head, build_voxel_encoder, build_segmentor) +from .dense_heads import * # noqa: F401,F403 +from .detectors import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .bricks import * +from .utils import * \ No newline at end of file diff --git a/mmcv/models/backbones/__init__.py b/mmcv/models/backbones/__init__.py new file mode 100644 index 0000000..22bb120 --- /dev/null +++ b/mmcv/models/backbones/__init__.py @@ -0,0 +1,3 @@ +from .resnet import ResNet, ResNetV1d +from .vgg import VGG +from .base_module import BaseModule, ModuleList, Sequential diff --git a/mmcv/models/backbones/base_module.py b/mmcv/models/backbones/base_module.py new file mode 100644 index 0000000..94a8d04 --- /dev/null +++ b/mmcv/models/backbones/base_module.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from abc import ABCMeta +from collections import defaultdict +from logging import FileHandler + +import torch.nn as nn + +from mmcv.utils import master_only +from mmcv.utils.logging import get_logger, logger_initialized, print_log + + +class BaseModule(nn.Module, metaclass=ABCMeta): + """Base module for all modules in openmmlab. + + ``BaseModule`` is a wrapper of ``torch.nn.Module`` with additional + functionality of parameter initialization. Compared with + ``torch.nn.Module``, ``BaseModule`` mainly adds three attributes. + + - ``init_cfg``: the config to control the initialization. + - ``init_weights``: The function of parameter + initialization and recording initialization + information. + - ``_params_init_info``: Used to track the parameter + initialization information. This attribute only + exists during executing the ``init_weights``. + + Args: + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, init_cfg=None): + """Initialize BaseModule, inherited from `torch.nn.Module`""" + + # NOTE init_cfg can be defined in different levels, but init_cfg + # in low levels has a higher priority. 
+ + super(BaseModule, self).__init__() + # define default value of init_cfg instead of hard code + # in init_weights() function + self._is_init = False + + self.init_cfg = copy.deepcopy(init_cfg) + + # Backward compatibility in derived classes + # if pretrained is not None: + # warnings.warn('DeprecationWarning: pretrained is a deprecated \ + # key, please consider using init_cfg') + # self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + @property + def is_init(self): + return self._is_init + + def init_weights(self): + """Initialize the weights.""" + + is_top_level_module = False + # check if it is top-level module + if not hasattr(self, '_params_init_info'): + # The `_params_init_info` is used to record the initialization + # information of the parameters + # the key should be the obj:`nn.Parameter` of model and the value + # should be a dict containing + # - init_info (str): The string that describes the initialization. + # - tmp_mean_value (FloatTensor): The mean of the parameter, + # which indicates whether the parameter has been modified. + # this attribute would be deleted after all parameters + # is initialized. + self._params_init_info = defaultdict(dict) + is_top_level_module = True + + # Initialize the `_params_init_info`, + # When detecting the `tmp_mean_value` of + # the corresponding parameter is changed, update related + # initialization information + for name, param in self.named_parameters(): + self._params_init_info[param][ + 'init_info'] = f'The value is the same before and ' \ + f'after calling `init_weights` ' \ + f'of {self.__class__.__name__} ' + self._params_init_info[param][ + 'tmp_mean_value'] = param.data.mean() + + # pass `params_init_info` to all submodules + # All submodules share the same `params_init_info`, + # so it will be updated when parameters are + # modified at any level of the model. + for sub_module in self.modules(): + sub_module._params_init_info = self._params_init_info + + # Get the initialized logger, if not exist, + # create a logger named `mmcv` + logger_names = list(logger_initialized.keys()) + logger_name = logger_names[0] if logger_names else 'mmcv' + + from ..utils import initialize + from ..utils.weight_init import update_init_info + module_name = self.__class__.__name__ + if not self._is_init: + if self.init_cfg: + print_log( + f'initialize {module_name} with init_cfg {self.init_cfg}', + logger=logger_name) + initialize(self, self.init_cfg) + if isinstance(self.init_cfg, dict): + # prevent the parameters of + # the pre-trained model + # from being overwritten by + # the `init_weights` + if self.init_cfg['type'] == 'Pretrained': + return + + for m in self.children(): + if hasattr(m, 'init_weights'): + m.init_weights() + # users may overload the `init_weights` + update_init_info( + m, + init_info=f'Initialized by ' + f'user-defined `init_weights`' + f' in {m.__class__.__name__} ') + + self._is_init = True + else: + warnings.warn(f'init_weights of {self.__class__.__name__} has ' + f'been called more than once.') + + if is_top_level_module: + self._dump_init_info(logger_name) + + for sub_module in self.modules(): + del sub_module._params_init_info + + @master_only + def _dump_init_info(self, logger_name): + """Dump the initialization information to a file named + `initialization.log.json` in workdir. + + Args: + logger_name (str): The name of logger. 
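+
+        Note:
+            If the logger owns a ``FileHandler``, the per-parameter information
+            is written to that file; otherwise it falls back to ``print_log``.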
+ """ + + logger = get_logger(logger_name) + + with_file_handler = False + # dump the information to the logger file if there is a `FileHandler` + for handler in logger.handlers: + if isinstance(handler, FileHandler): + handler.stream.write( + 'Name of parameter - Initialization information\n') + for name, param in self.named_parameters(): + handler.stream.write( + f'\n{name} - {param.shape}: ' + f"\n{self._params_init_info[param]['init_info']} \n") + handler.stream.flush() + with_file_handler = True + if not with_file_handler: + for name, param in self.named_parameters(): + print_log( + f'\n{name} - {param.shape}: ' + f"\n{self._params_init_info[param]['init_info']} \n ", + logger=logger_name) + + def __repr__(self): + s = super().__repr__() + if self.init_cfg: + s += f'\ninit_cfg={self.init_cfg}' + return s + + +class Sequential(BaseModule, nn.Sequential): + """Sequential module in openmmlab. + + Args: + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, *args, init_cfg=None): + BaseModule.__init__(self, init_cfg) + nn.Sequential.__init__(self, *args) + + +class ModuleList(BaseModule, nn.ModuleList): + """ModuleList in openmmlab. + + Args: + modules (iterable, optional): an iterable of modules to add. + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, modules=None, init_cfg=None): + BaseModule.__init__(self, init_cfg) + nn.ModuleList.__init__(self, modules) diff --git a/mmcv/models/backbones/resnet.py b/mmcv/models/backbones/resnet.py new file mode 100644 index 0000000..0b21416 --- /dev/null +++ b/mmcv/models/backbones/resnet.py @@ -0,0 +1,671 @@ +import warnings + +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.models.bricks import build_conv_layer, build_norm_layer, build_plugin_layer +from mmcv.models.backbones.base_module import BaseModule +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from ..utils import ResLayer + + +class BasicBlock(BaseModule): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + super(BasicBlock, self).__init__(init_cfg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' 
+ + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(BaseModule): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_cfg=None): + """Bottleneck block for ResNet. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. 
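+
+        Example (an illustrative shape check; the sizes are arbitrary)::
+
+            >>> import torch
+            >>> block = Bottleneck(inplanes=64, planes=16)
+            >>> x = torch.rand(1, 64, 56, 56)
+            >>> tuple(block(x).shape)   # out channels = planes * expansion
+            (1, 64, 56, 56)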
+ """ + super(Bottleneck, self).__init__(init_cfg) + assert style in ['pytorch', 'caffe'] + assert dcn is None or isinstance(dcn, dict) + assert plugins is None or isinstance(plugins, list) + if plugins is not None: + allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] + assert all(p['position'] in allowed_position for p in plugins) + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dcn = dcn + self.with_dcn = dcn is not None + self.plugins = plugins + self.with_plugins = plugins is not None + + if self.with_plugins: + # collect plugins for conv1/conv2/conv3 + self.after_conv1_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv1' + ] + self.after_conv2_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv2' + ] + self.after_conv3_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv3' + ] + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
+ """ + assert isinstance(plugins, list) + plugin_names = [] + for plugin in plugins: + plugin = plugin.copy() + name, layer = build_plugin_layer( + plugin, + in_channels=in_channels, + postfix=plugin.pop('postfix', '')) + assert not hasattr(self, name), f'duplicate plugin {name}' + self.add_module(name, layer) + plugin_names.append(name) + return plugin_names + + def forward_plugin(self, x, plugin_names): + out = x + for name in plugin_names: + out = getattr(self, name)(x) + return out + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class ResNet(BaseModule): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + stem_channels (int | None): Number of stem channels. If not specified, + it will be the same as `base_channels`. Default: None. + base_channels (int): Number of base channels of res layer. Default: 64. + in_channels (int): Number of input image channels. Default: 3. + num_stages (int): Resnet stages. Default: 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + - position (str, required): Position inside block to insert + plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'. + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages'. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + pretrained (str, optional): model pretrained path. Default: None + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + + Example: + >>> from mmcv.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=None, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=True, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + with_cp=False, + zero_init_residual=True, + pretrained=None, + init_cfg=None): + super(ResNet, self).__init__(init_cfg) + self.zero_init_residual = zero_init_residual + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + + block_init_cfg = None + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be setting at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is None: + if init_cfg is None: + self.init_cfg = [ + dict(type='Kaiming', layer='Conv2d'), + dict( + type='Constant', + val=1, + layer=['_BatchNorm', 'GroupNorm']) + ] + block = self.arch_settings[depth][0] + if self.zero_init_residual: + if block is BasicBlock: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm2')) + elif block is Bottleneck: + block_init_cfg = dict( + type='Constant', + val=0, + override=dict(name='norm3')) + else: + raise TypeError('pretrained must be a str or None') + + self.depth = depth + if stem_channels is None: + stem_channels = base_channels + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + 
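+                # keep only the plugins whose optional ``stages`` flag enables
+                # this stage (see ``make_stage_plugins`` below)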
stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + planes = base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins, + init_cfg=block_init_cfg) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + """Make plugins for ResNet ``stage_idx`` th stage. + + Currently we support to insert ``context_block``, + ``empirical_attention_block``, ``nonlocal_block`` into the backbone + like ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be: + + Examples: + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose ``stage_idx=0``, the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->conv3->yyy->zzz1->zzz2 + + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + + .. code-block:: none + + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. 
+ stage_idx (int): Index of stage to build + + Returns: + list[dict]: Plugins for current stage + """ + stage_plugins = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + # whether to insert plugin into current stage + if stages is None or stages[stage_idx]: + stage_plugins.append(plugin) + + return stage_plugins + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + if self.deep_stem: + self.stem = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(ResNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@BACKBONES.register_module() +class ResNetV1d(ResNet): + r"""ResNetV1d variant described in `Bag of Tricks + `_. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. 
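+
+    Example (a shape-check sketch mirroring the ``ResNet`` example above)::
+
+        >>> from mmcv.models import ResNetV1d
+        >>> import torch
+        >>> self = ResNetV1d(depth=50)
+        >>> self.eval()
+        >>> inputs = torch.rand(1, 3, 32, 32)
+        >>> level_outputs = self.forward(inputs)
+        >>> for level_out in level_outputs:
+        ...     print(tuple(level_out.shape))
+        (1, 256, 8, 8)
+        (1, 512, 4, 4)
+        (1, 1024, 2, 2)
+        (1, 2048, 1, 1)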
+ """ + + def __init__(self, **kwargs): + super(ResNetV1d, self).__init__( + deep_stem=True, avg_down=True, **kwargs) diff --git a/mmcv/models/backbones/vgg.py b/mmcv/models/backbones/vgg.py new file mode 100644 index 0000000..dcda6f1 --- /dev/null +++ b/mmcv/models/backbones/vgg.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch.nn as nn + +from ..utils.weight_init import constant_init, kaiming_init, normal_init + + +def conv3x3(in_planes, out_planes, dilation=1): + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + padding=dilation, + dilation=dilation) + + +def make_vgg_layer(inplanes, + planes, + num_blocks, + dilation=1, + with_bn=False, + ceil_mode=False): + layers = [] + for _ in range(num_blocks): + layers.append(conv3x3(inplanes, planes, dilation)) + if with_bn: + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + inplanes = planes + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +class VGG(nn.Module): + """VGG backbone. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_bn (bool): Use BatchNorm or not. + num_classes (int): number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. + """ + + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth, + with_bn=False, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=(0, 1, 2, 3, 4), + frozen_stages=-1, + bn_eval=True, + bn_frozen=False, + ceil_mode=False, + with_last_pool=True): + super(VGG, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + assert max(out_indices) <= num_stages + + self.num_classes = num_classes + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + + self.inplanes = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks * (2 + with_bn) + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + planes = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.inplanes, + planes, + num_blocks, + dilation=dilation, + with_bn=with_bn, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.inplanes = planes + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, 
pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + from ...runner import load_checkpoint + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode=True): + super(VGG, self).train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + vgg_layers = getattr(self, self.module_name) + if mode and self.frozen_stages >= 0: + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + mod = vgg_layers[j] + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/mmcv/models/backbones/vovnet.py b/mmcv/models/backbones/vovnet.py new file mode 100755 index 0000000..879d186 --- /dev/null +++ b/mmcv/models/backbones/vovnet.py @@ -0,0 +1,375 @@ + +from collections import OrderedDict +from mmcv.runner import BaseModule +from mmdet.models.builder import BACKBONES +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.batchnorm import _BatchNorm + + +VoVNet19_slim_dw_eSE = { + 'stem': [64, 64, 64], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_dw_eSE = { + 'stem': [64, 64, 64], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": True +} + +VoVNet19_slim_eSE = { + 'stem': [64, 64, 128], + 'stage_conv_ch': [64, 80, 96, 112], + 'stage_out_ch': [112, 256, 384, 512], + 'layer_per_block': 3, + 'block_per_stage': [1, 1, 1, 1], + 'eSE': True, + "dw": False +} + +VoVNet19_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 3, + "block_per_stage": [1, 1, 1, 1], + "eSE": True, + "dw": False +} + +VoVNet39_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 2, 2], + "eSE": True, + "dw": False +} + +VoVNet57_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 1, 4, 3], + "eSE": True, + "dw": False +} + +VoVNet99_eSE = { + 'stem': [64, 64, 128], + "stage_conv_ch": [128, 160, 192, 224], + "stage_out_ch": [256, 512, 768, 1024], + "layer_per_block": 5, + "block_per_stage": [1, 3, 9, 3], + "eSE": True, + "dw": False +} + +_STAGE_SPECS = { + "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, + "V-19-dw-eSE": VoVNet19_dw_eSE, + "V-19-slim-eSE": VoVNet19_slim_eSE, + "V-19-eSE": VoVNet19_eSE, + "V-39-eSE": 
VoVNet39_eSE, + "V-57-eSE": VoVNet57_eSE, + "V-99-eSE": VoVNet99_eSE, +} + + +def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + '{}_{}/dw_conv3x3'.format(module_name, postfix), + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=out_channels, + bias=False + ) + ), + ( + '{}_{}/pw_conv1x1'.format(module_name, postfix), + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) + ), + ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), + ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), + ] + + +def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): + """3x3 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): + """1x1 convolution with padding""" + return [ + ( + f"{module_name}_{postfix}/conv", + nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=False, + ), + ), + (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), + (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), + ] + + +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 + + +class eSEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(eSEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) + self.hsigmoid = Hsigmoid() + + def forward(self, x): + input = x + x = self.avg_pool(x) + x = self.fc(x) + x = self.hsigmoid(x) + return input * x + + +class _OSA_module(nn.Module): + def __init__( + self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False + ): + + super(_OSA_module, self).__init__() + + self.identity = identity + self.depthwise = depthwise + self.isReduced = False + self.layers = nn.ModuleList() + in_channel = in_ch + if self.depthwise and in_channel != stage_ch: + self.isReduced = True + self.conv_reduction = nn.Sequential( + OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) + ) + for i in range(layer_per_block): + if self.depthwise: + self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) + else: + self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) + in_channel = stage_ch + + # feature aggregation + in_channel = in_ch + layer_per_block * stage_ch + self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) + + self.ese = eSEModule(concat_ch) + + def forward(self, x): + + identity_feat = x + + output = [] + output.append(x) + if self.depthwise and self.isReduced: + x = self.conv_reduction(x) + for layer in self.layers: + x = layer(x) + output.append(x) + + x = 
torch.cat(output, dim=1) + xt = self.concat(x) + + xt = self.ese(xt) + + if self.identity: + xt = xt + identity_feat + + return xt + + +class _OSA_stage(nn.Sequential): + def __init__( + self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False + ): + + super(_OSA_stage, self).__init__() + + if not stage_num == 2: + self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) + + if block_per_stage != 1: + SE = False + module_name = f"OSA{stage_num}_1" + self.add_module( + module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) + ) + for i in range(block_per_stage - 1): + if i != block_per_stage - 2: # last block + SE = False + module_name = f"OSA{stage_num}_{i + 2}" + self.add_module( + module_name, + _OSA_module( + concat_ch, + stage_ch, + concat_ch, + layer_per_block, + module_name, + SE, + identity=True, + depthwise=depthwise + ), + ) + + +@BACKBONES.register_module() +class VoVNet(BaseModule): + def __init__(self, spec_name, input_ch=3, out_features=None, + frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): + """ + Args: + input_ch(int) : the number of input channel + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "stage2" ... + """ + super(VoVNet, self).__init__(init_cfg) + self.frozen_stages = frozen_stages + self.norm_eval = norm_eval + + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + stage_specs = _STAGE_SPECS[spec_name] + + stem_ch = stage_specs["stem"] + config_stage_ch = stage_specs["stage_conv_ch"] + config_concat_ch = stage_specs["stage_out_ch"] + block_per_stage = stage_specs["block_per_stage"] + layer_per_block = stage_specs["layer_per_block"] + SE = stage_specs["eSE"] + depthwise = stage_specs["dw"] + + self._out_features = out_features + + # Stem module + conv_type = dw_conv3x3 if depthwise else conv3x3 + stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) + stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) + stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) + self.add_module("stem", nn.Sequential((OrderedDict(stem)))) + current_stirde = 4 + self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} + self._out_feature_channels = {"stem": stem_ch[2]} + + stem_out_ch = [stem_ch[2]] + in_ch_list = stem_out_ch + config_concat_ch[:-1] + # OSA stages + self.stage_names = [] + for i in range(4): # num_stages + name = "stage%d" % (i + 2) # stage 2 ... 
stage 5 + self.stage_names.append(name) + self.add_module( + name, + _OSA_stage( + in_ch_list[i], + config_stage_ch[i], + config_concat_ch[i], + block_per_stage[i], + layer_per_block, + i + 2, + SE, + depthwise, + ), + ) + + self._out_feature_channels[name] = config_concat_ch[i] + if not i == 0: + self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) + + # initialize weights + # self._initialize_weights() + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + + def forward(self, x): + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name in self.stage_names: + x = getattr(self, name)(x) + if name in self._out_features: + outputs[name] = x + + return outputs + + def _freeze_stages(self): + if self.frozen_stages >= 0: + m = getattr(self, 'stem') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'stage{i+1}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(VoVNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() \ No newline at end of file diff --git a/mmcv/models/bricks/__init__.py b/mmcv/models/bricks/__init__.py new file mode 100644 index 0000000..c492806 --- /dev/null +++ b/mmcv/models/bricks/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .activation import build_activation_layer +from .conv import build_conv_layer +from .plugin import build_plugin_layer +from .conv_module import ConvModule +from .drop import Dropout, DropPath +from .norm import build_norm_layer, is_norm +from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, + Linear, MaxPool2d, MaxPool3d) +from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, + PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS) +from .transformer import build_positional_encoding \ No newline at end of file diff --git a/mmcv/models/bricks/activation.py b/mmcv/models/bricks/activation.py new file mode 100644 index 0000000..5606745 --- /dev/null +++ b/mmcv/models/bricks/activation.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmcv.utils import build_from_cfg, digit_version, TORCH_VERSION +from .registry import ACTIVATION_LAYERS + +for module in [ + nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, + nn.Sigmoid, nn.Tanh +]: + ACTIVATION_LAYERS.register_module(module=module) + + +@ACTIVATION_LAYERS.register_module(name='Clip') +@ACTIVATION_LAYERS.register_module() +class Clamp(nn.Module): + """Clamp activation layer. + + This activation function is to clamp the feature map value within + :math:`[min, max]`. More details can be found in ``torch.clamp()``. + + Args: + min (Number | optional): Lower-bound of the range to be clamped to. + Default to -1. + max (Number | optional): Upper-bound of the range to be clamped to. + Default to 1. + """ + + def __init__(self, min=-1., max=1.): + super(Clamp, self).__init__() + self.min = min + self.max = max + + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): The input tensor. 
+ + Returns: + torch.Tensor: Clamped tensor. + """ + return torch.clamp(x, min=self.min, max=self.max) + + +class GELU(nn.Module): + r"""Applies the Gaussian Error Linear Units function: + + .. math:: + \text{GELU}(x) = x * \Phi(x) + where :math:`\Phi(x)` is the Cumulative Distribution Function for + Gaussian Distribution. + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input): + return F.gelu(input) + + +if (digit_version(TORCH_VERSION) < digit_version('1.4')): + ACTIVATION_LAYERS.register_module(module=GELU) +else: + ACTIVATION_LAYERS.register_module(module=nn.GELU) + + +def build_activation_layer(cfg): + """Build activation layer. + + Args: + cfg (dict): The activation layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an activation layer. + + Returns: + nn.Module: Created activation layer. + """ + return build_from_cfg(cfg, ACTIVATION_LAYERS) diff --git a/mmcv/models/bricks/conv.py b/mmcv/models/bricks/conv.py new file mode 100644 index 0000000..cf54491 --- /dev/null +++ b/mmcv/models/bricks/conv.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch import nn + +from .registry import CONV_LAYERS + +CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d) +CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d) +CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) +CONV_LAYERS.register_module('Conv', module=nn.Conv2d) + + +def build_conv_layer(cfg, *args, **kwargs): + """Build convolution layer. + + Args: + cfg (None or dict): The conv layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an conv layer. + args (argument list): Arguments passed to the `__init__` + method of the corresponding conv layer. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding conv layer. + + Returns: + nn.Module: Created conv layer. + """ + if cfg is None: + cfg_ = dict(type='Conv2d') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in CONV_LAYERS: + raise KeyError(f'Unrecognized norm type {layer_type}') + else: + conv_layer = CONV_LAYERS.get(layer_type) + + layer = conv_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/mmcv/models/bricks/conv_module.py b/mmcv/models/bricks/conv_module.py new file mode 100644 index 0000000..bbbc616 --- /dev/null +++ b/mmcv/models/bricks/conv_module.py @@ -0,0 +1,207 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn + +from torch.nn.modules.instancenorm import _InstanceNorm +from torch.nn.modules.batchnorm import _BatchNorm +from ..utils import constant_init, kaiming_init +from .activation import build_activation_layer +from .conv import build_conv_layer +from .norm import build_norm_layer +from .padding import build_padding_layer +from .registry import PLUGIN_LAYERS + + +@PLUGIN_LAYERS.register_module() +class ConvModule(nn.Module): + """A conv block that bundles conv/norm/activation layers. 
+ + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + inplace (bool): Whether to use inplace mode for activation. + Default: True. + with_spectral_norm (bool): Whether use spectral norm in conv module. + Default: False. + padding_mode (str): If the `padding_mode` has not been supported by + current `Conv2d` in PyTorch, we will use our own padding layer + instead. Currently, we support ['zeros', 'circular'] with official + implementation and ['reflect'] with our own implementation. + Default: 'zeros'. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + Default: ('conv', 'norm', 'act'). + """ + + _abbr_ = 'conv_block' + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias='auto', + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + inplace=True, + with_spectral_norm=False, + padding_mode='zeros', + order=('conv', 'norm', 'act')): + super(ConvModule, self).__init__() + assert conv_cfg is None or isinstance(conv_cfg, dict) + assert norm_cfg is None or isinstance(norm_cfg, dict) + assert act_cfg is None or isinstance(act_cfg, dict) + official_padding_mode = ['zeros', 'circular'] + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.inplace = inplace + self.with_spectral_norm = with_spectral_norm + self.with_explicit_padding = padding_mode not in official_padding_mode + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == set(['conv', 'norm', 'act']) + + self.with_norm = norm_cfg is not None + self.with_activation = act_cfg is not None + # if the conv layer is before a norm layer, bias is unnecessary. 
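+        # e.g. (illustrative) ConvModule(3, 16, 3, norm_cfg=dict(type='BN'))
+        # resolves bias to False, while ConvModule(3, 16, 3) resolves it to True.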
+ if bias == 'auto': + bias = not self.with_norm + self.with_bias = bias + + if self.with_explicit_padding: + pad_cfg = dict(type=padding_mode) + self.padding_layer = build_padding_layer(pad_cfg, padding) + + # reset padding to 0 for conv module + conv_padding = 0 if self.with_explicit_padding else padding + # build convolution layer + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=conv_padding, + dilation=dilation, + groups=groups, + bias=bias) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + if self.with_spectral_norm: + self.conv = nn.utils.spectral_norm(self.conv) + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index('norm') > order.index('conv'): + norm_channels = out_channels + else: + norm_channels = in_channels + self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + self.add_module(self.norm_name, norm) + if self.with_bias: + if isinstance(norm, (_BatchNorm, _InstanceNorm)): + warnings.warn( + 'Unnecessary conv bias before batch/instance norm') + else: + self.norm_name = None + + # build activation layer + if self.with_activation: + act_cfg_ = act_cfg.copy() + # nn.Tanh has no 'inplace' argument + if act_cfg_['type'] not in [ + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' + ]: + act_cfg_.setdefault('inplace', inplace) + self.activate = build_activation_layer(act_cfg_) + + # Use msra init by default + self.init_weights() + + @property + def norm(self): + if self.norm_name: + return getattr(self, self.norm_name) + else: + return None + + def init_weights(self): + # 1. It is mainly for customized conv layers with their own + # initialization manners by calling their own ``init_weights()``, + # and we do not want ConvModule to override the initialization. + # 2. For customized conv layers without their own initialization + # manners (that is, they don't have their own ``init_weights()``) + # and PyTorch's conv layers, they will be initialized by + # this method with default ``kaiming_init``. + # Note: For PyTorch's conv layers, they will be overwritten by our + # initialization implementation using default ``kaiming_init``. + if not hasattr(self.conv, 'init_weights'): + if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': + nonlinearity = 'leaky_relu' + a = self.act_cfg.get('negative_slope', 0.01) + else: + nonlinearity = 'relu' + a = 0 + kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) + if self.with_norm: + constant_init(self.norm, 1, bias=0) + + def forward(self, x, activate=True, norm=True): + for layer in self.order: + if layer == 'conv': + if self.with_explicit_padding: + x = self.padding_layer(x) + x = self.conv(x) + elif layer == 'norm' and norm and self.with_norm: + x = self.norm(x) + elif layer == 'act' and activate and self.with_activation: + x = self.activate(x) + return x diff --git a/mmcv/models/bricks/drop.py b/mmcv/models/bricks/drop.py new file mode 100644 index 0000000..b0a0266 --- /dev/null +++ b/mmcv/models/bricks/drop.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
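+#
+# A minimal usage sketch (illustrative only; ``branch`` is hypothetical): a
+# residual branch can be wrapped with stochastic depth built from the registry
+# defined below, e.g.
+#
+#     drop = build_dropout(dict(type='DropPath', drop_prob=0.1))
+#     x = x + drop(branch(x))   # whole branches are zeroed per sample in training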
+import torch +import torch.nn as nn + +from mmcv import build_from_cfg +from .registry import DROPOUT_LAYERS + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + # handle tensors with different dimensions, not just 4D tensors. + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + output = x.div(keep_prob) * random_tensor.floor() + return output + + +@DROPOUT_LAYERS.register_module() +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + + Args: + drop_prob (float): Probability of the path to be zeroed. Default: 0.1 + """ + + def __init__(self, drop_prob=0.1): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +@DROPOUT_LAYERS.register_module() +class Dropout(nn.Dropout): + """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of + ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with + ``DropPath`` + + Args: + drop_prob (float): Probability of the elements to be + zeroed. Default: 0.5. + inplace (bool): Do the operation inplace or not. Default: False. + """ + + def __init__(self, drop_prob=0.5, inplace=False): + super().__init__(p=drop_prob, inplace=inplace) + + +def build_dropout(cfg, default_args=None): + """Builder for drop out layers.""" + return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) diff --git a/mmcv/models/bricks/norm.py b/mmcv/models/bricks/norm.py new file mode 100644 index 0000000..7c40c99 --- /dev/null +++ b/mmcv/models/bricks/norm.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect + +import torch.nn as nn + +from mmcv.utils import is_tuple_of +from torch.nn.modules.instancenorm import _InstanceNorm +from torch.nn.modules.batchnorm import _BatchNorm +from .registry import NORM_LAYERS + +NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d) +NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d) +NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d) +NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d) +NORM_LAYERS.register_module('SyncBN', module=nn.SyncBatchNorm) +NORM_LAYERS.register_module('GN', module=nn.GroupNorm) +NORM_LAYERS.register_module('LN', module=nn.LayerNorm) +NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d) +NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d) +NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d) +NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d) + + +def infer_abbr(class_type): + """Infer abbreviation from the class name. + + When we build a norm layer with `build_norm_layer()`, we want to preserve + the norm type in variable names, e.g, self.bn1, self.gn. This method will + infer the abbreviation to map class types to abbreviations. + + Rule 1: If the class has the property "_abbr_", return the property. 
+ Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or + InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and + "in" respectively. + Rule 3: If the class name contains "batch", "group", "layer" or "instance", + the abbreviation of this layer will be "bn", "gn", "ln" and "in" + respectively. + Rule 4: Otherwise, the abbreviation falls back to "norm". + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ + if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN + return 'in' + elif issubclass(class_type, _BatchNorm): + return 'bn' + elif issubclass(class_type, nn.GroupNorm): + return 'gn' + elif issubclass(class_type, nn.LayerNorm): + return 'ln' + else: + class_name = class_type.__name__.lower() + if 'batch' in class_name: + return 'bn' + elif 'group' in class_name: + return 'gn' + elif 'layer' in class_name: + return 'ln' + elif 'instance' in class_name: + return 'in' + else: + return 'norm_layer' + + +def build_norm_layer(cfg, num_features, postfix=''): + """Build normalization layer. + + Args: + cfg (dict): The norm layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate a norm layer. + - requires_grad (bool, optional): Whether stop gradient updates. + num_features (int): Number of input channels. + postfix (int | str): The postfix to be appended into norm abbreviation + to create named layer. + + Returns: + (str, nn.Module): The first element is the layer name consisting of + abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in NORM_LAYERS: + raise KeyError(f'Unrecognized norm type {layer_type}') + + norm_layer = NORM_LAYERS.get(layer_type) + abbr = infer_abbr(norm_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + requires_grad = cfg_.pop('requires_grad', True) + cfg_.setdefault('eps', 1e-5) + if layer_type != 'GN': + layer = norm_layer(num_features, **cfg_) + if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): + layer._specify_ddp_gpu_num(1) + else: + assert 'num_groups' in cfg_ + layer = norm_layer(num_channels=num_features, **cfg_) + + for param in layer.parameters(): + param.requires_grad = requires_grad + + return name, layer + + +def is_norm(layer, exclude=None): + """Check if a layer is a normalization layer. + + Args: + layer (nn.Module): The layer to be checked. + exclude (type | tuple[type]): Types to be excluded. + + Returns: + bool: Whether the layer is a norm layer. 
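+
+    Example (illustrative)::
+
+        >>> import torch.nn as nn
+        >>> is_norm(nn.BatchNorm2d(4))
+        True
+        >>> is_norm(nn.BatchNorm2d(4), exclude=nn.BatchNorm2d)
+        False
+        >>> is_norm(nn.Conv2d(3, 8, 3))
+        False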
+ """ + if exclude is not None: + if not isinstance(exclude, tuple): + exclude = (exclude, ) + if not is_tuple_of(exclude, type): + raise TypeError( + f'"exclude" must be either None or type or a tuple of types, ' + f'but got {type(exclude)}: {exclude}') + + if exclude and isinstance(layer, exclude): + return False + + all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) + return isinstance(layer, all_norm_bases) diff --git a/mmcv/models/bricks/padding.py b/mmcv/models/bricks/padding.py new file mode 100644 index 0000000..e4ac6b2 --- /dev/null +++ b/mmcv/models/bricks/padding.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from .registry import PADDING_LAYERS + +PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) +PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) +PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) + + +def build_padding_layer(cfg, *args, **kwargs): + """Build padding layer. + + Args: + cfg (None or dict): The padding layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate a padding layer. + + Returns: + nn.Module: Created padding layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + + cfg_ = cfg.copy() + padding_type = cfg_.pop('type') + if padding_type not in PADDING_LAYERS: + raise KeyError(f'Unrecognized padding type {padding_type}.') + else: + padding_layer = PADDING_LAYERS.get(padding_type) + + layer = padding_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/mmcv/models/bricks/plugin.py b/mmcv/models/bricks/plugin.py new file mode 100644 index 0000000..07c010d --- /dev/null +++ b/mmcv/models/bricks/plugin.py @@ -0,0 +1,88 @@ +import inspect +import platform + +from .registry import PLUGIN_LAYERS + +if platform.system() == 'Windows': + import regex as re +else: + import re + + +def infer_abbr(class_type): + """Infer abbreviation from the class name. + + This method will infer the abbreviation to map class types to + abbreviations. + + Rule 1: If the class has the property "abbr", return the property. + Rule 2: Otherwise, the abbreviation falls back to snake case of class + name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``. + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + + def camel2snack(word): + """Convert camel case word into snack case. + + Modified from `inflection lib + `_. + + Example:: + + >>> camel2snack("FancyBlock") + 'fancy_block' + """ + + word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word) + word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word) + word = word.replace('-', '_') + return word.lower() + + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ + else: + return camel2snack(class_type.__name__) + + +def build_plugin_layer(cfg, postfix='', **kwargs): + """Build plugin layer. + + Args: + cfg (None or dict): cfg should contain: + type (str): identify plugin layer type. + layer args: args needed to instantiate a plugin layer. + postfix (int, str): appended into norm abbreviation to + create named layer. Default: ''. 
+ + Returns: + tuple[str, nn.Module]: + name (str): abbreviation + postfix + layer (nn.Module): created plugin layer + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in PLUGIN_LAYERS: + raise KeyError(f'Unrecognized plugin type {layer_type}') + + plugin_layer = PLUGIN_LAYERS.get(layer_type) + abbr = infer_abbr(plugin_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + layer = plugin_layer(**kwargs, **cfg_) + + return name, layer diff --git a/mmcv/models/bricks/registry.py b/mmcv/models/bricks/registry.py new file mode 100644 index 0000000..c292797 --- /dev/null +++ b/mmcv/models/bricks/registry.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry + +CONV_LAYERS = Registry('conv layer') +NORM_LAYERS = Registry('norm layer') +ACTIVATION_LAYERS = Registry('activation layer') +PADDING_LAYERS = Registry('padding layer') +UPSAMPLE_LAYERS = Registry('upsample layer') +PLUGIN_LAYERS = Registry('plugin layer') + +DROPOUT_LAYERS = Registry('drop out layers') +POSITIONAL_ENCODING = Registry('position encoding') +ATTENTION = Registry('attention') +FEEDFORWARD_NETWORK = Registry('feed-forward Network') +TRANSFORMER_LAYER = Registry('transformerLayer') +TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') diff --git a/mmcv/models/bricks/transformer.py b/mmcv/models/bricks/transformer.py new file mode 100644 index 0000000..df6e532 --- /dev/null +++ b/mmcv/models/bricks/transformer.py @@ -0,0 +1,611 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv import ConfigDict, deprecated_api_warning +from .wrappers import Linear +from .activation import build_activation_layer +from .norm import build_norm_layer +# from mmcv.models.bricks import Linear, build_activation_layer, build_norm_layer +from ..backbones.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import build_from_cfg +from .drop import build_dropout +from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) + +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. 
') + + +def build_positional_encoding(cfg, default_args=None): + """Builder for Position Encoding.""" + return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) + + +def build_attention(cfg, default_args=None): + """Builder for attention.""" + return build_from_cfg(cfg, ATTENTION, default_args) + + +def build_feedforward_network(cfg, default_args=None): + """Builder for feed-forward network (FFN).""" + return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args) + + +def build_transformer_layer(cfg, default_args=None): + """Builder for transformer layer.""" + return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args) + + +def build_transformer_layer_sequence(cfg, default_args=None): + """Builder for transformer encoder and transformer decoder.""" + return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) + + +@ATTENTION.register_module() +class MultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + batch_first=False, + with_cp=False, + **kwargs): + super(MultiheadAttention, self).__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn('The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ') + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + self.with_cp = with_cp + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiheadAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. 
+ identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + if self.with_cp: + out = cp.checkpoint(self.attn, use_reentrant=False, query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + else: + out = self.attn( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + +@FEEDFORWARD_NETWORK.register_module() +class FFN(BaseModule): + """Implements feed-forward networks (FFNs) with identity connection. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
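+
+    Example::
+
+        >>> # illustrative usage; shapes are preserved by the identity connection
+        >>> import torch
+        >>> ffn = FFN(embed_dims=256, feedforward_channels=1024)
+        >>> x = torch.rand(2, 100, 256)
+        >>> ffn(x).shape
+        torch.Size([2, 100, 256])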
+ """ + + @deprecated_api_warning( + { + 'dropout': 'ffn_drop', + 'add_residual': 'add_identity' + }, + cls_name='FFN') + def __init__(self, + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0., + dropout_layer=None, + add_identity=True, + init_cfg=None, + **kwargs): + super(FFN, self).__init__(init_cfg) + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) + + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append( + Sequential( + Linear(in_channels, feedforward_channels), self.activate, + nn.Dropout(ffn_drop))) + in_channels = feedforward_channels + layers.append(Linear(feedforward_channels, embed_dims)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') + def forward(self, x, identity=None): + """Forward function for `FFN`. + + The function would add x to the output tensor if residue is None. + """ + out = self.layers(x) + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +@TRANSFORMER_LAYER.register_module() +class BaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
+ """ + + def __init__(self, + attn_cfgs=None, + with_cp=False, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(BaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index], + dict(type='FFN'))) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + self.with_cp = with_cp + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + if self.with_cp: + query = cp.checkpoint(self.ffns[ffn_index], query) + else: + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class TransformerLayerSequence(BaseModule): + """Base class for TransformerEncoder and TransformerDecoder in vision + transformer. + + As base-class of Encoder and Decoder in vision transformer. + Support customization such as specifying different kind + of `transformer_layer` in `transformer_coder`. + + Args: + transformerlayer (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict`): Config of transformerlayer + in TransformerCoder. If it is obj:`mmcv.ConfigDict`, + it would be repeated `num_layer` times to a + list[`mmcv.ConfigDict`]. Default: None. + num_layers (int): The number of `TransformerLayer`. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + super(TransformerLayerSequence, self).__init__(init_cfg) + if isinstance(transformerlayers, dict): + transformerlayers = [ + copy.deepcopy(transformerlayers) for _ in range(num_layers) + ] + else: + assert isinstance(transformerlayers, list) and \ + len(transformerlayers) == num_layers + self.num_layers = num_layers + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append(build_transformer_layer(transformerlayers[i])) + self.embed_dims = self.layers[0].embed_dims + self.pre_norm = self.layers[0].pre_norm + + def forward(self, + query, + key, + value, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerCoder`. + + Args: + query (Tensor): Input query with shape + `(num_queries, bs, embed_dims)`. + key (Tensor): The key tensor with shape + `(num_keys, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_keys, bs, embed_dims)`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor], optional): Each element is 2D Tensor + which is used in calculation of corresponding attention in + operation_order. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in self-attention + Default: None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: results with shape [num_queries, bs, embed_dims]. + """ + for layer in self.layers: + query = layer( + query, + key, + value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + **kwargs) + return query diff --git a/mmcv/models/bricks/wrappers.py b/mmcv/models/bricks/wrappers.py new file mode 100644 index 0000000..e4bed07 --- /dev/null +++ b/mmcv/models/bricks/wrappers.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 + +Wrap some nn modules to support empty tensor input. Currently, these wrappers +are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask +heads are trained on only positive RoIs. 
+""" +import math + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair, _triple + +from .registry import CONV_LAYERS, UPSAMPLE_LAYERS + +TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) + + +def obsolete_torch_version(torch_version, version_threshold): + return torch_version <= version_threshold + + +class NewEmptyTensorOp(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return NewEmptyTensorOp.apply(grad, shape), None + + +@CONV_LAYERS.register_module('Conv', force=True) +class Conv2d(nn.Conv2d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@CONV_LAYERS.register_module('Conv3d', force=True) +class Conv3d(nn.Conv3d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@CONV_LAYERS.register_module() +@CONV_LAYERS.register_module('deconv') +@UPSAMPLE_LAYERS.register_module('deconv', force=True) +class ConvTranspose2d(nn.ConvTranspose2d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@CONV_LAYERS.register_module() +@CONV_LAYERS.register_module('deconv3d') +@UPSAMPLE_LAYERS.register_module('deconv3d', force=True) +class ConvTranspose3d(nn.ConvTranspose3d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +class MaxPool2d(nn.MaxPool2d): + + def forward(self, x): + # PyTorch 1.9 does not support empty tensor inference yet + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), + _pair(self.padding), _pair(self.stride), + _pair(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class MaxPool3d(nn.MaxPool3d): + + def forward(self, x): + # PyTorch 1.9 does not support empty tensor inference yet + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), + _triple(self.padding), + _triple(self.stride), + _triple(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class Linear(torch.nn.Linear): + + def forward(self, x): + # empty tensor forward of Linear layer is supported in Pytorch 1.6 + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): + out_shape = [x.shape[0], self.out_features] + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) diff --git a/mmcv/models/builder.py b/mmcv/models/builder.py new file mode 100644 index 0000000..798e70d --- /dev/null +++ b/mmcv/models/builder.py @@ -0,0 +1,137 @@ +import warnings + +# from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.utils import Registry +from .backbones.base_module import Sequential +from ..utils import Registry, build_from_cfg + +######### from mmcv.cnn +def build_model_from_cfg(cfg, registry, default_args=None): + """Build a PyTorch model from config dict(s). Different from + ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. + + Args: + cfg (dict, list[dict]): The config of modules, is is either a config + dict or a list of config dicts. If cfg is a list, a + the built modules will be wrapped with ``nn.Sequential``. + registry (:obj:`Registry`): A registry the module belongs to. + default_args (dict, optional): Default arguments to build the module. + Defaults to None. + + Returns: + nn.Module: A built nn module. 
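+
+    Example (illustrative; ``MyNeck`` is a hypothetical module registered in
+    the given registry)::
+
+        >>> one = build_model_from_cfg(dict(type='MyNeck'), NECKS)
+        >>> seq = build_model_from_cfg(
+        ...     [dict(type='MyNeck'), dict(type='MyNeck')], NECKS)  # nn.Sequential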
+ """ + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +CNN_MODELS = Registry('model', build_func=build_model_from_cfg) + + +MODELS = Registry('models', parent=CNN_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +ROI_EXTRACTORS = MODELS +SHARED_HEADS = MODELS +HEADS = MODELS +LOSSES = MODELS +DETECTORS = MODELS +SEGMENTORS = MODELS + +VOXEL_ENCODERS = MODELS +MIDDLE_ENCODERS = MODELS +FUSION_LAYERS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_roi_extractor(cfg): + """Build roi extractor.""" + return ROI_EXTRACTORS.build(cfg) + + +def build_shared_head(cfg): + """Build shared head.""" + return SHARED_HEADS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model', UserWarning) + assert cfg.get('train_cfg') is None or train_cfg is None, \ + 'train_cfg specified in both outer field and model field ' + assert cfg.get('test_cfg') is None or test_cfg is None, \ + 'test_cfg specified in both outer field and model field ' + return DETECTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_segmentor(cfg, train_cfg=None, test_cfg=None): + """Build segmentor.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model', UserWarning) + assert cfg.get('train_cfg') is None or train_cfg is None, \ + 'train_cfg specified in both outer field and model field ' + assert cfg.get('test_cfg') is None or test_cfg is None, \ + 'test_cfg specified in both outer field and model field ' + return SEGMENTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + """A function warpper for building 3D detector or segmentor according to + cfg. + + Should be deprecated in the future. 
+ """ + if cfg.type in ['EncoderDecoder3D']: + return build_segmentor(cfg, train_cfg=train_cfg, test_cfg=test_cfg) + else: + return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg) + + +def build_voxel_encoder(cfg): + """Build voxel encoder.""" + return VOXEL_ENCODERS.build(cfg) + + +def build_middle_encoder(cfg): + """Build middle level encoder.""" + return MIDDLE_ENCODERS.build(cfg) + + +def build_fusion_layer(cfg): + """Build fusion layer.""" + return FUSION_LAYERS.build(cfg) + + + diff --git a/mmcv/models/dense_heads/VAD_head.py b/mmcv/models/dense_heads/VAD_head.py new file mode 100644 index 0000000..8dc29b3 --- /dev/null +++ b/mmcv/models/dense_heads/VAD_head.py @@ -0,0 +1,1898 @@ +import copy +from math import pi, cos, sin + +import torch +import numpy as np +import torch.nn as nn +import matplotlib.pyplot as plt +import torch.nn.functional as F +from mmcv.models.builder import HEADS, build_loss +from mmcv.models.dense_heads import DETRHead +from mmcv.utils import force_fp32, auto_fp16 +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.core.bbox.builder import build_assigner, build_sampler +from mmcv.core.bbox.coder import build_bbox_coder +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.core.bbox.transforms import bbox_xyxy_to_cxcywh +from mmcv.models.bricks import Linear +from mmcv.models.utils import bias_init_with_prob, xavier_init +from mmcv.core.utils import (multi_apply, multi_apply, reduce_mean) +from mmcv.models.bricks.transformer import build_transformer_layer_sequence + +from mmcv.core.bbox.util import normalize_bbox +from mmcv.models.vad_utils.traj_lr_warmup import get_traj_warmup_loss_weight +from mmcv.models.vad_utils.map_utils import ( + normalize_2d_pts, normalize_2d_bbox, denormalize_2d_pts, denormalize_2d_bbox +) + +class MLP(nn.Module): + def __init__(self, in_channels, hidden_unit, verbose=False): + super(MLP, self).__init__() + self.mlp = nn.Sequential( + nn.Linear(in_channels, hidden_unit), + nn.LayerNorm(hidden_unit), + nn.ReLU() + ) + + def forward(self, x): + x = self.mlp(x) + return x + +class LaneNet(nn.Module): + def __init__(self, in_channels, hidden_unit, num_subgraph_layers): + super(LaneNet, self).__init__() + self.num_subgraph_layers = num_subgraph_layers + self.layer_seq = nn.Sequential() + for i in range(num_subgraph_layers): + self.layer_seq.add_module( + f'lmlp_{i}', MLP(in_channels, hidden_unit)) + in_channels = hidden_unit*2 + + def forward(self, pts_lane_feats): + ''' + Extract lane_feature from vectorized lane representation + + Args: + pts_lane_feats: [batch size, max_pnum, pts, D] + + Returns: + inst_lane_feats: [batch size, max_pnum, D] + ''' + x = pts_lane_feats + for name, layer in self.layer_seq.named_modules(): + if isinstance(layer, MLP): + # x [bs,max_lane_num,9,dim] + x = layer(x) + x_max = torch.max(x, -2)[0] + x_max = x_max.unsqueeze(2).repeat(1, 1, x.shape[2], 1) + x = torch.cat([x, x_max], dim=-1) + x_max = torch.max(x, -2)[0] + return x_max + + +@HEADS.register_module() +class VADHead(DETRHead): + """Head of VAD model. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
+ """ + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + fut_ts=6, + fut_mode=6, + loss_traj=dict(type='L1Loss', loss_weight=0.25), + loss_traj_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=0.8), + map_bbox_coder=None, + map_num_query=900, + map_num_classes=3, + map_num_vec=20, + map_num_pts_per_vec=2, + map_num_pts_per_gt_vec=2, + map_query_embed_type='all_pts', + map_transform_method='minmax', + map_gt_shift_pts_pattern='v0', + map_dir_interval=1, + map_code_size=None, + map_code_weights=None, + loss_map_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_map_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_map_iou=dict(type='GIoULoss', loss_weight=2.0), + loss_map_pts=dict( + type='ChamferDistance',loss_src_weight=1.0,loss_dst_weight=1.0 + ), + loss_map_dir=dict(type='PtsDirCosLoss', loss_weight=2.0), + tot_epoch=None, + use_traj_lr_warmup=False, + motion_decoder=None, + motion_map_decoder=None, + use_pe=False, + motion_det_score=None, + map_thresh=0.5, + dis_thresh=0.2, + pe_normalization=True, + ego_his_encoder=None, + ego_fut_mode=3, + loss_plan_reg=dict(type='L1Loss', loss_weight=0.25), + loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=0.1), + loss_plan_col=dict(type='PlanAgentDisLoss', loss_weight=0.1), + loss_plan_dir=dict(type='PlanMapThetaLoss', loss_weight=0.1), + ego_agent_decoder=None, + ego_map_decoder=None, + query_thresh=None, + query_use_fix_pad=None, + ego_lcf_feat_idx=None, + valid_fut_ts=6, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.tot_epoch = tot_epoch + self.use_traj_lr_warmup = use_traj_lr_warmup + self.motion_decoder = motion_decoder + self.motion_map_decoder = motion_map_decoder + self.use_pe = use_pe + self.motion_det_score = motion_det_score + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.pe_normalization = pe_normalization + self.ego_his_encoder = ego_his_encoder + self.ego_fut_mode = ego_fut_mode + self.ego_agent_decoder = ego_agent_decoder + self.ego_map_decoder = ego_map_decoder + self.query_thresh = query_thresh + self.query_use_fix_pad = query_use_fix_pad + self.ego_lcf_feat_idx = ego_lcf_feat_idx + self.valid_fut_ts = valid_fut_ts + + if loss_traj_cls['use_sigmoid'] == True: + self.traj_num_cls = 1 + else: + self.traj_num_cls = 2 + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + if map_code_size is not None: + self.map_code_size = map_code_size + else: + self.map_code_size = 10 + if map_code_weights is not None: + self.map_code_weights = map_code_weights + else: + self.map_code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + + self.map_bbox_coder = build_bbox_coder(map_bbox_coder) + 
self.map_query_embed_type = map_query_embed_type + self.map_transform_method = map_transform_method + self.map_gt_shift_pts_pattern = map_gt_shift_pts_pattern + map_num_query = map_num_vec * map_num_pts_per_vec + self.map_num_query = map_num_query + self.map_num_classes = map_num_classes + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.map_num_pts_per_gt_vec = map_num_pts_per_gt_vec + self.map_dir_interval = map_dir_interval + + if loss_map_cls['use_sigmoid'] == True: + self.map_cls_out_channels = map_num_classes + else: + self.map_cls_out_channels = map_num_classes + 1 + + self.map_bg_cls_weight = 0 + map_class_weight = loss_map_cls.get('class_weight', None) + if map_class_weight is not None and (self.__class__ is VADHead): + assert isinstance(map_class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(map_class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + map_bg_cls_weight = loss_map_cls.get('bg_cls_weight', map_class_weight) + assert isinstance(map_bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(map_bg_cls_weight)}.' + map_class_weight = torch.ones(map_num_classes + 1) * map_class_weight + # set background class as the last indice + map_class_weight[map_num_classes] = map_bg_cls_weight + loss_map_cls.update({'class_weight': map_class_weight}) + if 'bg_cls_weight' in loss_map_cls: + loss_map_cls.pop('bg_cls_weight') + self.map_bg_cls_weight = map_bg_cls_weight + + self.traj_bg_cls_weight = 0 + + super(VADHead, self).__init__(*args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + self.map_code_weights = nn.Parameter(torch.tensor( + self.map_code_weights, requires_grad=False), requires_grad=False) + + if kwargs['train_cfg'] is not None: + assert 'map_assigner' in kwargs['train_cfg'], 'map assigner should be provided '\ + 'when train_cfg is set.' + map_assigner = kwargs['train_cfg']['map_assigner'] + assert loss_map_cls['loss_weight'] == map_assigner['cls_cost']['weight'], \ + 'The classification weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_bbox['loss_weight'] == map_assigner['reg_cost'][ + 'weight'], 'The regression L1 weight for loss and matcher ' \ + 'should be exactly the same.' + assert loss_map_iou['loss_weight'] == map_assigner['iou_cost']['weight'], \ + 'The regression iou weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_map_pts['loss_weight'] == map_assigner['pts_cost']['weight'], \ + 'The regression l1 weight for map pts loss and matcher should be' \ + 'exactly the same.' 
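+            # The matcher costs above must mirror the corresponding loss weights so
+            # that Hungarian assignment and the training losses optimize the same
+            # objective.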
+ + self.map_assigner = build_assigner(map_assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.map_sampler = build_sampler(sampler_cfg, context=self) + + self.loss_traj = build_loss(loss_traj) + self.loss_traj_cls = build_loss(loss_traj_cls) + self.loss_map_bbox = build_loss(loss_map_bbox) + self.loss_map_cls = build_loss(loss_map_cls) + self.loss_map_iou = build_loss(loss_map_iou) + self.loss_map_pts = build_loss(loss_map_pts) + self.loss_map_dir = build_loss(loss_map_dir) + self.loss_plan_reg = build_loss(loss_plan_reg) + self.loss_plan_bound = build_loss(loss_plan_bound) + self.loss_plan_col = build_loss(loss_plan_col) + self.loss_plan_dir = build_loss(loss_plan_dir) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + cls_branch = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + traj_branch = [] + for _ in range(self.num_reg_fcs): + traj_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_branch.append(nn.ReLU()) + traj_branch.append(Linear(self.embed_dims*2, self.fut_ts*2)) + traj_branch = nn.Sequential(*traj_branch) + + traj_cls_branch = [] + for _ in range(self.num_reg_fcs): + traj_cls_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(Linear(self.embed_dims*2, self.traj_num_cls)) + traj_cls_branch = nn.Sequential(*traj_cls_branch) + + map_cls_branch = [] + for _ in range(self.num_reg_fcs): + map_cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_cls_branch.append(nn.LayerNorm(self.embed_dims)) + map_cls_branch.append(nn.ReLU(inplace=True)) + map_cls_branch.append(Linear(self.embed_dims, self.map_cls_out_channels)) + map_cls_branch = nn.Sequential(*map_cls_branch) + + map_reg_branch = [] + for _ in range(self.num_reg_fcs): + map_reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + map_reg_branch.append(nn.ReLU()) + map_reg_branch.append(Linear(self.embed_dims, self.map_code_size)) + map_reg_branch = nn.Sequential(*map_reg_branch) + + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
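+        # One prediction head is created per decoder layer; when as_two_stage is
+        # enabled an extra head is added for the encoder proposals
+        # (num_pred = num_layers + 1 in that case).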
+ num_decoder_layers = 1 + num_map_decoder_layers = 1 + if self.transformer.decoder is not None: + num_decoder_layers = self.transformer.decoder.num_layers + if self.transformer.map_decoder is not None: + num_map_decoder_layers = self.transformer.map_decoder.num_layers + num_motion_decoder_layers = 1 + num_pred = (num_decoder_layers + 1) if \ + self.as_two_stage else num_decoder_layers + motion_num_pred = (num_motion_decoder_layers + 1) if \ + self.as_two_stage else num_motion_decoder_layers + map_num_pred = (num_map_decoder_layers + 1) if \ + self.as_two_stage else num_map_decoder_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(cls_branch, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + self.traj_branches = _get_clones(traj_branch, motion_num_pred) + self.traj_cls_branches = _get_clones(traj_cls_branch, motion_num_pred) + self.map_cls_branches = _get_clones(map_cls_branch, map_num_pred) + self.map_reg_branches = _get_clones(map_reg_branch, map_num_pred) + else: + self.cls_branches = nn.ModuleList( + [cls_branch for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + self.traj_branches = nn.ModuleList( + [traj_branch for _ in range(motion_num_pred)]) + self.traj_cls_branches = nn.ModuleList( + [traj_cls_branch for _ in range(motion_num_pred)]) + self.map_cls_branches = nn.ModuleList( + [map_cls_branch for _ in range(map_num_pred)]) + self.map_reg_branches = nn.ModuleList( + [map_reg_branch for _ in range(map_num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + if self.map_query_embed_type == 'all_pts': + self.map_query_embedding = nn.Embedding(self.map_num_query, + self.embed_dims * 2) + elif self.map_query_embed_type == 'instance_pts': + self.map_query_embedding = None + self.map_instance_embedding = nn.Embedding(self.map_num_vec, self.embed_dims * 2) + self.map_pts_embedding = nn.Embedding(self.map_num_pts_per_vec, self.embed_dims * 2) + + if self.motion_decoder is not None: + self.motion_decoder = build_transformer_layer_sequence(self.motion_decoder) + self.motion_mode_query = nn.Embedding(self.fut_mode, self.embed_dims) + self.motion_mode_query.weight.requires_grad = True + if self.use_pe: + self.pos_mlp_sa = nn.Linear(2, self.embed_dims) + else: + raise NotImplementedError('Not implement yet') + + if self.motion_map_decoder is not None: + self.lane_encoder = LaneNet(256, 128, 3) + self.motion_map_decoder = build_transformer_layer_sequence(self.motion_map_decoder) + if self.use_pe: + self.pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_his_encoder is not None: + self.ego_his_encoder = LaneNet(2, self.embed_dims//2, 3) + else: + self.ego_query = nn.Embedding(1, self.embed_dims) + + if self.ego_agent_decoder is not None: + self.ego_agent_decoder = build_transformer_layer_sequence(self.ego_agent_decoder) + if self.use_pe: + self.ego_agent_pos_mlp = nn.Linear(2, self.embed_dims) + + if self.ego_map_decoder is not None: + self.ego_map_decoder = build_transformer_layer_sequence(self.ego_map_decoder) + if self.use_pe: + self.ego_map_pos_mlp = nn.Linear(2, self.embed_dims) + + ego_fut_decoder = [] + ego_fut_dec_in_dim = self.embed_dims*2 + len(self.ego_lcf_feat_idx) \ + if self.ego_lcf_feat_idx is not None else self.embed_dims*2 + for _ in range(self.num_reg_fcs): + ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) + 
ego_fut_decoder.append(nn.ReLU()) + ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode*self.fut_ts*2)) + self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) + + self.agent_fus_mlp = nn.Sequential( + nn.Linear(self.fut_mode*2*self.embed_dims, self.embed_dims, bias=True), + nn.LayerNorm(self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims, bias=True)) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_map_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.map_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + if self.loss_traj_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.traj_cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + # for m in self.map_reg_branches: + # constant_init(m[-1], 0, bias=0) + # nn.init.constant_(self.map_reg_branches[0][-1].bias.data[2:], 0.) + if self.motion_decoder is not None: + for p in self.motion_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + nn.init.orthogonal_(self.motion_mode_query.weight) + if self.use_pe: + xavier_init(self.pos_mlp_sa, distribution='uniform', bias=0.) + if self.motion_map_decoder is not None: + for p in self.motion_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for p in self.lane_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.use_pe: + xavier_init(self.pos_mlp, distribution='uniform', bias=0.) + if self.ego_his_encoder is not None: + for p in self.ego_his_encoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_agent_decoder is not None: + for p in self.ego_agent_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + if self.ego_map_decoder is not None: + for p in self.ego_map_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + # @auto_fp16(apply_to=('mlvl_feats')) + @force_fp32(apply_to=('mlvl_feats', 'prev_bev')) + def forward(self, + mlvl_feats, + img_metas, + prev_bev=None, + only_bev=False, + ego_his_trajs=None, + ego_lcf_feat=None, + ): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
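+            outs (dict): In practice a single dict is returned that, besides the
+                two tensors above, also contains the map predictions
+                (``map_all_cls_scores``, ``map_all_bbox_preds``, ``map_all_pts_preds``),
+                the agent trajectory predictions (``all_traj_preds``,
+                ``all_traj_cls_scores``) and the ego planning output
+                (``ego_fut_preds``).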
+ """ + + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + + if self.map_query_embed_type == 'all_pts': + map_query_embeds = self.map_query_embedding.weight.to(dtype) + elif self.map_query_embed_type == 'instance_pts': + map_pts_embeds = self.map_pts_embedding.weight.unsqueeze(0) + map_instance_embeds = self.map_instance_embedding.weight.unsqueeze(1) + map_query_embeds = (map_pts_embeds + map_instance_embeds).flatten(0, 1).to(dtype) + + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + map_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + map_reg_branches=self.map_reg_branches if self.with_box_refine else None, # noqa:E501 + map_cls_branches=self.map_cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references, \ + map_hs, map_init_reference, map_inter_references = outputs + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_coords_bev = [] + outputs_trajs = [] + outputs_trajs_classes = [] + + map_hs = map_hs.permute(0, 2, 1, 3) + map_outputs_classes = [] + map_outputs_coords = [] + map_outputs_pts_coords = [] + map_outputs_coords_bev = [] + + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] = tmp[..., 0:2] + reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + outputs_coords_bev.append(tmp[..., 0:2].clone().detach()) + tmp[..., 4:5] = tmp[..., 4:5] + reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + for lvl in range(map_hs.shape[0]): + if lvl == 0: + reference = map_init_reference + else: + reference = map_inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + map_outputs_class = self.map_cls_branches[lvl]( + map_hs[lvl].view(bs,self.map_num_vec, self.map_num_pts_per_vec,-1).mean(2) + ) + tmp = self.map_reg_branches[lvl](map_hs[lvl]) + # TODO: check the shape of reference + assert reference.shape[-1] == 2 + tmp[..., 0:2] += reference[..., 0:2] + tmp = 
tmp.sigmoid() # cx,cy,w,h + map_outputs_coord, map_outputs_pts_coord = self.map_transform_box(tmp) + map_outputs_coords_bev.append(map_outputs_pts_coord.clone().detach()) + map_outputs_classes.append(map_outputs_class) + map_outputs_coords.append(map_outputs_coord) + map_outputs_pts_coords.append(map_outputs_pts_coord) + + if self.motion_decoder is not None: + batch_size, num_agent = outputs_coords_bev[-1].shape[:2] + # motion_query + motion_query = hs[-1].permute(1, 0, 2) # [A, B, D] + mode_query = self.motion_mode_query.weight # [fut_mode, D] + # [M, B, D], M=A*fut_mode + motion_query = (motion_query[:, None, :, :] + mode_query[None, :, None, :]).flatten(0, 1) + if self.use_pe: + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_pos = self.pos_mlp_sa(motion_coords) # [B, A, D] + motion_pos = motion_pos.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + motion_pos = motion_pos.permute(1, 0, 2) # [M, B, D] + else: + motion_pos = None + + if self.motion_det_score is not None: + motion_score = outputs_classes[-1] + max_motion_score = motion_score.max(dim=-1)[0] + invalid_motion_idx = max_motion_score < self.motion_det_score # [B, A] + invalid_motion_idx = invalid_motion_idx.unsqueeze(2).repeat(1, 1, self.fut_mode).flatten(1, 2) + else: + invalid_motion_idx = None + + motion_hs = self.motion_decoder( + query=motion_query, + key=motion_query, + value=motion_query, + query_pos=motion_pos, + key_pos=motion_pos, + key_padding_mask=invalid_motion_idx) + + if self.motion_map_decoder is not None: + # map preprocess + motion_coords = outputs_coords_bev[-1] # [B, A, 2] + motion_coords = motion_coords.unsqueeze(2).repeat(1, 1, self.fut_mode, 1).flatten(1, 2) + map_query = map_hs[-1].view(batch_size, self.map_num_vec, self.map_num_pts_per_vec, -1) + map_query = self.lane_encoder(map_query) # [B, P, pts, D] -> [B, P, D] + map_score = map_outputs_classes[-1] + map_pos = map_outputs_coords_bev[-1] + map_query, map_pos, key_padding_mask = self.select_and_pad_pred_map( + motion_coords, map_query, map_score, map_pos, + map_thresh=self.map_thresh, dis_thresh=self.dis_thresh, + pe_normalization=self.pe_normalization, use_fix_pad=True) + map_query = map_query.permute(1, 0, 2) # [P, B*M, D] + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + # position encoding + if self.use_pe: + (num_query, batch) = ca_motion_query.shape[:2] + motion_pos = torch.zeros((num_query, batch, 2), device=motion_hs.device) + motion_pos = self.pos_mlp(motion_pos) + map_pos = map_pos.permute(1, 0, 2) + map_pos = self.pos_mlp(map_pos) + else: + motion_pos, map_pos = None, None + + ca_motion_query = self.motion_map_decoder( + query=ca_motion_query, + key=map_query, + value=map_query, + query_pos=motion_pos, + key_pos=map_pos, + key_padding_mask=key_padding_mask) + else: + ca_motion_query = motion_hs.permute(1, 0, 2).flatten(0, 1).unsqueeze(0) + + batch_size = outputs_coords_bev[-1].shape[0] + motion_hs = motion_hs.permute(1, 0, 2).unflatten( + dim=1, sizes=(num_agent, self.fut_mode) + ) + ca_motion_query = ca_motion_query.squeeze(0).unflatten( + dim=0, sizes=(batch_size, num_agent, self.fut_mode) + ) + motion_hs = torch.cat([motion_hs, ca_motion_query], dim=-1) # [B, A, fut_mode, 2D] + else: + raise NotImplementedError('Not implement yet') + + outputs_traj = self.traj_branches[0](motion_hs) + outputs_trajs.append(outputs_traj) + outputs_traj_class = self.traj_cls_branches[0](motion_hs) + outputs_trajs_classes.append(outputs_traj_class.squeeze(-1)) + (batch, num_agent) = motion_hs.shape[:2] + + 
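+        # Stack the per-decoder-layer predictions so every output below gains a
+        # leading num_layers dimension for loss computation and post-processing.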
map_outputs_classes = torch.stack(map_outputs_classes) + map_outputs_coords = torch.stack(map_outputs_coords) + map_outputs_pts_coords = torch.stack(map_outputs_pts_coords) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_trajs = torch.stack(outputs_trajs) + outputs_trajs_classes = torch.stack(outputs_trajs_classes) + + # planning + (batch, num_agent) = motion_hs.shape[:2] + if self.ego_his_encoder is not None: + ego_his_feats = self.ego_his_encoder(ego_his_trajs) # [B, 1, dim] + else: + ego_his_feats = self.ego_query.weight.unsqueeze(0).repeat(batch, 1, 1) + # Interaction + ego_query = ego_his_feats + ego_pos = torch.zeros((batch, 1, 2), device=ego_query.device) + ego_pos_emb = self.ego_agent_pos_mlp(ego_pos) + agent_conf = outputs_classes[-1] + agent_query = motion_hs.reshape(batch, num_agent, -1) + agent_query = self.agent_fus_mlp(agent_query) # [B, A, fut_mode, 2*D] -> [B, A, D] + agent_pos = outputs_coords_bev[-1] + agent_query, agent_pos, agent_mask = self.select_and_pad_query( + agent_query, agent_pos, agent_conf, + score_thresh=self.query_thresh, use_fix_pad=self.query_use_fix_pad + ) + agent_pos_emb = self.ego_agent_pos_mlp(agent_pos) + # ego <-> agent interaction + ego_agent_query = self.ego_agent_decoder( + query=ego_query.permute(1, 0, 2), + key=agent_query.permute(1, 0, 2), + value=agent_query.permute(1, 0, 2), + query_pos=ego_pos_emb.permute(1, 0, 2), + key_pos=agent_pos_emb.permute(1, 0, 2), + key_padding_mask=agent_mask) + + # ego <-> map interaction + ego_pos = torch.zeros((batch, 1, 2), device=agent_query.device) + ego_pos_emb = self.ego_map_pos_mlp(ego_pos) + map_query = map_hs[-1].view(batch_size, self.map_num_vec, self.map_num_pts_per_vec, -1) + map_query = self.lane_encoder(map_query) # [B, P, pts, D] -> [B, P, D] + map_conf = map_outputs_classes[-1] + map_pos = map_outputs_coords_bev[-1] + # use the most close pts pos in each map inst as the inst's pos + batch, num_map = map_pos.shape[:2] + map_dis = torch.sqrt(map_pos[..., 0]**2 + map_pos[..., 1]**2) + min_map_pos_idx = map_dis.argmin(dim=-1).flatten() # [B*P] + min_map_pos = map_pos.flatten(0, 1) # [B*P, pts, 2] + min_map_pos = min_map_pos[range(min_map_pos.shape[0]), min_map_pos_idx] # [B*P, 2] + min_map_pos = min_map_pos.view(batch, num_map, 2) # [B, P, 2] + map_query, map_pos, map_mask = self.select_and_pad_query( + map_query, min_map_pos, map_conf, + score_thresh=self.query_thresh, use_fix_pad=self.query_use_fix_pad + ) + map_pos_emb = self.ego_map_pos_mlp(map_pos) + ego_map_query = self.ego_map_decoder( + query=ego_agent_query, + key=map_query.permute(1, 0, 2), + value=map_query.permute(1, 0, 2), + query_pos=ego_pos_emb.permute(1, 0, 2), + key_pos=map_pos_emb.permute(1, 0, 2), + key_padding_mask=map_mask) + + if self.ego_his_encoder is not None and self.ego_lcf_feat_idx is not None: + ego_feats = torch.cat( + [ego_his_feats, + ego_map_query.permute(1, 0, 2), + ego_lcf_feat.squeeze(1)[..., self.ego_lcf_feat_idx]], + dim=-1 + ) # [B, 1, 2D+2] + elif self.ego_his_encoder is not None and self.ego_lcf_feat_idx is None: + ego_feats = torch.cat( + [ego_his_feats, + ego_map_query.permute(1, 0, 2)], + dim=-1 + ) # [B, 1, 2D] + elif self.ego_his_encoder is None and self.ego_lcf_feat_idx is not None: + ego_feats = torch.cat( + [ego_agent_query.permute(1, 0, 2), + ego_map_query.permute(1, 0, 2), + ego_lcf_feat.squeeze(1)[..., self.ego_lcf_feat_idx]], + dim=-1 + ) # [B, 1, 2D+2] + elif self.ego_his_encoder is None and self.ego_lcf_feat_idx is None: + ego_feats = 
torch.cat( + [ego_agent_query.permute(1, 0, 2), + ego_map_query.permute(1, 0, 2)], + dim=-1 + ) # [B, 1, 2D] + + # Ego prediction + outputs_ego_trajs = self.ego_fut_decoder(ego_feats) + outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], + self.ego_fut_mode, self.fut_ts, 2) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'all_traj_preds': outputs_trajs.repeat(outputs_coords.shape[0], 1, 1, 1, 1), + 'all_traj_cls_scores': outputs_trajs_classes.repeat(outputs_coords.shape[0], 1, 1, 1), + 'map_all_cls_scores': map_outputs_classes, + 'map_all_bbox_preds': map_outputs_coords, + 'map_all_pts_preds': map_outputs_pts_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'map_enc_cls_scores': None, + 'map_enc_bbox_preds': None, + 'map_enc_pts_preds': None, + 'ego_fut_preds': outputs_ego_trajs, + } + + return outs + + def map_transform_box(self, pts, y_first=False): + """ + Converting the points set into bounding box. + + Args: + pts: the input points sets (fields), each points + set (fields) is represented as 2n scalar. + y_first: if y_fisrt=True, the point set is represented as + [y1, x1, y2, x2 ... yn, xn], otherwise the point set is + represented as [x1, y1, x2, y2 ... xn, yn]. + Returns: + The bbox [cx, cy, w, h] transformed from points. + """ + pts_reshape = pts.view(pts.shape[0], self.map_num_vec, + self.map_num_pts_per_vec,2) + pts_y = pts_reshape[:, :, :, 0] if y_first else pts_reshape[:, :, :, 1] + pts_x = pts_reshape[:, :, :, 1] if y_first else pts_reshape[:, :, :, 0] + if self.map_transform_method == 'minmax': + # import pdb;pdb.set_trace() + + xmin = pts_x.min(dim=2, keepdim=True)[0] + xmax = pts_x.max(dim=2, keepdim=True)[0] + ymin = pts_y.min(dim=2, keepdim=True)[0] + ymax = pts_y.max(dim=2, keepdim=True)[0] + bbox = torch.cat([xmin, ymin, xmax, ymax], dim=2) + bbox = bbox_xyxy_to_cxcywh(bbox) + else: + raise NotImplementedError + return bbox, pts_reshape + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_attr_labels, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 10]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 9) in [x,y,z,w,l,h,yaw,vx,vy] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
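+
+        Note:
+            In addition to the DETR-style box targets, this also returns the
+            per-query future-trajectory targets. ``gt_attr_labels`` packs, for
+            each GT box, the future xy offsets in its first ``fut_ts*2``
+            columns and the per-timestep validity mask in the following
+            ``fut_ts`` columns, which is exactly how it is sliced at the top
+            of this function.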
+ """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_fut_trajs = gt_attr_labels[:, :self.fut_ts*2] + gt_fut_masks = gt_attr_labels[:, self.fut_ts*2:self.fut_ts*3] + gt_bbox_c = gt_bboxes.shape[-1] + num_gt_bbox, gt_traj_c = gt_fut_trajs.shape + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_bbox_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # trajs targets + traj_targets = torch.zeros((num_bboxes, gt_traj_c), dtype=torch.float32, device=bbox_pred.device) + traj_weights = torch.zeros_like(traj_targets) + traj_targets[pos_inds] = gt_fut_trajs[sampling_result.pos_assigned_gt_inds] + traj_weights[pos_inds] = 1.0 + + # Filter out invalid fut trajs + traj_masks = torch.zeros_like(traj_targets) # [num_bboxes, fut_ts*2] + gt_fut_masks = gt_fut_masks.unsqueeze(-1).repeat(1, 1, 2).view(num_gt_bbox, -1) # [num_gt_bbox, fut_ts*2] + traj_masks[pos_inds] = gt_fut_masks[sampling_result.pos_assigned_gt_inds] + traj_weights = traj_weights * traj_masks + + # Extra future timestamp mask for controlling pred horizon + fut_ts_mask = torch.zeros((num_bboxes, self.fut_ts, 2), + dtype=torch.float32, device=bbox_pred.device) + fut_ts_mask[:, :self.valid_fut_ts, :] = 1.0 + fut_ts_mask = fut_ts_mask.view(num_bboxes, -1) + traj_weights = traj_weights * fut_ts_mask + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + + return ( + labels, label_weights, bbox_targets, bbox_weights, traj_targets, + traj_weights, traj_masks.view(-1, self.fut_ts, 2)[..., 0], + pos_inds, neg_inds + ) + + def _map_get_target_single(self, + cls_score, + bbox_pred, + pts_pred, + gt_labels, + gt_bboxes, + gt_shifts_pts, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. 
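+
+        Note:
+            Besides the box targets, this also returns ``pts_targets`` and
+            ``pts_weights``: for each positive query the matched polyline is
+            taken from ``gt_shifts_pts`` at the shift (permutation) index
+            chosen by the map assigner (``order_index``).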
+ """ + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + assign_result, order_index = self.map_assigner.assign(bbox_pred, cls_score, pts_pred, + gt_bboxes, gt_labels, gt_shifts_pts, + gt_bboxes_ignore) + + sampling_result = self.map_sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.map_num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + # pts targets + if order_index is None: + assigned_shift = gt_labels[sampling_result.pos_assigned_gt_inds] + else: + assigned_shift = order_index[sampling_result.pos_inds, sampling_result.pos_assigned_gt_inds] + pts_targets = pts_pred.new_zeros((pts_pred.size(0), + pts_pred.size(1), pts_pred.size(2))) + pts_weights = torch.zeros_like(pts_targets) + pts_weights[pos_inds] = 1.0 + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + pts_targets[pos_inds] = gt_shifts_pts[sampling_result.pos_assigned_gt_inds,assigned_shift,:,:] + return (labels, label_weights, bbox_targets, bbox_weights, + pts_targets, pts_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' 
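+        # `multi_apply` below maps `self._get_target_single` over the per-image
+        # lists and regroups the per-image result tuples into tuples of lists,
+        # roughly:
+        #     results = map(self._get_target_single, cls_scores_list, ...)
+        #     labels_list, ..., neg_inds_list = map(list, zip(*results))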
+ num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, traj_targets_list, traj_weights_list, + gt_fut_masks_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_attr_labels_list, gt_bboxes_ignore_list + ) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, num_total_pos, num_total_neg) + + def map_get_targets(self, + cls_scores_list, + bbox_preds_list, + pts_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_shifts_pts_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + pos_inds_list, neg_inds_list) = multi_apply( + self._map_get_target_single, cls_scores_list, bbox_preds_list,pts_preds_list, + gt_labels_list, gt_bboxes_list, gt_shifts_pts_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) + + def loss_planning(self, + ego_fut_preds, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + lane_preds, + lane_score_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds): + """"Loss function for ego vehicle planning. 
+ Args: + ego_fut_preds (Tensor): [B, ego_fut_mode, fut_ts, 2] + ego_fut_gt (Tensor): [B, fut_ts, 2] + ego_fut_masks (Tensor): [B, fut_ts] + ego_fut_cmd (Tensor): [B, ego_fut_mode] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_score_preds (Tensor): [B, num_agent, 10] + agent_fut_cls_scores (Tensor): [B, num_agent, fut_mode] + Returns: + loss_plan_reg (Tensor): planning reg loss. + loss_plan_bound (Tensor): planning map boundary constraint loss. + loss_plan_col (Tensor): planning col constraint loss. + loss_plan_dir (Tensor): planning directional constraint loss. + """ + + ego_fut_gt = ego_fut_gt.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) + loss_plan_l1_weight = ego_fut_cmd[..., None, None] * ego_fut_masks[:, None, :, None] + loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) + + loss_plan_l1 = self.loss_plan_reg( + ego_fut_preds, + ego_fut_gt, + loss_plan_l1_weight + ) + + loss_plan_bound = self.loss_plan_bound( + ego_fut_preds[ego_fut_cmd==1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + loss_plan_col = self.loss_plan_col( + ego_fut_preds[ego_fut_cmd==1], + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=ego_fut_masks[:, :, None].repeat(1, 1, 2) + ) + + loss_plan_dir = self.loss_plan_dir( + ego_fut_preds[ego_fut_cmd==1], + lane_preds, + lane_score_preds, + weight=ego_fut_masks + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_plan_l1 = torch.nan_to_num(loss_plan_l1) + loss_plan_bound = torch.nan_to_num(loss_plan_bound) + loss_plan_col = torch.nan_to_num(loss_plan_col) + loss_plan_dir = torch.nan_to_num(loss_plan_dir) + + loss_plan_dict = dict() + loss_plan_dict['loss_plan_reg'] = loss_plan_l1 + loss_plan_dict['loss_plan_bound'] = loss_plan_bound + loss_plan_dict['loss_plan_col'] = loss_plan_col + loss_plan_dict['loss_plan_dir'] = loss_plan_dir + + return loss_plan_dict + + def loss_single(self, + cls_scores, + bbox_preds, + traj_preds, + traj_cls_preds, + gt_bboxes_list, + gt_labels_list, + gt_attr_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
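+
+        Note:
+            In this head the per-layer losses are returned as the tuple
+            ``(loss_cls, loss_bbox, loss_traj, loss_traj_cls)``; the two
+            trajectory terms supervise the multi-modal motion forecasts
+            (see the return statement below).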
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_attr_labels_list, gt_bboxes_ignore_list) + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + traj_targets_list, traj_weights_list, gt_fut_masks_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + traj_targets = torch.cat(traj_targets_list, 0) + traj_weights = torch.cat(traj_weights_list, 0) + gt_fut_masks = torch.cat(gt_fut_masks_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls(cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], + normalized_bbox_targets[isnotnan, :10], + bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + + # traj regression loss + best_traj_preds = self.get_best_fut_preds( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), gt_fut_masks) + + neg_inds = (bbox_weights[:, 0] == 0) + traj_labels = self.get_traj_cls_target( + traj_preds.reshape(-1, self.fut_mode, self.fut_ts, 2), + traj_targets.reshape(-1, self.fut_ts, 2), + gt_fut_masks, neg_inds) + + loss_traj = self.loss_traj( + best_traj_preds[isnotnan], + traj_targets[isnotnan], + traj_weights[isnotnan], + avg_factor=num_total_pos) + + if self.use_traj_lr_warmup: + loss_scale_factor = get_traj_warmup_loss_weight(self.epoch, self.tot_epoch) + loss_traj = loss_scale_factor * loss_traj + + # traj classification loss + traj_cls_scores = traj_cls_preds.reshape(-1, self.fut_mode) + # construct weighted avg_factor to match with the official DETR repo + traj_cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.traj_bg_cls_weight + if self.sync_cls_avg_factor: + traj_cls_avg_factor = reduce_mean( + traj_cls_scores.new_tensor([traj_cls_avg_factor])) + + traj_cls_avg_factor = max(traj_cls_avg_factor, 1) + loss_traj_cls = self.loss_traj_cls( + traj_cls_scores, traj_labels, label_weights, avg_factor=traj_cls_avg_factor + ) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_traj = torch.nan_to_num(loss_traj) + loss_traj_cls = torch.nan_to_num(loss_traj_cls) + + return loss_cls, loss_bbox, loss_traj, loss_traj_cls + + def get_best_fut_preds(self, + traj_preds, + 
traj_targets,
+                           gt_fut_masks):
+        """Choose the best prediction among all modes.
+        Args:
+            traj_preds (Tensor): Multi-modal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2).
+            traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2).
+            gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts).
+
+        Returns:
+            best_traj_preds (Tensor): best traj preds (minimum final displacement error w.r.t. gt)
+                with shape (num_box_preds, fut_ts*2).
+        """
+
+        cum_traj_preds = traj_preds.cumsum(dim=-2)
+        cum_traj_targets = traj_targets.cumsum(dim=-2)
+
+        # Get min pred mode indices.
+        # (num_box_preds, fut_mode, fut_ts)
+        dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1)
+        dist = dist * gt_fut_masks[:, None, :]
+        dist = dist[..., -1]
+        # NaN * 0 is still NaN, so zero invalid entries explicitly.
+        dist[torch.isnan(dist)] = 0
+        min_mode_idxs = torch.argmin(dist, dim=-1).tolist()
+        box_idxs = torch.arange(traj_preds.shape[0]).tolist()
+        best_traj_preds = traj_preds[box_idxs, min_mode_idxs, :, :].reshape(-1, self.fut_ts*2)
+
+        return best_traj_preds
+
+    def get_traj_cls_target(self,
+                            traj_preds,
+                            traj_targets,
+                            gt_fut_masks,
+                            neg_inds):
+        """Get trajectory mode classification target.
+        Args:
+            traj_preds (Tensor): Multi-modal traj preds with shape (num_box_preds, fut_mode, fut_ts, 2).
+            traj_targets (Tensor): Ground truth traj for each pred box with shape (num_box_preds, fut_ts, 2).
+            gt_fut_masks (Tensor): Ground truth traj mask with shape (num_box_preds, fut_ts).
+            neg_inds (Tensor): Negative indices with shape (num_box_preds,)
+
+        Returns:
+            traj_labels (Tensor): traj cls labels (num_box_preds,).
+        """
+
+        cum_traj_preds = traj_preds.cumsum(dim=-2)
+        cum_traj_targets = traj_targets.cumsum(dim=-2)
+
+        # Get min pred mode indices.
+        # (num_box_preds, fut_mode, fut_ts)
+        dist = torch.linalg.norm(cum_traj_targets[:, None, :, :] - cum_traj_preds, dim=-1)
+        dist = dist * gt_fut_masks[:, None, :]
+        dist = dist[..., -1]
+        # NaN * 0 is still NaN, so zero invalid entries explicitly.
+        dist[torch.isnan(dist)] = 0
+        traj_labels = torch.argmin(dist, dim=-1)
+        traj_labels[neg_inds] = self.fut_mode
+
+        return traj_labels
+
+    def map_loss_single(self,
+                        cls_scores,
+                        bbox_preds,
+                        pts_preds,
+                        gt_bboxes_list,
+                        gt_labels_list,
+                        gt_shifts_pts_list,
+                        gt_bboxes_ignore_list=None):
+        """Loss function for map outputs from a single decoder layer of a single
+        feature level.
+        Args:
+            cls_scores (Tensor): Box score logits from a single decoder layer
+                for all images. Shape [bs, num_query, cls_out_channels].
+            bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+                for all images, with normalized coordinate (cx, cy, w, h) and
+                shape [bs, num_query, 4].
+            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+            gt_labels_list (list[Tensor]): Ground truth class indices for each
+                image with shape (num_gts, ).
+            gt_shifts_pts_list (list[Tensor]): Ground truth pts for each image
+                with shape (num_gts, fixed_num, 2) in [x, y] format.
+            gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+                boxes which can be ignored for each image. Default None.
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components for outputs from
+                a single decoder layer.
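+
+        Note:
+            The per-layer map losses are returned as the tuple
+            ``(loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir)``. When
+            ``map_num_pts_per_vec`` differs from ``map_num_pts_per_gt_vec``,
+            the predicted polylines are linearly resampled to the GT
+            resolution before the point-wise loss is computed.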
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + pts_preds_list = [pts_preds[i] for i in range(num_imgs)] + + cls_reg_targets = self.map_get_targets(cls_scores_list, bbox_preds_list,pts_preds_list, + gt_bboxes_list, gt_labels_list,gt_shifts_pts_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + pts_targets_list, pts_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + pts_targets = torch.cat(pts_targets_list, 0) + pts_weights = torch.cat(pts_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.map_cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.map_bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_map_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_2d_bbox(bbox_targets, self.pc_range) + # normalized_bbox_targets = bbox_targets + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.map_code_weights + + loss_bbox = self.loss_map_bbox( + bbox_preds[isnotnan, :4], + normalized_bbox_targets[isnotnan,:4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + # regression pts CD loss + # num_samples, num_order, num_pts, num_coords + normalized_pts_targets = normalize_2d_pts(pts_targets, self.pc_range) + + # num_samples, num_pts, num_coords + pts_preds = pts_preds.reshape(-1, pts_preds.size(-2), pts_preds.size(-1)) + if self.map_num_pts_per_vec != self.map_num_pts_per_gt_vec: + pts_preds = pts_preds.permute(0,2,1) + pts_preds = F.interpolate(pts_preds, size=(self.map_num_pts_per_gt_vec), mode='linear', + align_corners=True) + pts_preds = pts_preds.permute(0,2,1).contiguous() + + loss_pts = self.loss_map_pts( + pts_preds[isnotnan,:,:], + normalized_pts_targets[isnotnan,:,:], + pts_weights[isnotnan,:,:], + avg_factor=num_total_pos) + + dir_weights = pts_weights[:, :-self.map_dir_interval,0] + denormed_pts_preds = denormalize_2d_pts(pts_preds, self.pc_range) + denormed_pts_preds_dir = denormed_pts_preds[:,self.map_dir_interval:,:] - \ + denormed_pts_preds[:,:-self.map_dir_interval,:] + pts_targets_dir = pts_targets[:, self.map_dir_interval:,:] - pts_targets[:,:-self.map_dir_interval,:] + + loss_dir = self.loss_map_dir( + denormed_pts_preds_dir[isnotnan,:,:], + pts_targets_dir[isnotnan,:,:], + dir_weights[isnotnan,:], + avg_factor=num_total_pos) + + bboxes = denormalize_2d_bbox(bbox_preds, self.pc_range) + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_map_iou( + bboxes[isnotnan, :4], + bbox_targets[isnotnan, :4], + bbox_weights[isnotnan, :4], + avg_factor=num_total_pos) + + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + 
loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + loss_iou = torch.nan_to_num(loss_iou) + loss_pts = torch.nan_to_num(loss_pts) + loss_dir = torch.nan_to_num(loss_dir) + + return loss_cls, loss_bbox, loss_iou, loss_pts, loss_dir + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + map_gt_bboxes_list, + map_gt_labels_list, + preds_dicts, + ego_fut_gt, + ego_fut_masks, + ego_fut_cmd, + gt_attr_labels, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
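+        # Supervision layout in this function:
+        #   1) per-decoder-layer detection + motion losses from all_cls_scores /
+        #      all_bbox_preds / all_traj_preds / all_traj_cls_scores,
+        #   2) per-decoder-layer map losses from the map_all_* predictions,
+        #   3) a single planning loss from the last-layer agent/map outputs and
+        #      ego_fut_preds via self.loss_planning.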
+ + map_gt_vecs_list = copy.deepcopy(map_gt_bboxes_list) + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + all_traj_preds = preds_dicts['all_traj_preds'] + all_traj_cls_scores = preds_dicts['all_traj_cls_scores'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + map_all_cls_scores = preds_dicts['map_all_cls_scores'] + map_all_bbox_preds = preds_dicts['map_all_bbox_preds'] + map_all_pts_preds = preds_dicts['map_all_pts_preds'] + map_enc_cls_scores = preds_dicts['map_enc_cls_scores'] + map_enc_bbox_preds = preds_dicts['map_enc_bbox_preds'] + map_enc_pts_preds = preds_dicts['map_enc_pts_preds'] + ego_fut_preds = preds_dicts['ego_fut_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_attr_labels_list = [gt_attr_labels for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox, loss_traj, loss_traj_cls = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, all_traj_preds, + all_traj_cls_scores, all_gt_bboxes_list, all_gt_labels_list, + all_gt_attr_labels_list, all_gt_bboxes_ignore_list) + + + num_dec_layers = len(map_all_cls_scores) + device = map_gt_labels_list[0].device + + map_gt_bboxes_list = [ + map_gt_bboxes.bbox.to(device) for map_gt_bboxes in map_gt_vecs_list] + map_gt_pts_list = [ + map_gt_bboxes.fixed_num_sampled_points.to(device) for map_gt_bboxes in map_gt_vecs_list] + if self.map_gt_shift_pts_pattern == 'v0': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v1': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v1.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v2': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v2.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v3': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v3.to(device) for gt_bboxes in map_gt_vecs_list] + elif self.map_gt_shift_pts_pattern == 'v4': + map_gt_shifts_pts_list = [ + gt_bboxes.shift_fixed_num_sampled_points_v4.to(device) for gt_bboxes in map_gt_vecs_list] + else: + raise NotImplementedError + map_all_gt_bboxes_list = [map_gt_bboxes_list for _ in range(num_dec_layers)] + map_all_gt_labels_list = [map_gt_labels_list for _ in range(num_dec_layers)] + map_all_gt_pts_list = [map_gt_pts_list for _ in range(num_dec_layers)] + map_all_gt_shifts_pts_list = [map_gt_shifts_pts_list for _ in range(num_dec_layers)] + map_all_gt_bboxes_ignore_list = [ + map_gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + map_losses_cls, map_losses_bbox, map_losses_iou, \ + map_losses_pts, map_losses_dir = multi_apply( + self.map_loss_single, map_all_cls_scores, map_all_bbox_preds, + map_all_pts_preds, map_all_gt_bboxes_list, map_all_gt_labels_list, + map_all_gt_shifts_pts_list, map_all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + 
loss_dict['loss_traj'] = loss_traj[-1] + loss_dict['loss_traj_cls'] = loss_traj_cls[-1] + # loss from the last decoder layer + loss_dict['loss_map_cls'] = map_losses_cls[-1] + loss_dict['loss_map_bbox'] = map_losses_bbox[-1] + loss_dict['loss_map_iou'] = map_losses_iou[-1] + loss_dict['loss_map_pts'] = map_losses_pts[-1] + loss_dict['loss_map_dir'] = map_losses_dir[-1] + + # Planning Loss + ego_fut_gt = ego_fut_gt.squeeze(1) + ego_fut_masks = ego_fut_masks.squeeze(1).squeeze(1) + ego_fut_cmd = ego_fut_cmd.squeeze(1).squeeze(1) + + batch, num_agent = all_traj_preds[-1].shape[:2] + agent_fut_preds = all_traj_preds[-1].view(batch, num_agent, self.fut_mode, self.fut_ts, 2) + agent_fut_cls_preds = all_traj_cls_scores[-1].view(batch, num_agent, self.fut_mode) + loss_plan_input = [ego_fut_preds, ego_fut_gt, ego_fut_masks, ego_fut_cmd, + map_all_pts_preds[-1], map_all_cls_scores[-1].sigmoid(), + all_bbox_preds[-1][..., 0:2], agent_fut_preds, + all_cls_scores[-1].sigmoid(), agent_fut_cls_preds.sigmoid()] + + loss_planning_dict = self.loss_planning(*loss_plan_input) + loss_dict['loss_plan_reg'] = loss_planning_dict['loss_plan_reg'] + loss_dict['loss_plan_bound'] = loss_planning_dict['loss_plan_bound'] + loss_dict['loss_plan_col'] = loss_planning_dict['loss_plan_col'] + loss_dict['loss_plan_dir'] = loss_planning_dict['loss_plan_dir'] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + # loss from other decoder layers + num_dec_layer = 0 + for map_loss_cls_i, map_loss_bbox_i, map_loss_iou_i, map_loss_pts_i, map_loss_dir_i in zip( + map_losses_cls[:-1], + map_losses_bbox[:-1], + map_losses_iou[:-1], + map_losses_pts[:-1], + map_losses_dir[:-1] + ): + loss_dict[f'd{num_dec_layer}.loss_map_cls'] = map_loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_map_bbox'] = map_loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_map_iou'] = map_loss_iou_i + loss_dict[f'd{num_dec_layer}.loss_map_pts'] = map_loss_pts_i + loss_dict[f'd{num_dec_layer}.loss_map_dir'] = map_loss_dir_i + num_dec_layer += 1 + + # loss of proposal generated from encode feature map. 
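+        # Note: the forward of this head fills 'enc_cls_scores' and
+        # 'map_enc_cls_scores' with None in `outs`, so the two encoder-proposal
+        # branches below are skipped unless a two-stage variant provides them.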
+ if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, + gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + if map_enc_cls_scores is not None: + map_binary_labels_list = [ + torch.zeros_like(map_gt_labels_list[i]) + for i in range(len(map_all_gt_labels_list)) + ] + # TODO bug here, but we dont care enc_loss now + map_enc_loss_cls, map_enc_loss_bbox, map_enc_loss_iou, \ + map_enc_loss_pts, map_enc_loss_dir = \ + self.map_loss_single( + map_enc_cls_scores, map_enc_bbox_preds, + map_enc_pts_preds, map_gt_bboxes_list, + map_binary_labels_list, map_gt_pts_list, + map_gt_bboxes_ignore + ) + loss_dict['enc_loss_map_cls'] = map_enc_loss_cls + loss_dict['enc_loss_map_bbox'] = map_enc_loss_bbox + loss_dict['enc_loss_map_iou'] = map_enc_loss_iou + loss_dict['enc_loss_map_pts'] = map_enc_loss_pts + loss_dict['enc_loss_map_dir'] = map_enc_loss_dir + + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. + """ + + det_preds_dicts = self.bbox_coder.decode(preds_dicts) + # map_bboxes: xmin, ymin, xmax, ymax + map_preds_dicts = self.map_bbox_coder.decode(preds_dicts) + + num_samples = len(det_preds_dicts) + assert len(det_preds_dicts) == len(map_preds_dicts), \ + 'len(preds_dict) should be equal to len(map_preds_dicts)' + ret_list = [] + for i in range(num_samples): + preds = det_preds_dicts[i] + bboxes = preds['bboxes'] + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + trajs = preds['trajs'] + + map_preds = map_preds_dicts[i] + map_bboxes = map_preds['map_bboxes'] + map_scores = map_preds['map_scores'] + map_labels = map_preds['map_labels'] + map_pts = map_preds['map_pts'] + + ret_list.append([bboxes, scores, labels, trajs, map_bboxes, + map_scores, map_labels, map_pts]) + + return ret_list + + def select_and_pad_pred_map( + self, + motion_pos, + map_query, + map_score, + map_pos, + map_thresh=0.5, + dis_thresh=None, + pe_normalization=True, + use_fix_pad=False + ): + """select_and_pad_pred_map. + Args: + motion_pos: [B, A, 2] + map_query: [B, P, D]. + map_score: [B, P, 3]. + map_pos: [B, P, pts, 2]. + map_thresh: map confidence threshold for filtering low-confidence preds + dis_thresh: distance threshold for masking far maps for each agent in cross-attn + use_fix_pad: always pad one lane instance for each batch + Returns: + selected_map_query: [B*A, P1(+1), D], P1 is the max inst num after filter and pad. 
+ selected_map_pos: [B*A, P1(+1), 2] + selected_padding_mask: [B*A, P1(+1)] + """ + + if dis_thresh is None: + raise NotImplementedError('Not implement yet') + + # use the most close pts pos in each map inst as the inst's pos + batch, num_map = map_pos.shape[:2] + map_dis = torch.sqrt(map_pos[..., 0]**2 + map_pos[..., 1]**2) + min_map_pos_idx = map_dis.argmin(dim=-1).flatten() # [B*P] + min_map_pos = map_pos.flatten(0, 1) # [B*P, pts, 2] + min_map_pos = min_map_pos[range(min_map_pos.shape[0]), min_map_pos_idx] # [B*P, 2] + min_map_pos = min_map_pos.view(batch, num_map, 2) # [B, P, 2] + + # select & pad map vectors for different batch using map_thresh + map_score = map_score.sigmoid() + map_max_score = map_score.max(dim=-1)[0] + map_idx = map_max_score > map_thresh + batch_max_pnum = 0 + for i in range(map_score.shape[0]): + pnum = map_idx[i].sum() + if pnum > batch_max_pnum: + batch_max_pnum = pnum + + selected_map_query, selected_map_pos, selected_padding_mask = [], [], [] + for i in range(map_score.shape[0]): + dim = map_query.shape[-1] + valid_pnum = map_idx[i].sum() + valid_map_query = map_query[i, map_idx[i]] + valid_map_pos = min_map_pos[i, map_idx[i]] + pad_pnum = batch_max_pnum - valid_pnum + padding_mask = torch.tensor([False], device=map_score.device).repeat(batch_max_pnum) + if pad_pnum != 0: + valid_map_query = torch.cat([valid_map_query, torch.zeros((pad_pnum, dim), device=map_score.device)], dim=0) + valid_map_pos = torch.cat([valid_map_pos, torch.zeros((pad_pnum, 2), device=map_score.device)], dim=0) + padding_mask[valid_pnum:] = True + selected_map_query.append(valid_map_query) + selected_map_pos.append(valid_map_pos) + selected_padding_mask.append(padding_mask) + + selected_map_query = torch.stack(selected_map_query, dim=0) + selected_map_pos = torch.stack(selected_map_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + # generate different pe for map vectors for each agent + num_agent = motion_pos.shape[1] + selected_map_query = selected_map_query.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, D] + selected_map_pos = selected_map_pos.unsqueeze(1).repeat(1, num_agent, 1, 1) # [B, A, max_P, 2] + selected_padding_mask = selected_padding_mask.unsqueeze(1).repeat(1, num_agent, 1) # [B, A, max_P] + # move lane to per-car coords system + selected_map_dist = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + if pe_normalization: + selected_map_pos = selected_map_pos - motion_pos[:, :, None, :] # [B, A, max_P, 2] + + # filter far map inst for each agent + map_dis = torch.sqrt(selected_map_dist[..., 0]**2 + selected_map_dist[..., 1]**2) + valid_map_inst = (map_dis <= dis_thresh) # [B, A, max_P] + invalid_map_inst = (valid_map_inst == False) + selected_padding_mask = selected_padding_mask + invalid_map_inst + + selected_map_query = selected_map_query.flatten(0, 1) + selected_map_pos = selected_map_pos.flatten(0, 1) + selected_padding_mask = selected_padding_mask.flatten(0, 1) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_map_query.shape[-1] + if use_fix_pad: + pad_map_query = torch.zeros((num_batch, 1, feat_dim), device=selected_map_query.device) + pad_map_pos = torch.ones((num_batch, 1, 2), device=selected_map_pos.device) + pad_lane_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_map_query = torch.cat([selected_map_query, pad_map_query], dim=1) + selected_map_pos = torch.cat([selected_map_pos, pad_map_pos], dim=1) + selected_padding_mask = 
torch.cat([selected_padding_mask, pad_lane_mask], dim=1) + + return selected_map_query, selected_map_pos, selected_padding_mask + + + def select_and_pad_query( + self, + query, + query_pos, + query_score, + score_thresh=0.5, + use_fix_pad=True + ): + """select_and_pad_query. + Args: + query: [B, Q, D]. + query_pos: [B, Q, 2] + query_score: [B, Q, C]. + score_thresh: confidence threshold for filtering low-confidence query + use_fix_pad: always pad one query instance for each batch + Returns: + selected_query: [B, Q', D] + selected_query_pos: [B, Q', 2] + selected_padding_mask: [B, Q'] + """ + + # select & pad query for different batch using score_thresh + query_score = query_score.sigmoid() + query_score = query_score.max(dim=-1)[0] + query_idx = query_score > score_thresh + batch_max_qnum = 0 + for i in range(query_score.shape[0]): + qnum = query_idx[i].sum() + if qnum > batch_max_qnum: + batch_max_qnum = qnum + + selected_query, selected_query_pos, selected_padding_mask = [], [], [] + for i in range(query_score.shape[0]): + dim = query.shape[-1] + valid_qnum = query_idx[i].sum() + valid_query = query[i, query_idx[i]] + valid_query_pos = query_pos[i, query_idx[i]] + pad_qnum = batch_max_qnum - valid_qnum + padding_mask = torch.tensor([False], device=query_score.device).repeat(batch_max_qnum) + if pad_qnum != 0: + valid_query = torch.cat([valid_query, torch.zeros((pad_qnum, dim), device=query_score.device)], dim=0) + valid_query_pos = torch.cat([valid_query_pos, torch.zeros((pad_qnum, 2), device=query_score.device)], dim=0) + padding_mask[valid_qnum:] = True + selected_query.append(valid_query) + selected_query_pos.append(valid_query_pos) + selected_padding_mask.append(padding_mask) + + selected_query = torch.stack(selected_query, dim=0) + selected_query_pos = torch.stack(selected_query_pos, dim=0) + selected_padding_mask = torch.stack(selected_padding_mask, dim=0) + + num_batch = selected_padding_mask.shape[0] + feat_dim = selected_query.shape[-1] + if use_fix_pad: + pad_query = torch.zeros((num_batch, 1, feat_dim), device=selected_query.device) + pad_query_pos = torch.ones((num_batch, 1, 2), device=selected_query_pos.device) + pad_mask = torch.tensor([False], device=selected_padding_mask.device).unsqueeze(0).repeat(num_batch, 1) + selected_query = torch.cat([selected_query, pad_query], dim=1) + selected_query_pos = torch.cat([selected_query_pos, pad_query_pos], dim=1) + selected_padding_mask = torch.cat([selected_padding_mask, pad_mask], dim=1) + + return selected_query, selected_query_pos, selected_padding_mask diff --git a/mmcv/models/dense_heads/__init__.py b/mmcv/models/dense_heads/__init__.py new file mode 100644 index 0000000..6d6e404 --- /dev/null +++ b/mmcv/models/dense_heads/__init__.py @@ -0,0 +1,10 @@ +from .detr_head import DETRHead +from .rpn_head import RPNHead +from .ga_rpn_head import GARPNHead +from .track_head import BEVFormerTrackHead +from .panseg_head import PansegformerHead +from .occ_head import OccHead +from .motion_head import MotionHead +from .planning_head import PlanningHeadSingleMode +from .bevformer_head import BEVFormerHead +from .VAD_head import VADHead \ No newline at end of file diff --git a/mmcv/models/dense_heads/anchor3d_head.py b/mmcv/models/dense_heads/anchor3d_head.py new file mode 100644 index 0000000..4e566fa --- /dev/null +++ b/mmcv/models/dense_heads/anchor3d_head.py @@ -0,0 +1,513 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
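+#
+# A minimal usage sketch (illustrative only; the shapes and config values are
+# placeholders, not a tuned setup):
+#
+#     import torch
+#     from mmcv.models import HEADS
+#     head = HEADS.build(dict(type='Anchor3DHead', num_classes=3,
+#                             in_channels=384, feat_channels=384,
+#                             train_cfg=None, test_cfg=None))
+#     cls_scores, bbox_preds, dir_preds = head([torch.rand(1, 384, 200, 176)])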
+import numpy as np +import torch +from mmcv.utils import force_fp32 +from mmcv.models import BaseModule +from torch import nn as nn + +from mmcv.core import (PseudoSampler, box3d_multiclass_nms, limit_period, + xywhr2xyxyr) +from mmcv.core import (build_assigner, build_bbox_coder, + build_prior_generator, build_sampler, multi_apply) +from mmcv.models import HEADS +from ..builder import build_loss +from .train_mixins import AnchorTrainMixin + +@HEADS.register_module() +class Anchor3DHead(BaseModule, AnchorTrainMixin): + """Anchor head for SECOND/PointPillars/MVXNet/PartA2. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + train_cfg (dict): Train configs. + test_cfg (dict): Test configs. + feat_channels (int): Number of channels of the feature map. + use_direction_classifier (bool): Whether to add a direction classifier. + anchor_generator(dict): Config dict of anchor generator. + assigner_per_size (bool): Whether to do assignment for each separate + anchor size. + assign_per_class (bool): Whether to do assignment for each class. + diff_rad_by_sin (bool): Whether to change the difference into sin + difference for box regression loss. + dir_offset (float | int): The offset of BEV rotation angles. + (TODO: may be moved into box coder) + dir_limit_offset (float | int): The limited range of BEV + rotation angles. (TODO: may be moved into box coder) + bbox_coder (dict): Config dict of box coders. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + loss_dir (dict): Config of direction classifier loss. + """ + + def __init__(self, + num_classes, + in_channels, + train_cfg, + test_cfg, + feat_channels=256, + use_direction_classifier=True, + anchor_generator=dict( + type='Anchor3DRangeGenerator', + range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], + strides=[2], + sizes=[[1.6, 3.9, 1.56]], + rotations=[0, 1.57], + custom_values=[], + reshape_out=False), + assigner_per_size=False, + assign_per_class=False, + diff_rad_by_sin=True, + dir_offset=0, + dir_limit_offset=1, + bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), + loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.diff_rad_by_sin = diff_rad_by_sin + self.use_direction_classifier = use_direction_classifier + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.assigner_per_size = assigner_per_size + self.assign_per_class = assign_per_class + self.dir_offset = dir_offset + self.dir_limit_offset = dir_limit_offset + self.fp16_enabled = False + + # build anchor generator + self.anchor_generator = build_prior_generator(anchor_generator) + # In 3D detection, the anchor stride is connected with anchor size + self.num_anchors = self.anchor_generator.num_base_anchors + # build box coder + self.bbox_coder = build_bbox_coder(bbox_coder) + self.box_code_size = self.bbox_coder.code_size + + # build loss function + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC'] + if not self.use_sigmoid_cls: + self.num_classes += 1 + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_dir = build_loss(loss_dir) + 
self.fp16_enabled = False + + self._init_layers() + self._init_assigner_sampler() + + if init_cfg is None: + self.init_cfg = dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) + + def _init_assigner_sampler(self): + """Initialize the target assigner and sampler of the head.""" + if self.train_cfg is None: + return + + if self.sampling: + self.bbox_sampler = build_sampler(self.train_cfg.sampler) + else: + self.bbox_sampler = PseudoSampler() + if isinstance(self.train_cfg.assigner, dict): + self.bbox_assigner = build_assigner(self.train_cfg.assigner) + elif isinstance(self.train_cfg.assigner, list): + self.bbox_assigner = [ + build_assigner(res) for res in self.train_cfg.assigner + ] + + def _init_layers(self): + """Initialize neural network layers of the head.""" + self.cls_out_channels = self.num_anchors * self.num_classes + self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) + self.conv_reg = nn.Conv2d(self.feat_channels, + self.num_anchors * self.box_code_size, 1) + if self.use_direction_classifier: + self.conv_dir_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * 2, 1) + + def forward_single(self, x): + """Forward function on a single-scale feature map. + + Args: + x (torch.Tensor): Input features. + + Returns: + tuple[torch.Tensor]: Contain score of each class, bbox \ + regression and direction classification predictions. + """ + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + dir_cls_preds = None + if self.use_direction_classifier: + dir_cls_preds = self.conv_dir_cls(x) + return cls_score, bbox_pred, dir_cls_preds + + def forward(self, feats): + """Forward pass. + + Args: + feats (list[torch.Tensor]): Multi-level features, e.g., + features produced by FPN. + + Returns: + tuple[list[torch.Tensor]]: Multi-level class score, bbox \ + and direction predictions. + """ + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, input_metas, device='cuda'): + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + input_metas (list[dict]): contain pcd and img's meta info. + device (str): device of current module. + + Returns: + list[list[torch.Tensor]]: Anchors of each image, valid flags \ + of each image. + """ + num_imgs = len(input_metas) + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + return anchor_list + + def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, + label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, num_total_samples): + """Calculate loss of Single-level results. + + Args: + cls_score (torch.Tensor): Class score in single-level. + bbox_pred (torch.Tensor): Bbox prediction in single-level. + dir_cls_preds (torch.Tensor): Predictions of direction class + in single-level. + labels (torch.Tensor): Labels of class. + label_weights (torch.Tensor): Weights of class loss. + bbox_targets (torch.Tensor): Targets of bbox predictions. + bbox_weights (torch.Tensor): Weights of bbox loss. + dir_targets (torch.Tensor): Targets of direction predictions. + dir_weights (torch.Tensor): Weights of direction loss. + num_total_samples (int): The number of valid samples. 
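+                If ``None``, it falls back to ``int(cls_score.shape[0])``
+                (see the top of this function).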
+ + Returns: + tuple[torch.Tensor]: Losses of class, bbox \ + and direction, respectively. + """ + # classification loss + if num_total_samples is None: + num_total_samples = int(cls_score.shape[0]) + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes) + assert labels.max().item() <= self.num_classes + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + + # regression loss + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(-1, self.box_code_size) + bbox_targets = bbox_targets.reshape(-1, self.box_code_size) + bbox_weights = bbox_weights.reshape(-1, self.box_code_size) + + bg_class_ind = self.num_classes + pos_inds = ((labels >= 0) + & (labels < bg_class_ind)).nonzero( + as_tuple=False).reshape(-1) + num_pos = len(pos_inds) + + pos_bbox_pred = bbox_pred[pos_inds] + pos_bbox_targets = bbox_targets[pos_inds] + pos_bbox_weights = bbox_weights[pos_inds] + + # dir loss + if self.use_direction_classifier: + dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2) + dir_targets = dir_targets.reshape(-1) + dir_weights = dir_weights.reshape(-1) + pos_dir_cls_preds = dir_cls_preds[pos_inds] + pos_dir_targets = dir_targets[pos_inds] + pos_dir_weights = dir_weights[pos_inds] + + if num_pos > 0: + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor( + code_weight) + if self.diff_rad_by_sin: + pos_bbox_pred, pos_bbox_targets = self.add_sin_difference( + pos_bbox_pred, pos_bbox_targets) + loss_bbox = self.loss_bbox( + pos_bbox_pred, + pos_bbox_targets, + pos_bbox_weights, + avg_factor=num_total_samples) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + loss_dir = self.loss_dir( + pos_dir_cls_preds, + pos_dir_targets, + pos_dir_weights, + avg_factor=num_total_samples) + else: + loss_bbox = pos_bbox_pred.sum() + if self.use_direction_classifier: + loss_dir = pos_dir_cls_preds.sum() + + return loss_cls, loss_bbox, loss_dir + + @staticmethod + def add_sin_difference(boxes1, boxes2): + """Convert the rotation difference to difference in sine function. + + Args: + boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 + and the 7th dimension is rotation dimension. + boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and + the 7th dimension is rotation dimension. + + Returns: + tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \ + dimensions are changed. + """ + rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( + boxes2[..., 6:7]) + rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., + 6:7]) + boxes1 = torch.cat( + [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) + boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], + dim=-1) + return boxes1, boxes2 + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + """Calculate losses. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. + dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes + of each sample. + gt_labels (list[torch.Tensor]): Gt labels of each sample. 
+ input_metas (list[dict]): Contain pcd and img's meta info. + gt_bboxes_ignore (None | list[torch.Tensor]): Specify + which bounding. + + Returns: + dict[str, list[torch.Tensor]]: Classification, bbox, and \ + direction losses of each level. + + - loss_cls (list[torch.Tensor]): Classification losses. + - loss_bbox (list[torch.Tensor]): Box regression losses. + - loss_dir (list[torch.Tensor]): Direction classification \ + losses. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + device = cls_scores[0].device + anchor_list = self.get_anchors( + featmap_sizes, input_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.anchor_target_3d( + anchor_list, + gt_bboxes, + input_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + num_classes=self.num_classes, + label_channels=label_channels, + sampling=self.sampling) + + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + dir_targets_list, dir_weights_list, num_total_pos, + num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # num_total_samples = None + losses_cls, losses_bbox, losses_dir = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + dir_cls_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + dir_targets_list, + dir_weights_list, + num_total_samples=num_total_samples) + return dict( + loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) + + def get_bboxes(self, + cls_scores, + bbox_preds, + dir_cls_preds, + input_metas, + cfg=None, + rescale=False): + """Get bboxes of anchor head. + + Args: + cls_scores (list[torch.Tensor]): Multi-level class scores. + bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. + dir_cls_preds (list[torch.Tensor]): Multi-level direction + class predictions. + input_metas (list[dict]): Contain pcd and img's meta info. + cfg (None | :obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor]): Whether th rescale bbox. + + Returns: + list[tuple]: Prediction resultes of batches. + """ + assert len(cls_scores) == len(bbox_preds) + assert len(cls_scores) == len(dir_cls_preds) + num_levels = len(cls_scores) + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + device = cls_scores[0].device + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + mlvl_anchors = [ + anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors + ] + + result_list = [] + for img_id in range(len(input_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + dir_cls_pred_list = [ + dir_cls_preds[i][img_id].detach() for i in range(num_levels) + ] + + input_meta = input_metas[img_id] + proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, + dir_cls_pred_list, mlvl_anchors, + input_meta, cfg, rescale) + result_list.append(proposals) + return result_list + + def get_bboxes_single(self, + cls_scores, + bbox_preds, + dir_cls_preds, + mlvl_anchors, + input_meta, + cfg=None, + rescale=False): + """Get bboxes of single branch. + + Args: + cls_scores (torch.Tensor): Class score in single batch. + bbox_preds (torch.Tensor): Bbox prediction in single batch. 
+ dir_cls_preds (torch.Tensor): Predictions of direction class + in single batch. + mlvl_anchors (List[torch.Tensor]): Multi-level anchors + in single batch. + input_meta (list[dict]): Contain pcd and img's meta info. + cfg (None | :obj:`ConfigDict`): Training or testing config. + rescale (list[torch.Tensor]): whether th rescale bbox. + + Returns: + tuple: Contain predictions of single batch. + + - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. + - scores (torch.Tensor): Class score of each bbox. + - labels (torch.Tensor): Label of each bbox. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_dir_scores = [] + for cls_score, bbox_pred, dir_cls_pred, anchors in zip( + cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] + dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) + dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] + + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.num_classes) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, + 0).reshape(-1, self.box_code_size) + + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + dir_cls_score = dir_cls_score[topk_inds] + + bboxes = self.bbox_coder.decode(anchors, bbox_pred) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_dir_scores.append(dir_cls_score) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( + mlvl_bboxes, box_dim=self.box_code_size).bev) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_dir_scores = torch.cat(mlvl_dir_scores) + + if self.use_sigmoid_cls: + # Add a dummy background class to the front when using sigmoid + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + + score_thr = cfg.get('score_thr', 0) + results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, + mlvl_scores, score_thr, cfg.max_num, + cfg, mlvl_dir_scores) + bboxes, scores, labels, dir_scores = results + if bboxes.shape[0] > 0: + dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, + self.dir_limit_offset, np.pi) + bboxes[..., 6] = ( + dir_rot + self.dir_offset + + np.pi * dir_scores.to(bboxes.dtype)) + bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) + return bboxes, scores, labels \ No newline at end of file diff --git a/mmcv/models/dense_heads/anchor_free_head.py b/mmcv/models/dense_heads/anchor_free_head.py new file mode 100644 index 0000000..7df38d6 --- /dev/null +++ b/mmcv/models/dense_heads/anchor_free_head.py @@ -0,0 +1,340 @@ +from abc import abstractmethod + +import torch +import torch.nn as nn +from mmcv.models.bricks import ConvModule +from mmcv.utils import force_fp32 + +from ...core.utils import multi_apply +from ..builder import HEADS, build_loss +from .base_dense_head import BaseDenseHead +from .dense_test_mixins import BBoxTestMixin + + +@HEADS.register_module() +class AnchorFreeHead(BaseDenseHead, BBoxTestMixin): + """Anchor-free head (FCOS, Fovea, 
RepPoints, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + stacked_convs (int): Number of stacking convs of the head. + strides (tuple): Downsample factor of each feature map. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Default: False. + conv_bias (bool | str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Default: "auto". + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ # noqa: W605 + + _version = 1 + + def __init__(self, + num_classes, + in_channels, + feat_channels=256, + stacked_convs=4, + strides=(4, 8, 16, 32, 64), + dcn_on_last_conv=False, + conv_bias='auto', + loss_cls=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_bbox=dict(type='IoULoss', loss_weight=1.0), + conv_cfg=None, + norm_cfg=None, + train_cfg=None, + test_cfg=None, + init_cfg=dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_cls', + std=0.01, + bias_prob=0.01))): + super(AnchorFreeHead, self).__init__(init_cfg) + self.num_classes = num_classes + self.cls_out_channels = num_classes + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == 'auto' or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self): + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_reg_convs(self): + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type='DCNv2') + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias)) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d( + 
self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Hack some keys of the model state dict so that can load checkpoints + of previous version.""" + version = local_metadata.get('version', None) + if version is None: + # the key is different in early versions + # for example, 'fcos_cls' become 'conv_cls' now + bbox_head_keys = [ + k for k in state_dict.keys() if k.startswith(prefix) + ] + ori_predictor_keys = [] + new_predictor_keys = [] + # e.g. 'fcos_cls' or 'fcos_reg' + for key in bbox_head_keys: + ori_predictor_keys.append(key) + key = key.split('.') + conv_name = None + if key[1].endswith('cls'): + conv_name = 'conv_cls' + elif key[1].endswith('reg'): + conv_name = 'conv_reg' + elif key[1].endswith('centerness'): + conv_name = 'conv_centerness' + else: + assert NotImplementedError + if conv_name is not None: + key[1] = conv_name + new_predictor_keys.append('.'.join(key)) + else: + ori_predictor_keys.pop(-1) + for i in range(len(new_predictor_keys)): + state_dict[new_predictor_keys[i]] = state_dict.pop( + ori_predictor_keys[i]) + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually contain classification scores and bbox predictions. + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + """ + return multi_apply(self.forward_single, feats)[:2] + + def forward_single(self, x): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: Scores for each class, bbox predictions, features + after classification and regression conv layers, some + models needs these features like FCOS. + """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + return cls_score, bbox_pred, cls_feat, reg_feat + + @abstractmethod + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. 
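+
+        Returns:
+            dict[str, Tensor]: A dictionary of loss components, as produced
+                by concrete subclasses (this base implementation raises
+                ``NotImplementedError``).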
+ """ + + raise NotImplementedError + + @abstractmethod + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + img_metas, + cfg=None, + rescale=None): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_points * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space + """ + + raise NotImplementedError + + @abstractmethod + def get_targets(self, points, gt_bboxes_list, gt_labels_list): + """Compute regression, classification and centerness targets for points + in multiple images. + + Args: + points (list[Tensor]): Points of each fpn level, each has shape + (num_points, 2). + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, + each has shape (num_gt, 4). + gt_labels_list (list[Tensor]): Ground truth labels of each box, + each has shape (num_gt,). + """ + raise NotImplementedError + + def _get_points_single(self, + featmap_size, + stride, + dtype, + device, + flatten=False): + """Get points of a single scale level.""" + h, w = featmap_size + # First create Range with the default dtype, than convert to + # target `dtype` for onnx exporting. + x_range = torch.arange(w, device=device).to(dtype) + y_range = torch.arange(h, device=device).to(dtype) + y, x = torch.meshgrid(y_range, x_range) + if flatten: + y = y.flatten() + x = x.flatten() + return y, x + + def get_points(self, featmap_sizes, dtype, device, flatten=False): + """Get points according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + dtype (torch.dtype): Type of points. + device (torch.device): Device of points. + + Returns: + tuple: points of each image. + """ + mlvl_points = [] + for i in range(len(featmap_sizes)): + mlvl_points.append( + self._get_points_single(featmap_sizes[i], self.strides[i], + dtype, device, flatten)) + return mlvl_points + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. 
+ + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/mmcv/models/dense_heads/anchor_head.py b/mmcv/models/dense_heads/anchor_head.py new file mode 100644 index 0000000..fd06528 --- /dev/null +++ b/mmcv/models/dense_heads/anchor_head.py @@ -0,0 +1,746 @@ +import torch +import torch.nn as nn +from mmcv.utils import force_fp32 + +from mmcv.core.anchor import (anchor_inside_flags, build_anchor_generator, images_to_levels) +from mmcv.core.bbox.builder import (build_assigner, build_bbox_coder, build_sampler) +from mmcv.core.utils import (multi_apply, unmap) +from mmcv.core.post_processing.bbox_nms import (multiclass_nms) +from ..builder import HEADS, build_loss +from .base_dense_head import BaseDenseHead +from .dense_test_mixins import BBoxTestMixin + + +@HEADS.register_module() +class AnchorHead(BaseDenseHead, BBoxTestMixin): + """Anchor-based head (RPN, RetinaNet, SSD, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + anchor_generator (dict): Config dict for anchor generator + bbox_coder (dict): Config of bounding box coder. + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
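+
+        Example:
+            An illustrative sketch (an editorial addition, not upstream
+            documentation): the channel sizes below are arbitrary and the
+            snippet assumes the default anchor generator, box coder and
+            losses of this head are registered in this repo.
+
+            >>> import torch
+            >>> self = AnchorHead(num_classes=3, in_channels=4)
+            >>> # 3 scales x 3 ratios = 9 anchors per location; with sigmoid
+            >>> # classification the cls branch outputs 9 * 3 = 27 channels.
+            >>> feats = [torch.rand(1, 4, 2 ** (5 - i), 2 ** (5 - i))
+            >>>          for i in range(5)]
+            >>> cls_scores, bbox_preds = self.forward(feats)
+            >>> assert cls_scores[0].shape == (1, 27, 32, 32)
+            >>> assert bbox_preds[0].shape == (1, 36, 32, 32)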
+ """ # noqa: W605 + + def __init__(self, + num_classes, + in_channels, + feat_channels=256, + anchor_generator=dict( + type='AnchorGenerator', + scales=[8, 16, 32], + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + clip_border=True, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0)), + reg_decoded_bbox=False, + loss_cls=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=1.0), + loss_bbox=dict( + type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), + train_cfg=None, + test_cfg=None, + init_cfg=dict(type='Normal', layers='Conv2d', std=0.01)): + super(AnchorHead, self).__init__(init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + # TODO better way to determine whether sample or not + self.sampling = loss_cls['type'] not in [ + 'FocalLoss', 'GHMC', 'QualityFocalLoss' + ] + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + + if self.cls_out_channels <= 0: + raise ValueError(f'num_classes={num_classes} is too small') + self.reg_decoded_bbox = reg_decoded_bbox + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.fp16_enabled = False + + self.anchor_generator = build_anchor_generator(anchor_generator) + # usually the numbers of anchors for each level are the same + # except SSD detectors + self.num_anchors = self.anchor_generator.num_base_anchors[0] + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self.conv_cls = nn.Conv2d(self.in_channels, + self.num_anchors * self.cls_out_channels, 1) + self.conv_reg = nn.Conv2d(self.in_channels, self.num_anchors * 4, 1) + + def forward_single(self, x): + """Forward feature of a single scale level. + + Args: + x (Tensor): Features of a single scale level. + + Returns: + tuple: + cls_score (Tensor): Cls scores for a single scale level \ + the channels number is num_anchors * num_classes. + bbox_pred (Tensor): Box energies / deltas for a single scale \ + level, the channels number is num_anchors * 4. + """ + cls_score = self.conv_cls(x) + bbox_pred = self.conv_reg(x) + return cls_score, bbox_pred + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: A tuple of classification scores and bbox prediction. + + - cls_scores (list[Tensor]): Classification scores for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_anchors * num_classes. + - bbox_preds (list[Tensor]): Box energies / deltas for all \ + scale levels, each is a 4D-tensor, the channels number \ + is num_anchors * 4. + """ + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, img_metas, device='cuda'): + """Get anchors according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. 
+ img_metas (list[dict]): Image meta info. + device (torch.device | str): Device for returned tensors + + Returns: + tuple: + anchor_list (list[Tensor]): Anchors of each image. + valid_flag_list (list[Tensor]): Valid flags of each image. + """ + num_imgs = len(img_metas) + + # since feature map sizes of all images are the same, we only compute + # anchors for one time + multi_level_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device) + anchor_list = [multi_level_anchors for _ in range(num_imgs)] + + # for each image, we compute valid flags of multi level anchors + valid_flag_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_flags = self.anchor_generator.valid_flags( + featmap_sizes, img_meta['pad_shape'], device) + valid_flag_list.append(multi_level_flags) + + return anchor_list, valid_flag_list + + def _get_targets_single(self, + flat_anchors, + valid_flags, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + img_meta, + label_channels=1, + unmap_outputs=True): + """Compute regression and classification targets for anchors in a + single image. + + Args: + flat_anchors (Tensor): Multi-level anchors of the image, which are + concatenated into a single tensor of shape (num_anchors ,4) + valid_flags (Tensor): Multi level valid flags of the image, + which are concatenated into a single tensor of + shape (num_anchors,). + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + img_meta (dict): Meta info of the image. + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + label_channels (int): Channel of label. + unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. 
+ + Returns: + tuple: + labels_list (list[Tensor]): Labels of each level + label_weights_list (list[Tensor]): Label weights of each level + bbox_targets_list (list[Tensor]): BBox targets of each level + bbox_weights_list (list[Tensor]): BBox weights of each level + num_total_pos (int): Number of positive samples in all images + num_total_neg (int): Number of negative samples in all images + """ + inside_flags = anchor_inside_flags(flat_anchors, valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + if not inside_flags.any(): + return (None, ) * 7 + # assign gt and sample anchors + anchors = flat_anchors[inside_flags, :] + + assign_result = self.assigner.assign( + anchors, gt_bboxes, gt_bboxes_ignore, + None if self.sampling else gt_labels) + sampling_result = self.sampler.sample(assign_result, anchors, + gt_bboxes) + + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + labels = anchors.new_full((num_valid_anchors, ), + self.num_classes, + dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + if not self.reg_decoded_bbox: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + else: + pos_bbox_targets = sampling_result.pos_gt_bboxes + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + if gt_labels is None: + # Only rpn gives gt_labels as None + # Foreground is the first class since v2.5.0 + labels[pos_inds] = 0 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_anchors.size(0) + labels = unmap( + labels, num_total_anchors, inside_flags, + fill=self.num_classes) # fill bg label + label_weights = unmap(label_weights, num_total_anchors, + inside_flags) + bbox_targets = unmap(bbox_targets, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds, sampling_result) + + def get_targets(self, + anchor_list, + valid_flag_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + unmap_outputs=True, + return_sampling_results=False): + """Compute regression and classification targets for anchors in + multiple images. + + Args: + anchor_list (list[list[Tensor]]): Multi level anchors of each + image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, 4). + valid_flag_list (list[list[Tensor]]): Multi level valid flags of + each image. The outer list indicates images, and the inner list + corresponds to feature levels of the image. Each element of + the inner list is a tensor of shape (num_anchors, ) + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): Ground truth bboxes to be + ignored. + gt_labels_list (list[Tensor]): Ground truth labels of each box. + label_channels (int): Channel of label. 
+ unmap_outputs (bool): Whether to map outputs back to the original + set of anchors. + + Returns: + tuple: Usually returns a tuple containing learning targets. + + - labels_list (list[Tensor]): Labels of each level. + - label_weights_list (list[Tensor]): Label weights of each \ + level. + - bbox_targets_list (list[Tensor]): BBox targets of each level. + - bbox_weights_list (list[Tensor]): BBox weights of each level. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + additional_returns: This function enables user-defined returns from + `self._get_targets_single`. These returns are currently refined + to properties at each feature map (i.e. having HxW dimension). + The results will be concatenated after the end + """ + num_imgs = len(img_metas) + assert len(anchor_list) == len(valid_flag_list) == num_imgs + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors to a single tensor + concat_anchor_list = [] + concat_valid_flag_list = [] + for i in range(num_imgs): + assert len(anchor_list[i]) == len(valid_flag_list[i]) + concat_anchor_list.append(torch.cat(anchor_list[i])) + concat_valid_flag_list.append(torch.cat(valid_flag_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + results = multi_apply( + self._get_targets_single, + concat_anchor_list, + concat_valid_flag_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + img_metas, + label_channels=label_channels, + unmap_outputs=unmap_outputs) + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + pos_inds_list, neg_inds_list, sampling_results_list) = results[:7] + rest_results = list(results[7:]) # user-added return values + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + res = (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + if return_sampling_results: + res = res + (sampling_results_list, ) + for i, r in enumerate(rest_results): # user-added return values + rest_results[i] = images_to_levels(r, num_level_anchors) + + return res + tuple(rest_results) + + def loss_single(self, cls_score, bbox_pred, anchors, labels, label_weights, + bbox_targets, bbox_weights, num_total_samples): + """Compute loss of a single scale level. + + Args: + cls_score (Tensor): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W). + bbox_pred (Tensor): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W). + anchors (Tensor): Box reference for each scale level with shape + (N, num_total_anchors, 4). + labels (Tensor): Labels of each anchors with shape + (N, num_total_anchors). 
+ label_weights (Tensor): Label weights of each anchor with shape + (N, num_total_anchors) + bbox_targets (Tensor): BBox regression targets of each anchor wight + shape (N, num_total_anchors, 4). + bbox_weights (Tensor): BBox regression loss weights of each anchor + with shape (N, num_total_anchors, 4). + num_total_samples (int): If sampling, num total samples equal to + the number of total anchors; Otherwise, it is the number of + positive anchors. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # classification loss + labels = labels.reshape(-1) + label_weights = label_weights.reshape(-1) + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(-1, self.cls_out_channels) + loss_cls = self.loss_cls( + cls_score, labels, label_weights, avg_factor=num_total_samples) + # regression loss + bbox_targets = bbox_targets.reshape(-1, 4) + bbox_weights = bbox_weights.reshape(-1, 4) + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, 4) + if self.reg_decoded_bbox: + # When the regression loss (e.g. `IouLoss`, `GIouLoss`) + # is applied directly on the decoded bounding boxes, it + # decodes the already encoded coordinates to absolute format. + anchors = anchors.reshape(-1, 4) + bbox_pred = self.bbox_coder.decode(anchors, bbox_pred) + loss_bbox = self.loss_bbox( + bbox_pred, + bbox_targets, + bbox_weights, + avg_factor=num_total_samples) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. Default: None + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
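+
+        Example:
+            A hedged end-to-end sketch (an editorial illustration, not
+            upstream documentation); it assumes ``MaxIoUAssigner``,
+            ``PseudoSampler`` and the default losses are registered in this
+            repo as they are in mmdet.
+
+            >>> import mmcv
+            >>> import torch
+            >>> train_cfg = mmcv.Config(dict(
+            >>>     assigner=dict(type='MaxIoUAssigner', pos_iou_thr=0.7,
+            >>>                   neg_iou_thr=0.3, min_pos_iou=0.3,
+            >>>                   ignore_iof_thr=-1),
+            >>>     allowed_border=-1, pos_weight=-1, debug=False))
+            >>> self = AnchorHead(num_classes=4, in_channels=1,
+            >>>                   train_cfg=train_cfg)
+            >>> s = 256
+            >>> feats = [torch.rand(1, 1, s // st, s // st)
+            >>>          for st in [4, 8, 16, 32, 64]]
+            >>> cls_scores, bbox_preds = self.forward(feats)
+            >>> img_metas = [{'img_shape': (s, s, 3), 'pad_shape': (s, s, 3),
+            >>>               'scale_factor': 1}]
+            >>> gt_bboxes = [torch.Tensor([[23.6, 23.8, 238.6, 151.8]])]
+            >>> gt_labels = [torch.LongTensor([2])]
+            >>> losses = self.loss(cls_scores, bbox_preds, gt_bboxes,
+            >>>                    gt_labels, img_metas)
+            >>> assert set(losses) == {'loss_cls', 'loss_bbox'}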
+ """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + device = cls_scores[0].device + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_metas, device=device) + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + anchor_list, + valid_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # anchor number of multi levels + num_level_anchors = [anchors.size(0) for anchors in anchor_list[0]] + # concat all level anchors and flags to a single tensor + concat_anchor_list = [] + for i in range(len(anchor_list)): + concat_anchor_list.append(torch.cat(anchor_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples) + return dict(loss_cls=losses_cls, loss_bbox=losses_bbox) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + img_metas, + cfg=None, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each level in the + feature pyramid, has shape + (N, num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for each + level in the feature pyramid, has shape + (N, num_anchors * 4, H, W). + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where 5 represent + (tl_x, tl_y, br_x, br_y, score) and the score between 0 and 1. + The shape of the second tensor in the tuple is (n,), and + each element represents the class label of the corresponding + box. 
+ + Example: + >>> import mmcv + >>> self = AnchorHead( + >>> num_classes=9, + >>> in_channels=1, + >>> anchor_generator=dict( + >>> type='AnchorGenerator', + >>> scales=[8], + >>> ratios=[0.5, 1.0, 2.0], + >>> strides=[4,])) + >>> img_metas = [{'img_shape': (32, 32, 3), 'scale_factor': 1}] + >>> cfg = mmcv.Config(dict( + >>> score_thr=0.00, + >>> nms=dict(type='nms', iou_thr=1.0), + >>> max_per_img=10)) + >>> feat = torch.rand(1, 1, 3, 3) + >>> cls_score, bbox_pred = self.forward_single(feat) + >>> # note the input lists are over different levels, not images + >>> cls_scores, bbox_preds = [cls_score], [bbox_pred] + >>> result_list = self.get_bboxes(cls_scores, bbox_preds, + >>> img_metas, cfg) + >>> det_bboxes, det_labels = result_list[0] + >>> assert len(result_list) == 1 + >>> assert det_bboxes.shape[1] == 5 + >>> assert len(det_bboxes) == len(det_labels) == cfg.max_per_img + """ + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + mlvl_cls_scores = [cls_scores[i].detach() for i in range(num_levels)] + mlvl_bbox_preds = [bbox_preds[i].detach() for i in range(num_levels)] + + if torch.onnx.is_in_onnx_export(): + assert len( + img_metas + ) == 1, 'Only support one input image while in exporting to ONNX' + img_shapes = img_metas[0]['img_shape_for_onnx'] + else: + img_shapes = [ + img_metas[i]['img_shape'] + for i in range(cls_scores[0].shape[0]) + ] + scale_factors = [ + img_metas[i]['scale_factor'] for i in range(cls_scores[0].shape[0]) + ] + + if with_nms: + # some heads don't support with_nms argument + result_list = self._get_bboxes(mlvl_cls_scores, mlvl_bbox_preds, + mlvl_anchors, img_shapes, + scale_factors, cfg, rescale) + else: + result_list = self._get_bboxes(mlvl_cls_scores, mlvl_bbox_preds, + mlvl_anchors, img_shapes, + scale_factors, cfg, rescale, + with_nms) + return result_list + + def _get_bboxes(self, + mlvl_cls_scores, + mlvl_bbox_preds, + mlvl_anchors, + img_shapes, + scale_factors, + cfg, + rescale=False, + with_nms=True): + """Transform outputs for a batch item into bbox predictions. + + Args: + mlvl_cls_scores (list[Tensor]): Each element in the list is + the scores of bboxes of single level in the feature pyramid, + has shape (N, num_anchors * num_classes, H, W). + mlvl_bbox_preds (list[Tensor]): Each element in the list is the + bboxes predictions of single level in the feature pyramid, + has shape (N, num_anchors * 4, H, W). + mlvl_anchors (list[Tensor]): Each element in the list is + the anchors of single level in feature pyramid, has shape + (num_anchors, 4). + img_shapes (list[tuple[int]]): Each tuple in the list represent + the shape(height, width, 3) of single image in the batch. + scale_factors (list[ndarray]): Scale factor of the batch + image arange as list[(w_scale, h_scale, w_scale, h_scale)]. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where 5 represent + (tl_x, tl_y, br_x, br_y, score) and the score between 0 and 1. 
+ The shape of the second tensor in the tuple is (n,), and + each element represents the class label of the corresponding + box. + """ + cfg = self.test_cfg if cfg is None else cfg + assert len(mlvl_cls_scores) == len(mlvl_bbox_preds) == len( + mlvl_anchors) + batch_size = mlvl_cls_scores[0].shape[0] + # convert to tensor to keep tracing + nms_pre_tensor = torch.tensor( + cfg.get('nms_pre', -1), + device=mlvl_cls_scores[0].device, + dtype=torch.long) + + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors in zip(mlvl_cls_scores, + mlvl_bbox_preds, + mlvl_anchors): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + cls_score = cls_score.permute(0, 2, 3, + 1).reshape(batch_size, -1, + self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(0, 2, 3, + 1).reshape(batch_size, -1, 4) + anchors = anchors.expand_as(bbox_pred) + # Always keep topk op for dynamic input in onnx + from mmcv.core.export import get_k_for_topk + nms_pre = get_k_for_topk(nms_pre_tensor, bbox_pred.shape[1]) + if nms_pre > 0: + # Get maximum scores for foreground classes. + if self.use_sigmoid_cls: + max_scores, _ = scores.max(-1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmcv v2.0 + # BG cat_id: num_class + max_scores, _ = scores[..., :-1].max(-1) + + _, topk_inds = max_scores.topk(nms_pre) + batch_inds = torch.arange(batch_size).view( + -1, 1).expand_as(topk_inds) + anchors = anchors[batch_inds, topk_inds, :] + bbox_pred = bbox_pred[batch_inds, topk_inds, :] + scores = scores[batch_inds, topk_inds, :] + + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shapes) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + + batch_mlvl_bboxes = torch.cat(mlvl_bboxes, dim=1) + if rescale: + batch_mlvl_bboxes /= batch_mlvl_bboxes.new_tensor( + scale_factors).unsqueeze(1) + batch_mlvl_scores = torch.cat(mlvl_scores, dim=1) + + # Replace multiclass_nms with ONNX::NonMaxSuppression in deployment + if torch.onnx.is_in_onnx_export() and with_nms: + from mmcv.core.export import add_dummy_nms_for_onnx + # ignore background class + if not self.use_sigmoid_cls: + num_classes = batch_mlvl_scores.shape[2] - 1 + batch_mlvl_scores = batch_mlvl_scores[..., :num_classes] + max_output_boxes_per_class = cfg.nms.get( + 'max_output_boxes_per_class', 200) + iou_threshold = cfg.nms.get('iou_threshold', 0.5) + score_threshold = cfg.score_thr + nms_pre = cfg.get('deploy_nms_pre', -1) + return add_dummy_nms_for_onnx(batch_mlvl_bboxes, batch_mlvl_scores, + max_output_boxes_per_class, + iou_threshold, score_threshold, + nms_pre, cfg.max_per_img) + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmcv v2.0 + # BG cat_id: num_class + padding = batch_mlvl_scores.new_zeros(batch_size, + batch_mlvl_scores.shape[1], + 1) + batch_mlvl_scores = torch.cat([batch_mlvl_scores, padding], dim=-1) + + if with_nms: + det_results = [] + for (mlvl_bboxes, mlvl_scores) in zip(batch_mlvl_bboxes, + batch_mlvl_scores): + det_bbox, det_label = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + det_results.append(tuple([det_bbox, det_label])) + else: + det_results = [ + tuple(mlvl_bs) + for mlvl_bs in zip(batch_mlvl_bboxes, batch_mlvl_scores) + ] + return det_results + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time 
augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), where + 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,), The length of list should always be 1. + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/mmcv/models/dense_heads/base_dense_head.py b/mmcv/models/dense_heads/base_dense_head.py new file mode 100644 index 0000000..e2a422d --- /dev/null +++ b/mmcv/models/dense_heads/base_dense_head.py @@ -0,0 +1,78 @@ +from abc import ABCMeta, abstractmethod + +from mmcv.models.backbones import BaseModule + + +class BaseDenseHead(BaseModule, metaclass=ABCMeta): + """Base class for DenseHeads.""" + + def __init__(self, init_cfg=None): + super(BaseDenseHead, self).__init__(init_cfg) + + @abstractmethod + def loss(self, **kwargs): + """Compute losses of the head.""" + pass + + @abstractmethod + def get_bboxes(self, **kwargs): + """Transform network output for a batch into bbox predictions.""" + pass + + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) + return losses, proposal_list + + def simple_test(self, feats, img_metas, rescale=False): + """Test function without test-time augmentation. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). 
+ The shape of the second tensor in the tuple is ``labels`` + with shape (n,) + """ + return self.simple_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/mmcv/models/dense_heads/bev_head.py b/mmcv/models/dense_heads/bev_head.py new file mode 100644 index 0000000..2227a0c --- /dev/null +++ b/mmcv/models/dense_heads/bev_head.py @@ -0,0 +1,130 @@ +import copy +from re import I +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models import Linear, bias_init_with_prob +from mmcv.utils import TORCH_VERSION, digit_version + +from mmcv.core import (multi_apply, multi_apply, reduce_mean) +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.models import HEADS, BaseModule +from mmcv.models.dense_heads import DETRHead +from mmcv.core.bbox.coder import build_bbox_coder +from traitlets import import_item +from mmcv.core.bbox.util import normalize_bbox +from mmcv.models.bricks import build_positional_encoding +from mmcv.utils import force_fp32 +import numpy as np +import cv2 as cv +from mmcv.models.modules.transformerV2 import PerceptionTransformerBEVEncoder +from mmcv.models.utils import build_transformer +from mmcv.models.builder import build_head +from mmcv.models.dense_heads.free_anchor3d_head import FreeAnchor3DHead + +@HEADS.register_module() +class BEVHead(BaseModule): + def __init__(self, + bev_h, + bev_w, + pc_range, + embed_dims, + transformer, + positional_encoding: dict, + pts_bbox_head_3d: dict, + init_cfg=None, + **kwargs, + ): + super(BEVHead, self).__init__(init_cfg=init_cfg) + self.bev_h = bev_h + self.bev_w = bev_w + self.embed_dims = embed_dims + self.pc_range = pc_range + self.fp16_enabled = False + self.transformer :PerceptionTransformerBEVEncoder = build_transformer(transformer) + self.positional_encoding = build_positional_encoding(positional_encoding) + + pts_bbox_head_3d.update(kwargs) + self.pts_bbox_head_3d = build_head(pts_bbox_head_3d) + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + + self._init_layers() + def init_weights(self): + """Initialize weights of the Multi View BEV Encoder""" + self.transformer.init_weights() + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + + self.bev_embedding = nn.Embedding(self.bev_h * self.bev_w, self.embed_dims) + + @force_fp32(apply_to=('mlvl_feats', 'pred_bev')) + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + bev_embed = self.transformer( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + + if only_bev: + return bev_embed + + bev_feature = bev_embed.permute(0, 2, 1).reshape(bs, self.embed_dims, self.bev_h, self.bev_w) + ret = {} + ret['pred'] = self.pts_bbox_head_3d([bev_feature,]) + if not self.training: + ret['bev_embed'] = bev_embed + return ret + + + @force_fp32(apply_to=('ret')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + ret, + gt_bboxes_ignore=None, + img_metas=None): + assert gt_bboxes_ignore is None + return self.pts_bbox_head_3d.loss(gt_bboxes_list, gt_labels_list, ret['pred'], 
gt_bboxes_ignore=gt_bboxes_ignore, img_metas=img_metas) + + @force_fp32(apply_to=('ret')) + def get_bboxes(self, ret, img_metas, rescale=False): + return self.pts_bbox_head_3d.get_bboxes(ret['pred'], img_metas) + +@HEADS.register_module() +class FreeAnchor3DHeadV2(FreeAnchor3DHead): + @force_fp32(apply_to=('pred')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + pred, + gt_bboxes_ignore=None, + img_metas=None): + cls_scores, bbox_preds, dir_cls_preds = pred + + return super().loss(cls_scores, bbox_preds, dir_cls_preds, gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore) + @force_fp32(apply_to=('pred')) + def get_bboxes(self, pred, img_metas, rescale=False): + cls_scores, bbox_preds, dir_cls_preds = pred + return super().get_bboxes( + cls_scores, + bbox_preds, + dir_cls_preds, + img_metas, + cfg=None, + rescale=rescale) \ No newline at end of file diff --git a/mmcv/models/dense_heads/bevformer_head.py b/mmcv/models/dense_heads/bevformer_head.py new file mode 100644 index 0000000..c1852db --- /dev/null +++ b/mmcv/models/dense_heads/bevformer_head.py @@ -0,0 +1,686 @@ +import copy +import torch +import torch.nn as nn + +from mmcv.models.bricks import Linear +from mmcv.models.utils.weight_init import bias_init_with_prob + +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.core.utils.misc import multi_apply +from mmcv.core.utils.dist_utils import reduce_mean +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.models import HEADS +from mmcv.models.dense_heads import DETRHead +from mmcv.core.bbox.coder import build_bbox_coder +from mmcv.core.bbox.util import normalize_bbox +from mmcv.utils import force_fp32, auto_fp16 + + +@HEADS.register_module() +class BEVFormerHead(DETRHead): + """Head of Detr3D. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. 
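+        code_weights (list[float], optional): Per-dimension weights applied
+            to the regression targets; when left as ``None`` the code falls
+            back to ``[1.0] * 8 + [0.2, 0.2]``, i.e. the two velocity terms
+            of the 10-dim box code are down-weighted.
+        code_size (int, optional): Length of the regressed box code, taken
+            from ``kwargs`` and defaulting to 10.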
+ """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + super(BEVFormerHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. + num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + @auto_fp16(apply_to=('mlvl_feats')) + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + prev_bev: previous bev featues + only_bev: only compute BEV features with encoder. + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. 
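+
+        Example:
+            A standalone sketch (added for illustration, not upstream code)
+            of the de-normalization applied to the regressed centers below;
+            the ``pc_range`` values are an assumed nuScenes-style range.
+
+            >>> import torch
+            >>> pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+            >>> cx = torch.tensor([0.25, 0.5, 0.75])  # sigmoid outputs
+            >>> cx_metric = cx * (pc_range[3] - pc_range[0]) + pc_range[0]
+            >>> torch.allclose(cx_metric, torch.tensor([-25.6, 0.0, 25.6]))
+            True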
+ """ + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: # only use encoder to obtain BEV features, TODO: refine the workaround + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + } + + return outs + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. 
+ - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. 
+ bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. + """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, + :10], bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. 
+ enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], + losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. 
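+                Each item is ``[bboxes, scores, labels]`` for one sample,
+                where ``bboxes`` is wrapped into ``img_metas[i]['box_type_3d']``
+                and its z coordinate is shifted from the gravity center to
+                the bottom center of the box.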
+ """ + + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + + ret_list.append([bboxes, scores, labels]) + + return ret_list + + +@HEADS.register_module() +class BEVFormerHead_GroupDETR(BEVFormerHead): + def __init__(self, + *args, + group_detr=1, + **kwargs): + self.group_detr = group_detr + assert 'num_query' in kwargs + kwargs['num_query'] = group_detr * kwargs['num_query'] + super().__init__(*args, **kwargs) + + def forward(self, mlvl_feats, img_metas, prev_bev=None, only_bev=False): + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + object_query_embeds = self.query_embedding.weight.to(dtype) + if not self.training: # NOTE: Only difference to bevformer head + object_query_embeds = object_query_embeds[:self.num_query // self.group_detr] + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + + if only_bev: + return self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + img_metas=img_metas, + prev_bev=prev_bev, + ) + else: + outputs = self.transformer( + mlvl_feats, + bev_queries, + object_query_embeds, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + prev_bev=prev_bev + ) + + bev_embed, hs, init_reference, inter_references = outputs + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': bev_embed, + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + } + + return outs + + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. 
+ gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' + + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + assert enc_cls_scores is None and enc_bbox_preds is None + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + loss_dict = dict() + loss_dict['loss_cls'] = 0 + loss_dict['loss_bbox'] = 0 + for num_dec_layer in range(all_cls_scores.shape[0] - 1): + loss_dict[f'd{num_dec_layer}.loss_cls'] = 0 + loss_dict[f'd{num_dec_layer}.loss_bbox'] = 0 + num_query_per_group = self.num_query // self.group_detr + for group_index in range(self.group_detr): + group_query_start = group_index * num_query_per_group + group_query_end = (group_index+1) * num_query_per_group + group_cls_scores = all_cls_scores[:, :,group_query_start:group_query_end, :] + group_bbox_preds = all_bbox_preds[:, :,group_query_start:group_query_end, :] + losses_cls, losses_bbox = multi_apply( + self.loss_single, group_cls_scores, group_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + loss_dict['loss_cls'] += losses_cls[-1] / self.group_detr + loss_dict['loss_bbox'] += losses_bbox[-1] / self.group_detr + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] += loss_cls_i / self.group_detr + loss_dict[f'd{num_dec_layer}.loss_bbox'] += loss_bbox_i / self.group_detr + num_dec_layer += 1 + return loss_dict \ No newline at end of file diff --git a/mmcv/models/dense_heads/dense_test_mixins.py b/mmcv/models/dense_heads/dense_test_mixins.py new file mode 100644 index 0000000..a548e40 --- /dev/null +++ b/mmcv/models/dense_heads/dense_test_mixins.py @@ -0,0 +1,202 @@ +import sys +from inspect import signature + +import torch + +from mmcv.core.post_processing.merge_augs import merge_aug_proposals +from mmcv.core.post_processing.bbox_nms import multiclass_nms +# from ...core.post_processing import merge_aug_proposals, multiclass_nms + 
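+# ``completed`` is an async helper that is only imported on Python >= 3.7;
+# it is used by ``async_simple_test_rpn`` below.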
+if sys.version_info >= (3, 7): + from mmcv.utils.contextmanagers import completed + + +class BBoxTestMixin(object): + """Mixin class for testing det bboxes via DenseHead.""" + + def simple_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes without test-time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,) + """ + outs = self.forward(feats) + results_list = self.get_bboxes(*outs, img_metas, rescale=rescale) + return results_list + + def aug_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes with test time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,). The length of list should always be 1. 
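+
+        Note:
+            The head must expose a ``with_nms`` argument in ``get_bboxes`` and
+            ``_get_bboxes_single`` (or ``_get_bboxes``); this is asserted
+            below before the augmented predictions are merged and passed
+            through NMS.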
+ """ + # check with_nms argument + gb_sig = signature(self.get_bboxes) + gb_args = [p.name for p in gb_sig.parameters.values()] + if hasattr(self, '_get_bboxes'): + gbs_sig = signature(self._get_bboxes) + else: + gbs_sig = signature(self._get_bboxes_single) + gbs_args = [p.name for p in gbs_sig.parameters.values()] + assert ('with_nms' in gb_args) and ('with_nms' in gbs_args), \ + f'{self.__class__.__name__}' \ + ' does not support test-time augmentation' + + aug_bboxes = [] + aug_scores = [] + aug_factors = [] # score_factors for NMS + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.forward(x) + bbox_inputs = outs + (img_meta, self.test_cfg, False, False) + bbox_outputs = self.get_bboxes(*bbox_inputs)[0] + aug_bboxes.append(bbox_outputs[0]) + aug_scores.append(bbox_outputs[1]) + # bbox_outputs of some detectors (e.g., ATSS, FCOS, YOLOv3) + # contains additional element to adjust scores before NMS + if len(bbox_outputs) >= 3: + aug_factors.append(bbox_outputs[2]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas) + merged_factors = torch.cat(aug_factors, dim=0) if aug_factors else None + det_bboxes, det_labels = multiclass_nms( + merged_bboxes, + merged_scores, + self.test_cfg.score_thr, + self.test_cfg.nms, + self.test_cfg.max_per_img, + score_factors=merged_factors) + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor( + img_metas[0][0]['scale_factor']) + + return [ + (_det_bboxes, det_labels), + ] + + def simple_test_rpn(self, x, img_metas): + """Test without augmentation, only for ``RPNHead`` and its variants, + e.g., ``GARPNHead``, etc. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image, each item has shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + """ + rpn_outs = self(x) + proposal_list = self.get_bboxes(*rpn_outs, img_metas) + return proposal_list + + def aug_test_rpn(self, feats, img_metas): + """Test with augmentation for only for ``RPNHead`` and its variants, + e.g., ``GARPNHead``, etc. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image, each item has shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). 
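+
+        Note:
+            Proposals predicted under different augmentations of the same
+            image are merged by ``merge_aug_proposals`` and rescaled back to
+            the original image size.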
+ """ + samples_per_gpu = len(img_metas[0]) + aug_proposals = [[] for _ in range(samples_per_gpu)] + for x, img_meta in zip(feats, img_metas): + proposal_list = self.simple_test_rpn(x, img_meta) + for i, proposals in enumerate(proposal_list): + aug_proposals[i].append(proposals) + # reorganize the order of 'img_metas' to match the dimensions + # of 'aug_proposals' + aug_img_metas = [] + for i in range(samples_per_gpu): + aug_img_meta = [] + for j in range(len(img_metas)): + aug_img_meta.append(img_metas[j][i]) + aug_img_metas.append(aug_img_meta) + # after merging, proposals will be rescaled to the original image size + merged_proposals = [ + merge_aug_proposals(proposals, aug_img_meta, self.test_cfg) + for proposals, aug_img_meta in zip(aug_proposals, aug_img_metas) + ] + return merged_proposals + + if sys.version_info >= (3, 7): + + async def async_simple_test_rpn(self, x, img_metas): + sleep_interval = self.test_cfg.pop('async_sleep_interval', 0.025) + async with completed( + __name__, 'rpn_head_forward', + sleep_interval=sleep_interval): + rpn_outs = self(x) + + proposal_list = self.get_bboxes(*rpn_outs, img_metas) + return proposal_list + + def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + + Returns: + tuple[Tensor]: ``bboxes`` with shape (n,4), where + 4 represent (tl_x, tl_y, br_x, br_y) + and ``scores`` with shape (n,). + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]['img_shape'] + scale_factor = img_info[0]['scale_factor'] + flip = img_info[0]['flip'] + flip_direction = img_info[0]['flip_direction'] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, + flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores diff --git a/mmcv/models/dense_heads/detr_head.py b/mmcv/models/dense_heads/detr_head.py new file mode 100644 index 0000000..f19a3f6 --- /dev/null +++ b/mmcv/models/dense_heads/detr_head.py @@ -0,0 +1,843 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.bricks import Conv2d, Linear, build_activation_layer +from mmcv.models.bricks.transformer import FFN, build_positional_encoding +from mmcv.utils import force_fp32 + +from mmcv.core.utils import multi_apply, reduce_mean +from mmcv.core.bbox.builder import (build_assigner, build_sampler) +from mmcv.core.bbox.transforms import (bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh) +from mmcv.models.utils import build_transformer +from ..builder import HEADS, build_loss +from .anchor_free_head import AnchorFreeHead + + +@HEADS.register_module() +class DETRHead(AnchorFreeHead): + """Implements the DETR transformer head. + + See `paper: End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_classes (int): Number of categories excluding the background. + in_channels (int): Number of channels in the input feature map. + num_query (int): Number of query in Transformer. + num_reg_fcs (int, optional): Number of fully-connected layers used in + `FFN`, which is then used for the regression head. Default 2. + transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. + Default: None. 
+ sync_cls_avg_factor (bool): Whether to sync the avg_factor of + all ranks. Default to False. + positional_encoding (obj:`mmcv.ConfigDict`|dict): + Config for position encoding. + loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the + classification loss. Default `CrossEntropyLoss`. + loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the + regression loss. Default `L1Loss`. + loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the + regression iou loss. Default `GIoULoss`. + tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of + transformer head. + test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of + transformer head. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + _version = 2 + + def __init__(self, + num_classes, + in_channels, + num_query=100, + num_reg_fcs=2, + transformer=None, + sync_cls_avg_factor=False, + positional_encoding=dict( + type='SinePositionalEncoding', + num_feats=128, + normalize=True), + loss_cls=dict( + type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + train_cfg=dict( + assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict( + type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100), + init_cfg=None, + **kwargs): + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since it brings inconvenience when the initialization of + # `AnchorFreeHead` is called. + super(AnchorFreeHead, self).__init__(init_cfg) + self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + class_weight = loss_cls.get('class_weight', None) + if class_weight is not None and (self.__class__ is DETRHead): + assert isinstance(class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) + assert isinstance(bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(bg_cls_weight)}.' + class_weight = torch.ones(num_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_classes] = bg_cls_weight + loss_cls.update({'class_weight': class_weight}) + if 'bg_cls_weight' in loss_cls: + loss_cls.pop('bg_cls_weight') + self.bg_cls_weight = bg_cls_weight + + if train_cfg: + assert 'assigner' in train_cfg, 'assigner should be provided '\ + 'when train_cfg is set.' + assigner = train_cfg['assigner'] + assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \ + 'The classification weight for loss and matcher should be' \ + 'exactly the same.' + assert loss_bbox['loss_weight'] == assigner['reg_cost'][ + 'weight'], 'The regression L1 weight for loss and matcher ' \ + 'should be exactly the same.' + assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \ + 'The regression iou weight for loss and matcher should be' \ + 'exactly the same.' 
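+            # The assigner (HungarianAssigner by default) performs one-to-one
+            # matching between queries and ground-truth boxes, so its cost
+            # weights must mirror the loss weights checked above.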
+ self.assigner = build_assigner(assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.num_query = num_query + self.num_classes = num_classes + self.in_channels = in_channels + self.num_reg_fcs = num_reg_fcs + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.fp16_enabled = False + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_iou = build_loss(loss_iou) + + if self.loss_cls.use_sigmoid: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.act_cfg = transformer.get('act_cfg', + dict(type='ReLU', inplace=True)) + self.activate = build_activation_layer(self.act_cfg) + self.positional_encoding = build_positional_encoding( + positional_encoding) + self.transformer = build_transformer(transformer) + self.embed_dims = self.transformer.embed_dims + assert 'num_feats' in positional_encoding + num_feats = positional_encoding['num_feats'] + assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ + f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ + f' and {num_feats}.' + self._init_layers() + + def _init_layers(self): + """Initialize layers of the transformer head.""" + self.input_proj = Conv2d( + self.in_channels, self.embed_dims, kernel_size=1) + self.fc_cls = Linear(self.embed_dims, self.cls_out_channels) + self.reg_ffn = FFN( + self.embed_dims, + self.embed_dims, + self.num_reg_fcs, + self.act_cfg, + dropout=0.0, + add_residual=False) + self.fc_reg = Linear(self.embed_dims, 4) + self.query_embedding = nn.Embedding(self.num_query, self.embed_dims) + + def init_weights(self): + """Initialize weights of the transformer head.""" + # The initialization for transformer is important + self.transformer.init_weights() + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """load checkpoints.""" + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since `AnchorFreeHead._load_from_state_dict` should not be + # called here. Invoking the default `Module._load_from_state_dict` + # is enough. + + # Names of some parameters in has been changed. + version = local_metadata.get('version', None) + if (version is None or version < 2) and self.__class__ is DETRHead: + convert_dict = { + '.self_attn.': '.attentions.0.', + '.ffn.': '.ffns.0.', + '.multihead_attn.': '.attentions.1.', + '.decoder.norm.': '.decoder.post_norm.' + } + state_dict_keys = list(state_dict.keys()) + for k in state_dict_keys: + for ori_key, convert_key in convert_dict.items(): + if ori_key in k: + convert_key = k.replace(ori_key, convert_key) + state_dict[convert_key] = state_dict[k] + del state_dict[k] + + super(AnchorFreeHead, + self)._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, + unexpected_keys, error_msgs) + + def forward(self, feats, img_metas): + """Forward function. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): List of image information. + + Returns: + tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels. + + - all_cls_scores_list (list[Tensor]): Classification scores \ + for each scale level. Each is a 4D-tensor with shape \ + [nb_dec, bs, num_query, cls_out_channels]. Note \ + `cls_out_channels` should includes background. 
+ - all_bbox_preds_list (list[Tensor]): Sigmoid regression \ + outputs for each scale level. Each is a 4D-tensor with \ + normalized coordinate format (cx, cy, w, h) and shape \ + [nb_dec, bs, num_query, 4]. + """ + num_levels = len(feats) + img_metas_list = [img_metas for _ in range(num_levels)] + return multi_apply(self.forward_single, feats, img_metas_list) + + def forward_single(self, x, img_metas): + """"Forward function for a single feature level. + + Args: + x (Tensor): Input feature from backbone's single stage, shape + [bs, c, h, w]. + img_metas (list[dict]): List of image information. + + Returns: + all_cls_scores (Tensor): Outputs from the classification head, + shape [nb_dec, bs, num_query, cls_out_channels]. Note + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h). + Shape [nb_dec, bs, num_query, 4]. + """ + # construct binary masks which used for the transformer. + # NOTE following the official DETR repo, non-zero values representing + # ignored positions, while zero values means valid positions. + batch_size = x.size(0) + input_img_h, input_img_w = img_metas[0]['batch_input_shape'] + masks = x.new_ones((batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w, _ = img_metas[img_id]['img_shape'] + masks[img_id, :img_h, :img_w] = 0 + + x = self.input_proj(x) + # interpolate masks to have the same spatial shape with x + masks = F.interpolate( + masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1) + # position encoding + pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w] + # outs_dec: [nb_dec, bs, num_query, embed_dim] + outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight, + pos_embed) + + all_cls_scores = self.fc_cls(outs_dec) + all_bbox_preds = self.fc_reg(self.activate( + self.reg_ffn(outs_dec))).sigmoid() + return all_cls_scores, all_bbox_preds + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list')) + def loss(self, + all_cls_scores_list, + all_bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore=None): + """"Loss function. + + Only outputs from the last feature level are used for computing + losses by default. + + Args: + all_cls_scores_list (list[Tensor]): Classification outputs + for each feature level. Each is a 4D-tensor with shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds_list (list[Tensor]): Sigmoid regression + outputs for each feature level. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # NOTE defaultly only the outputs from the last feature scale is used. + all_cls_scores = all_cls_scores_list[-1] + all_bbox_preds = all_bbox_preds_list[-1] + assert gt_bboxes_ignore is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' 
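+        # The ground truths are replicated once per decoder layer below, so
+        # that ``loss_single`` supervises every intermediate decoder output
+        # (auxiliary deep supervision), not only the last layer.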
+ + num_dec_layers = len(all_cls_scores) + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + img_metas_list = [img_metas for _ in range(num_dec_layers)] + + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, img_metas_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1], + losses_bbox[:-1], + losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + img_metas, gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(img_metas, bbox_preds): + img_h, img_w, _ = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou( + bboxes, bboxes_gt, bbox_weights, avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox( + bbox_preds, bbox_targets, bbox_weights, avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. 
+ - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_bboxes, + gt_labels, + img_meta, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + img_meta (dict): Meta information for one image. + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, img_meta, + gt_bboxes_ignore) + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + img_h, img_w, _ = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
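+        # For example, with an 800x600 (w x h) input, a GT box
+        # (x1, y1, x2, y2) = (0, 0, 200, 100) becomes roughly
+        # (cx, cy, w, h) = (0.125, 0.083, 0.25, 0.167) after dividing by
+        # (img_w, img_h, img_w, img_h) and converting to cxcywh.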
+ factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + # over-write because img_metas are needed as inputs for bbox_head. + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """Forward function for training mode. + + Args: + x (list[Tensor]): Features from backbone. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert proposal_cfg is None, '"proposal_cfg" must be None' + outs = self(x, img_metas) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list')) + def get_bboxes(self, + all_cls_scores_list, + all_bbox_preds_list, + img_metas, + rescale=False): + """Transform network outputs for a batch into bbox predictions. + + Args: + all_cls_scores_list (list[Tensor]): Classification outputs + for each feature level. Each is a 4D-tensor with shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds_list (list[Tensor]): Sigmoid regression + outputs for each feature level. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If True, return boxes in original + image space. Default False. + + Returns: + list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \ + The first item is an (n, 5) tensor, where the first 4 columns \ + are bounding box positions (tl_x, tl_y, br_x, br_y) and the \ + 5-th column is a score between 0 and 1. The second item is a \ + (n,) tensor where each item is the predicted class label of \ + the corresponding box. + """ + # NOTE defaultly only using outputs from the last feature level, + # and only the outputs from the last decoder layer is used. + cls_scores = all_cls_scores_list[-1][-1] + bbox_preds = all_bbox_preds_list[-1][-1] + + result_list = [] + for img_id in range(len(img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score, bbox_pred, + img_shape, scale_factor, + rescale) + result_list.append(proposals) + + return result_list + + def _get_bboxes_single(self, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False): + """Transform outputs from the last decoder layer into bbox predictions + for each image. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_query, cls_out_channels]. 
+ bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_query, 4]. + img_shape (tuple[int]): Shape of input image, (height, width, 3). + scale_factor (ndarray, optional): Scale factor of the image arange + as (w_scale, h_scale, w_scale, h_scale). + rescale (bool, optional): If True, return boxes in original image + space. Default False. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. + + - det_bboxes: Predicted bboxes with shape [num_query, 5], \ + where the first 4 columns are bounding box positions \ + (tl_x, tl_y, br_x, br_y) and the 5-th column are scores \ + between 0 and 1. + - det_labels: Predicted labels of the corresponding box with \ + shape [num_query]. + """ + assert len(cls_score) == len(bbox_pred) + max_per_img = self.test_cfg.get('max_per_img', self.num_query) + # exclude background + if self.loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + scores, indexes = cls_score.view(-1).topk(max_per_img) + det_labels = indexes % self.num_classes + bbox_index = indexes // self.num_classes + bbox_pred = bbox_pred[bbox_index] + else: + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + scores, bbox_index = scores.topk(max_per_img) + bbox_pred = bbox_pred[bbox_index] + det_labels = det_labels[bbox_index] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + det_bboxes /= det_bboxes.new_tensor(scale_factor) + det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1) + + return det_bboxes, det_labels + + def simple_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes without test-time augmentation. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,) + """ + # forward of this head requires img_metas + outs = self.forward(feats, img_metas) + results_list = self.get_bboxes(*outs, img_metas, rescale=rescale) + return results_list + + def forward_onnx(self, feats, img_metas): + """Forward function for exporting to ONNX. + + Over-write `forward` because: `masks` is directly created with + zero (valid position tag) and has the same spatial size as `x`. + Thus the construction of `masks` is different from that in `forward`. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): List of image information. + + Returns: + tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels. + + - all_cls_scores_list (list[Tensor]): Classification scores \ + for each scale level. Each is a 4D-tensor with shape \ + [nb_dec, bs, num_query, cls_out_channels]. Note \ + `cls_out_channels` should includes background. + - all_bbox_preds_list (list[Tensor]): Sigmoid regression \ + outputs for each scale level. 
Each is a 4D-tensor with \ + normalized coordinate format (cx, cy, w, h) and shape \ + [nb_dec, bs, num_query, 4]. + """ + num_levels = len(feats) + img_metas_list = [img_metas for _ in range(num_levels)] + return multi_apply(self.forward_single_onnx, feats, img_metas_list) + + def forward_single_onnx(self, x, img_metas): + """"Forward function for a single feature level with ONNX exportation. + + Args: + x (Tensor): Input feature from backbone's single stage, shape + [bs, c, h, w]. + img_metas (list[dict]): List of image information. + + Returns: + all_cls_scores (Tensor): Outputs from the classification head, + shape [nb_dec, bs, num_query, cls_out_channels]. Note + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h). + Shape [nb_dec, bs, num_query, 4]. + """ + # Note `img_shape` is not dynamically traceable to ONNX, + # since the related augmentation was done with numpy under + # CPU. Thus `masks` is directly created with zeros (valid tag) + # and the same spatial shape as `x`. + # The difference between torch and exported ONNX model may be + # ignored, since the same performance is achieved (e.g. + # 40.1 vs 40.1 for DETR) + batch_size = x.size(0) + h, w = x.size()[-2:] + masks = x.new_zeros((batch_size, h, w)) # [B,h,w] + + x = self.input_proj(x) + # interpolate masks to have the same spatial shape with x + masks = F.interpolate( + masks.unsqueeze(1), size=x.shape[-2:]).to(torch.bool).squeeze(1) + pos_embed = self.positional_encoding(masks) + outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight, + pos_embed) + + all_cls_scores = self.fc_cls(outs_dec) + all_bbox_preds = self.fc_reg(self.activate( + self.reg_ffn(outs_dec))).sigmoid() + return all_cls_scores, all_bbox_preds + + def onnx_export(self, all_cls_scores_list, all_bbox_preds_list, img_metas): + """Transform network outputs into bbox predictions, with ONNX + exportation. + + Args: + all_cls_scores_list (list[Tensor]): Classification outputs + for each feature level. Each is a 4D-tensor with shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds_list (list[Tensor]): Sigmoid regression + outputs for each feature level. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + img_metas (list[dict]): Meta information of each image. + + Returns: + tuple[Tensor, Tensor]: dets of shape [N, num_det, 5] + and class labels of shape [N, num_det]. + """ + assert len(img_metas) == 1, \ + 'Only support one input image while in exporting to ONNX' + + cls_scores = all_cls_scores_list[-1][-1] + bbox_preds = all_bbox_preds_list[-1][-1] + + # Note `img_shape` is not dynamically traceable to ONNX, + # here `img_shape_for_onnx` (padded shape of image tensor) + # is used. 
+        img_shape = img_metas[0]['img_shape_for_onnx']
+        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
+        batch_size = cls_scores.size(0)
+        # `batch_index_offset` is used for the gather of concatenated tensor
+        batch_index_offset = torch.arange(batch_size).to(
+            cls_scores.device) * max_per_img
+        batch_index_offset = batch_index_offset.unsqueeze(1).expand(
+            batch_size, max_per_img)
+
+        # supports dynamical batch inference
+        if self.loss_cls.use_sigmoid:
+            cls_scores = cls_scores.sigmoid()
+            scores, indexes = cls_scores.view(batch_size, -1).topk(
+                max_per_img, dim=1)
+            det_labels = indexes % self.num_classes
+            bbox_index = indexes // self.num_classes
+            bbox_index = (bbox_index + batch_index_offset).view(-1)
+            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
+            bbox_preds = bbox_preds.view(batch_size, -1, 4)
+        else:
+            scores, det_labels = F.softmax(
+                cls_scores, dim=-1)[..., :-1].max(-1)
+            scores, bbox_index = scores.topk(max_per_img, dim=1)
+            bbox_index = (bbox_index + batch_index_offset).view(-1)
+            bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
+            det_labels = det_labels.view(-1)[bbox_index]
+            bbox_preds = bbox_preds.view(batch_size, -1, 4)
+            det_labels = det_labels.view(batch_size, -1)
+
+        det_bboxes = bbox_cxcywh_to_xyxy(bbox_preds)
+        # use `img_shape_tensor` for dynamically exporting to ONNX
+        img_shape_tensor = img_shape.flip(0).repeat(2)  # [w,h,w,h]
+        img_shape_tensor = img_shape_tensor.unsqueeze(0).unsqueeze(0).expand(
+            batch_size, det_bboxes.size(1), 4)
+        det_bboxes = det_bboxes * img_shape_tensor
+        # dynamically clip bboxes
+        x1, y1, x2, y2 = det_bboxes.split((1, 1, 1, 1), dim=-1)
+        from mmcv.core.export import dynamic_clip_for_onnx
+        x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, img_shape)
+        det_bboxes = torch.cat([x1, y1, x2, y2], dim=-1)
+        det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(-1)), -1)
+
+        return det_bboxes, det_labels
diff --git a/mmcv/models/dense_heads/free_anchor3d_head.py b/mmcv/models/dense_heads/free_anchor3d_head.py
new file mode 100644
index 0000000..6e9797c
--- /dev/null
+++ b/mmcv/models/dense_heads/free_anchor3d_head.py
@@ -0,0 +1,284 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.runner import force_fp32
+from torch.nn import functional as F
+
+from mmcv.core.bbox import bbox_overlaps_nearest_3d
+from mmcv.models import HEADS
+from .anchor3d_head import Anchor3DHead
+from .train_mixins import get_direction_target
+
+
+@HEADS.register_module()
+class FreeAnchor3DHead(Anchor3DHead):
+    r"""`FreeAnchor <https://arxiv.org/abs/1909.02466>`_ head for 3D detection.
+
+    Note:
+        This implementation is directly modified from the `mmdet implementation
+        `_.
+        We find it also works on 3D detection with minor modifications, i.e.,
+        different hyper-parameters and an additional direction classifier.
+
+    Args:
+        pre_anchor_topk (int): Number of boxes taken in each bag.
+        bbox_thr (float): The threshold of the saturated linear function. It is
+            usually the same as the IoU threshold used in NMS.
+        gamma (float): Gamma parameter in focal loss.
+        alpha (float): Alpha parameter in focal loss.
+        kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`.
+ """ # noqa: E501 + + def __init__(self, + pre_anchor_topk=50, + bbox_thr=0.6, + gamma=2.0, + alpha=0.5, + init_cfg=None, + **kwargs): + super().__init__(init_cfg=init_cfg, **kwargs) + self.pre_anchor_topk = pre_anchor_topk + self.bbox_thr = bbox_thr + self.gamma = gamma + self.alpha = alpha + + @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) + def loss(self, + cls_scores, + bbox_preds, + dir_cls_preds, + gt_bboxes, + gt_labels, + input_metas, + gt_bboxes_ignore=None): + """Calculate loss of FreeAnchor head. + + Args: + cls_scores (list[torch.Tensor]): Classification scores of + different samples. + bbox_preds (list[torch.Tensor]): Box predictions of + different samples + dir_cls_preds (list[torch.Tensor]): Direction predictions of + different samples + gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. + gt_labels (list[torch.Tensor]): Ground truth labels. + input_metas (list[dict]): List of input meta information. + gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth boxes that should be ignored. Defaults to None. + + Returns: + dict[str, torch.Tensor]: Loss items. + + - positive_bag_loss (torch.Tensor): Loss of positive samples. + - negative_bag_loss (torch.Tensor): Loss of negative samples. + """ + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.anchor_generator.num_levels + + anchor_list = self.get_anchors(featmap_sizes, input_metas) + anchors = [torch.cat(anchor) for anchor in anchor_list] + + # concatenate each level + cls_scores = [ + cls_score.permute(0, 2, 3, 1).reshape( + cls_score.size(0), -1, self.num_classes) + for cls_score in cls_scores + ] + bbox_preds = [ + bbox_pred.permute(0, 2, 3, 1).reshape( + bbox_pred.size(0), -1, self.box_code_size) + for bbox_pred in bbox_preds + ] + dir_cls_preds = [ + dir_cls_pred.permute(0, 2, 3, + 1).reshape(dir_cls_pred.size(0), -1, 2) + for dir_cls_pred in dir_cls_preds + ] + + cls_scores = torch.cat(cls_scores, dim=1) + bbox_preds = torch.cat(bbox_preds, dim=1) + dir_cls_preds = torch.cat(dir_cls_preds, dim=1) + + cls_prob = torch.sigmoid(cls_scores) + box_prob = [] + num_pos = 0 + positive_losses = [] + for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_, + dir_cls_preds_) in enumerate( + zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds, + dir_cls_preds)): + + gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device) + + with torch.no_grad(): + # box_localization: a_{j}^{loc}, shape: [j, 4] + pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_) + + # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] + object_box_iou = bbox_overlaps_nearest_3d( + gt_bboxes_, pred_boxes) + + # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] + t1 = self.bbox_thr + t2 = object_box_iou.max( + dim=1, keepdim=True).values.clamp(min=t1 + 1e-6) + object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp( + min=0, max=1) + + # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] + num_obj = gt_labels_.size(0) + indices = torch.stack( + [torch.arange(num_obj).type_as(gt_labels_), gt_labels_], + dim=0) + + object_cls_box_prob = torch.sparse_coo_tensor( + indices, object_box_prob) + + # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] + """ + from "start" to "end" implement: + image_box_iou = torch.sparse.max(object_cls_box_prob, + dim=0).t() + + """ + # start + box_cls_prob = torch.sparse.sum( + object_cls_box_prob, dim=0).to_dense() + + indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() + if indices.numel() == 0: + 
image_box_prob = torch.zeros( + anchors_.size(0), + self.num_classes).type_as(object_box_prob) + else: + nonzero_box_prob = torch.where( + (gt_labels_.unsqueeze(dim=-1) == indices[0]), + object_box_prob[:, indices[1]], + torch.tensor( + [0]).type_as(object_box_prob)).max(dim=0).values + + # upmap to shape [j, c] + image_box_prob = torch.sparse_coo_tensor( + indices.flip([0]), + nonzero_box_prob, + size=(anchors_.size(0), self.num_classes)).to_dense() + # end + + box_prob.append(image_box_prob) + + # construct bags for objects + match_quality_matrix = bbox_overlaps_nearest_3d( + gt_bboxes_, anchors_) + _, matched = torch.topk( + match_quality_matrix, + self.pre_anchor_topk, + dim=1, + sorted=False) + del match_quality_matrix + + # matched_cls_prob: P_{ij}^{cls} + matched_cls_prob = torch.gather( + cls_prob_[matched], 2, + gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, + 1)).squeeze(2) + + # matched_box_prob: P_{ij}^{loc} + matched_anchors = anchors_[matched] + matched_object_targets = self.bbox_coder.encode( + matched_anchors, + gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors)) + + # direction classification loss + loss_dir = None + if self.use_direction_classifier: + # also calculate direction prob: P_{ij}^{dir} + matched_dir_targets = get_direction_target( + matched_anchors, + matched_object_targets, + self.dir_offset, + one_hot=False) + loss_dir = self.loss_dir( + dir_cls_preds_[matched].transpose(-2, -1), + matched_dir_targets, + reduction_override='none') + + # generate bbox weights + if self.diff_rad_by_sin: + bbox_preds_[matched], matched_object_targets = \ + self.add_sin_difference( + bbox_preds_[matched], matched_object_targets) + bbox_weights = matched_anchors.new_ones(matched_anchors.size()) + # Use pop is not right, check performance + code_weight = self.train_cfg.get('code_weight', None) + if code_weight: + bbox_weights = bbox_weights * bbox_weights.new_tensor( + code_weight) + loss_bbox = self.loss_bbox( + bbox_preds_[matched], + matched_object_targets, + bbox_weights, + reduction_override='none').sum(-1) + + if loss_dir is not None: + loss_bbox += loss_dir + matched_box_prob = torch.exp(-loss_bbox) + + # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} + num_pos += len(gt_bboxes_) + positive_losses.append( + self.positive_bag_loss(matched_cls_prob, matched_box_prob)) + + positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) + + # box_prob: P{a_{j} \in A_{+}} + box_prob = torch.stack(box_prob, dim=0) + + # negative_loss: + # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| + negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max( + 1, num_pos * self.pre_anchor_topk) + + losses = { + 'positive_bag_loss': positive_loss, + 'negative_bag_loss': negative_loss + } + return losses + + def positive_bag_loss(self, matched_cls_prob, matched_box_prob): + """Generate positive bag loss. + + Args: + matched_cls_prob (torch.Tensor): Classification probability + of matched positive samples. + matched_box_prob (torch.Tensor): Bounding box probability + of matched positive samples. + + Returns: + torch.Tensor: Loss of positive samples. 
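+
+        Note:
+            ``bag_prob`` is the "Mean-max" of ``matched_prob``: weights
+            ``1 / (1 - p)`` are normalized within each bag and used to average
+            ``p``, and the loss is ``-alpha * log(bag_prob)``, implemented as
+            a binary cross entropy against a target of ones.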
+ """ + # bag_prob = Mean-max(matched_prob) + matched_prob = matched_cls_prob * matched_box_prob + weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None) + weight /= weight.sum(dim=1).unsqueeze(dim=-1) + bag_prob = (weight * matched_prob).sum(dim=1) + # positive_bag_loss = -self.alpha * log(bag_prob) + bag_prob = bag_prob.clamp(0, 1) # to avoid bug of BCE, check + return self.alpha * F.binary_cross_entropy( + bag_prob, torch.ones_like(bag_prob), reduction='none') + + def negative_bag_loss(self, cls_prob, box_prob): + """Generate negative bag loss. + + Args: + cls_prob (torch.Tensor): Classification probability + of negative samples. + box_prob (torch.Tensor): Bounding box probability + of negative samples. + + Returns: + torch.Tensor: Loss of negative samples. + """ + prob = cls_prob * (1 - box_prob) + prob = prob.clamp(0, 1) # to avoid bug of BCE, check + negative_bag_loss = prob**self.gamma * F.binary_cross_entropy( + prob, torch.zeros_like(prob), reduction='none') + return (1 - self.alpha) * negative_bag_loss \ No newline at end of file diff --git a/mmcv/models/dense_heads/ga_rpn_head.py b/mmcv/models/dense_heads/ga_rpn_head.py new file mode 100644 index 0000000..7c739de --- /dev/null +++ b/mmcv/models/dense_heads/ga_rpn_head.py @@ -0,0 +1,176 @@ +import copy +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv import ConfigDict +from mmcv.ops import nms + +from ..builder import HEADS +from .guided_anchor_head import GuidedAnchorHead + + +@HEADS.register_module() +class GARPNHead(GuidedAnchorHead): + """Guided-Anchor-based RPN head.""" + + def __init__(self, + in_channels, + init_cfg=dict( + type='Normal', + layer='Conv2d', + std=0.01, + override=dict( + type='Normal', + name='conv_loc', + std=0.01, + bias_prob=0.01)), + **kwargs): + super(GARPNHead, self).__init__( + 1, in_channels, init_cfg=init_cfg, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + super(GARPNHead, self)._init_layers() + + def forward_single(self, x): + """Forward feature of a single scale level.""" + + x = self.rpn_conv(x) + x = F.relu(x, inplace=True) + (cls_score, bbox_pred, shape_pred, + loc_pred) = super(GARPNHead, self).forward_single(x) + return cls_score, bbox_pred, shape_pred, loc_pred + + def loss(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + img_metas, + gt_bboxes_ignore=None): + losses = super(GARPNHead, self).loss( + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + None, + img_metas, + gt_bboxes_ignore=gt_bboxes_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], + loss_rpn_bbox=losses['loss_bbox'], + loss_anchor_shape=losses['loss_shape'], + loss_anchor_loc=losses['loss_loc']) + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + mlvl_masks, + img_shape, + scale_factor, + cfg, + rescale=False): + cfg = self.test_cfg if cfg is None else cfg + + cfg = copy.deepcopy(cfg) + + # deprecate arguments warning + if 'nms' not in cfg or 'max_num' in cfg or 'nms_thr' in cfg: + warnings.warn( + 'In rpn_proposal or test_cfg, ' + 'nms_thr has been moved to a dict named nms as ' + 'iou_threshold, max_num has been renamed as max_per_img, ' + 'name of original arguments and the way to specify ' + 'iou_threshold of NMS will be deprecated.') + if 'nms' not in cfg: + cfg.nms = ConfigDict(dict(type='nms', iou_threshold=cfg.nms_thr)) + if 'max_num' in cfg: + if 'max_per_img' in cfg: + 
assert cfg.max_num == cfg.max_per_img, f'You ' \ + f'set max_num and max_per_img at the same time, ' \ + f'but get {cfg.max_num} ' \ + f'and {cfg.max_per_img} respectively' \ + 'Please delete max_num which will be deprecated.' + else: + cfg.max_per_img = cfg.max_num + if 'nms_thr' in cfg: + assert cfg.nms.iou_threshold == cfg.nms_thr, f'You set ' \ + f'iou_threshold in nms and ' \ + f'nms_thr at the same time, but get ' \ + f'{cfg.nms.iou_threshold} and {cfg.nms_thr}' \ + f' respectively. Please delete the ' \ + f'nms_thr which will be deprecated.' + + assert cfg.nms.get('type', 'nms') == 'nms', 'GARPNHead only support ' \ + 'naive nms.' + + mlvl_proposals = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + anchors = mlvl_anchors[idx] + mask = mlvl_masks[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = rpn_cls_score.softmax(dim=1)[:, :-1] + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, + 4)[mask, :] + if scores.dim() == 0: + rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0) + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. scores + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + _, topk_inds = scores.topk(cfg.nms_pre) + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + scores = scores[topk_inds] + # get proposals w.r.t. 
anchors and rpn_bbox_pred + proposals = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_shape) + # filter out too small bboxes + if cfg.min_bbox_size >= 0: + w = proposals[:, 2] - proposals[:, 0] + h = proposals[:, 3] - proposals[:, 1] + valid_inds = torch.nonzero( + (w > cfg.min_bbox_size) & (h > cfg.min_bbox_size), + as_tuple=False).squeeze() + proposals = proposals[valid_inds, :] + scores = scores[valid_inds] + # NMS in current level + proposals, _ = nms(proposals, scores, cfg.nms.iou_threshold) + proposals = proposals[:cfg.nms_post, :] + mlvl_proposals.append(proposals) + proposals = torch.cat(mlvl_proposals, 0) + if cfg.get('nms_across_levels', False): + # NMS across multi levels + proposals, _ = nms(proposals[:, :4], proposals[:, -1], + cfg.nms.iou_threshold) + proposals = proposals[:cfg.max_per_img, :] + else: + scores = proposals[:, 4] + num = min(cfg.max_per_img, proposals.shape[0]) + _, topk_inds = scores.topk(num) + proposals = proposals[topk_inds, :] + return proposals diff --git a/mmcv/models/dense_heads/guided_anchor_head.py b/mmcv/models/dense_heads/guided_anchor_head.py new file mode 100644 index 0000000..b36b957 --- /dev/null +++ b/mmcv/models/dense_heads/guided_anchor_head.py @@ -0,0 +1,862 @@ +import torch +import torch.nn as nn +# from mmcv.ops import DeformConv2d, MaskedConv2d +from mmcv.ops.deform_conv import DeformConv2d +from mmcv.ops.masked_conv import MaskedConv2d +from mmcv.models.backbones import BaseModule +from mmcv.utils import force_fp32 + + +from mmcv.core.anchor import anchor_inside_flags, build_anchor_generator, images_to_levels, calc_region +from mmcv.core.utils import multi_apply, reduce_mean, unmap +from mmcv.core.bbox.builder import build_assigner, build_sampler +from mmcv.core.post_processing.bbox_nms import multiclass_nms +from ..builder import HEADS, build_loss +from .anchor_head import AnchorHead + + +class FeatureAdaption(BaseModule): + """Feature Adaption Module. + + Feature Adaption Module is implemented based on DCN v1. + It uses anchor shape prediction rather than feature map to + predict offsets of deform conv layer. + + Args: + in_channels (int): Number of channels in the input feature map. + out_channels (int): Number of channels in the output feature map. + kernel_size (int): Deformable conv kernel size. + deform_groups (int): Deformable conv group size. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + deform_groups=4, + init_cfg=dict( + type='Normal', + layer='Conv2d', + std=0.1, + override=dict( + type='Normal', name='conv_adaption', std=0.01))): + super(FeatureAdaption, self).__init__(init_cfg) + offset_channels = kernel_size * kernel_size * 2 + self.conv_offset = nn.Conv2d( + 2, deform_groups * offset_channels, 1, bias=False) + self.conv_adaption = DeformConv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + deform_groups=deform_groups) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, shape): + offset = self.conv_offset(shape.detach()) + x = self.relu(self.conv_adaption(x, offset)) + return x + + +@HEADS.register_module() +class GuidedAnchorHead(AnchorHead): + """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.). + + This GuidedAnchorHead will predict high-quality feature guided + anchors and locations where anchors will be kept in inference. + There are mainly 3 categories of bounding-boxes. + + - Sampled 9 pairs for target assignment. 
(approxes) + - The square boxes where the predicted anchors are based on. (squares) + - Guided anchors. + + Please refer to https://arxiv.org/abs/1901.03278 for more details. + + Args: + num_classes (int): Number of classes. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. + approx_anchor_generator (dict): Config dict for approx generator + square_anchor_generator (dict): Config dict for square generator + anchor_coder (dict): Config dict for anchor coder + bbox_coder (dict): Config dict for bbox coder + reg_decoded_bbox (bool): If true, the regression loss would be + applied directly on decoded bounding boxes, converting both + the predicted boxes and regression targets to absolute + coordinates format. Default False. It should be `True` when + using `IoULoss`, `GIoULoss`, or `DIoULoss` in the bbox head. + deform_groups: (int): Group number of DCN in + FeatureAdaption module. + loc_filter_thr (float): Threshold to filter out unconcerned regions. + loss_loc (dict): Config of location loss. + loss_shape (dict): Config of anchor shape loss. + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of bbox regression loss. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__( + self, + num_classes, + in_channels, + feat_channels=256, + approx_anchor_generator=dict( + type='AnchorGenerator', + octave_base_scale=8, + scales_per_octave=3, + ratios=[0.5, 1.0, 2.0], + strides=[4, 8, 16, 32, 64]), + square_anchor_generator=dict( + type='AnchorGenerator', + ratios=[1.0], + scales=[8], + strides=[4, 8, 16, 32, 64]), + anchor_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0] + ), + bbox_coder=dict( + type='DeltaXYWHBBoxCoder', + target_means=[.0, .0, .0, .0], + target_stds=[1.0, 1.0, 1.0, 1.0] + ), + reg_decoded_bbox=False, + deform_groups=4, + loc_filter_thr=0.01, + train_cfg=None, + test_cfg=None, + loss_loc=dict( + type='FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + loss_weight=1.0), + loss_shape=dict(type='BoundedIoULoss', beta=0.2, loss_weight=1.0), + loss_cls=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), + loss_bbox=dict(type='SmoothL1Loss', beta=1.0, + loss_weight=1.0), + init_cfg=dict(type='Normal', layer='Conv2d', std=0.01, + override=dict(type='Normal', + name='conv_loc', + std=0.01, + bias_prob=0.01))): # yapf: disable + super(AnchorHead, self).__init__(init_cfg) + self.in_channels = in_channels + self.num_classes = num_classes + self.feat_channels = feat_channels + self.deform_groups = deform_groups + self.loc_filter_thr = loc_filter_thr + + # build approx_anchor_generator and square_anchor_generator + assert (approx_anchor_generator['octave_base_scale'] == + square_anchor_generator['scales'][0]) + assert (approx_anchor_generator['strides'] == + square_anchor_generator['strides']) + self.approx_anchor_generator = build_anchor_generator( + approx_anchor_generator) + self.square_anchor_generator = build_anchor_generator( + square_anchor_generator) + self.approxs_per_octave = self.approx_anchor_generator \ + .num_base_anchors[0] + + self.reg_decoded_bbox = reg_decoded_bbox + + # one anchor per location + self.num_anchors = 1 + self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) + self.loc_focal_loss = loss_loc['type'] in ['FocalLoss'] + self.sampling = loss_cls['type'] not in ['FocalLoss'] + self.ga_sampling = train_cfg is not None and hasattr( + train_cfg, 
'ga_sampler') + if self.use_sigmoid_cls: + self.cls_out_channels = self.num_classes + else: + self.cls_out_channels = self.num_classes + 1 + + # build bbox_coder + self.anchor_coder = build_bbox_coder(anchor_coder) + self.bbox_coder = build_bbox_coder(bbox_coder) + + # build losses + self.loss_loc = build_loss(loss_loc) + self.loss_shape = build_loss(loss_shape) + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if self.train_cfg: + self.assigner = build_assigner(self.train_cfg.assigner) + # use PseudoSampler when sampling is False + if self.sampling and hasattr(self.train_cfg, 'sampler'): + sampler_cfg = self.train_cfg.sampler + else: + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + + self.ga_assigner = build_assigner(self.train_cfg.ga_assigner) + if self.ga_sampling: + ga_sampler_cfg = self.train_cfg.ga_sampler + else: + ga_sampler_cfg = dict(type='PseudoSampler') + self.ga_sampler = build_sampler(ga_sampler_cfg, context=self) + + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self): + self.relu = nn.ReLU(inplace=True) + self.conv_loc = nn.Conv2d(self.in_channels, 1, 1) + self.conv_shape = nn.Conv2d(self.in_channels, self.num_anchors * 2, 1) + self.feature_adaption = FeatureAdaption( + self.in_channels, + self.feat_channels, + kernel_size=3, + deform_groups=self.deform_groups) + self.conv_cls = MaskedConv2d(self.feat_channels, + self.num_anchors * self.cls_out_channels, + 1) + self.conv_reg = MaskedConv2d(self.feat_channels, self.num_anchors * 4, + 1) + + def forward_single(self, x): + loc_pred = self.conv_loc(x) + shape_pred = self.conv_shape(x) + x = self.feature_adaption(x, shape_pred) + # masked conv is only used during inference for speed-up + if not self.training: + mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr + else: + mask = None + cls_score = self.conv_cls(x, mask) + bbox_pred = self.conv_reg(x, mask) + return cls_score, bbox_pred, shape_pred, loc_pred + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_sampled_approxs(self, featmap_sizes, img_metas, device='cuda'): + """Get sampled approxs and inside flags according to feature map sizes. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + img_metas (list[dict]): Image meta info. 
+ device (torch.device | str): device for returned tensors + + Returns: + tuple: approxes of each image, inside flags of each image + """ + num_imgs = len(img_metas) + + # since feature map sizes of all images are the same, we only compute + # approxes for one time + multi_level_approxs = self.approx_anchor_generator.grid_anchors( + featmap_sizes, device=device) + approxs_list = [multi_level_approxs for _ in range(num_imgs)] + + # for each image, we compute inside flags of multi level approxes + inside_flag_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_flags = [] + multi_level_approxs = approxs_list[img_id] + + # obtain valid flags for each approx first + multi_level_approx_flags = self.approx_anchor_generator \ + .valid_flags(featmap_sizes, + img_meta['pad_shape'], + device=device) + + for i, flags in enumerate(multi_level_approx_flags): + approxs = multi_level_approxs[i] + inside_flags_list = [] + for i in range(self.approxs_per_octave): + split_valid_flags = flags[i::self.approxs_per_octave] + split_approxs = approxs[i::self.approxs_per_octave, :] + inside_flags = anchor_inside_flags( + split_approxs, split_valid_flags, + img_meta['img_shape'][:2], + self.train_cfg.allowed_border) + inside_flags_list.append(inside_flags) + # inside_flag for a position is true if any anchor in this + # position is true + inside_flags = ( + torch.stack(inside_flags_list, 0).sum(dim=0) > 0) + multi_level_flags.append(inside_flags) + inside_flag_list.append(multi_level_flags) + return approxs_list, inside_flag_list + + def get_anchors(self, + featmap_sizes, + shape_preds, + loc_preds, + img_metas, + use_loc_filter=False, + device='cuda'): + """Get squares according to feature map sizes and guided anchors. + + Args: + featmap_sizes (list[tuple]): Multi-level feature map sizes. + shape_preds (list[tensor]): Multi-level shape predictions. + loc_preds (list[tensor]): Multi-level location predictions. + img_metas (list[dict]): Image meta info. + use_loc_filter (bool): Use loc filter or not. + device (torch.device | str): device for returned tensors + + Returns: + tuple: square approxs of each image, guided anchors of each image, + loc masks of each image + """ + num_imgs = len(img_metas) + num_levels = len(featmap_sizes) + + # since feature map sizes of all images are the same, we only compute + # squares for one time + multi_level_squares = self.square_anchor_generator.grid_anchors( + featmap_sizes, device=device) + squares_list = [multi_level_squares for _ in range(num_imgs)] + + # for each image, we compute multi level guided anchors + guided_anchors_list = [] + loc_mask_list = [] + for img_id, img_meta in enumerate(img_metas): + multi_level_guided_anchors = [] + multi_level_loc_mask = [] + for i in range(num_levels): + squares = squares_list[img_id][i] + shape_pred = shape_preds[i][img_id] + loc_pred = loc_preds[i][img_id] + guided_anchors, loc_mask = self._get_guided_anchors_single( + squares, + shape_pred, + loc_pred, + use_loc_filter=use_loc_filter) + multi_level_guided_anchors.append(guided_anchors) + multi_level_loc_mask.append(loc_mask) + guided_anchors_list.append(multi_level_guided_anchors) + loc_mask_list.append(multi_level_loc_mask) + return squares_list, guided_anchors_list, loc_mask_list + + def _get_guided_anchors_single(self, + squares, + shape_pred, + loc_pred, + use_loc_filter=False): + """Get guided anchors and loc masks for a single level. + + Args: + square (tensor): Squares of a single level. + shape_pred (tensor): Shape predictions of a single level. 
+ loc_pred (tensor): Loc predictions of a single level. + use_loc_filter (list[tensor]): Use loc filter or not. + + Returns: + tuple: guided anchors, location masks + """ + # calculate location filtering mask + loc_pred = loc_pred.sigmoid().detach() + if use_loc_filter: + loc_mask = loc_pred >= self.loc_filter_thr + else: + loc_mask = loc_pred >= 0.0 + mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_anchors) + mask = mask.contiguous().view(-1) + # calculate guided anchors + squares = squares[mask] + anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view( + -1, 2).detach()[mask] + bbox_deltas = anchor_deltas.new_full(squares.size(), 0) + bbox_deltas[:, 2:] = anchor_deltas + guided_anchors = self.anchor_coder.decode( + squares, bbox_deltas, wh_ratio_clip=1e-6) + return guided_anchors, mask + + def ga_loc_targets(self, gt_bboxes_list, featmap_sizes): + """Compute location targets for guided anchoring. + + Each feature map is divided into positive, negative and ignore regions. + - positive regions: target 1, weight 1 + - ignore regions: target 0, weight 0 + - negative regions: target 0, weight 0.1 + + Args: + gt_bboxes_list (list[Tensor]): Gt bboxes of each image. + featmap_sizes (list[tuple]): Multi level sizes of each feature + maps. + + Returns: + tuple + """ + anchor_scale = self.approx_anchor_generator.octave_base_scale + anchor_strides = self.approx_anchor_generator.strides + # Currently only supports same stride in x and y direction. + for stride in anchor_strides: + assert (stride[0] == stride[1]) + anchor_strides = [stride[0] for stride in anchor_strides] + + center_ratio = self.train_cfg.center_ratio + ignore_ratio = self.train_cfg.ignore_ratio + img_per_gpu = len(gt_bboxes_list) + num_lvls = len(featmap_sizes) + r1 = (1 - center_ratio) / 2 + r2 = (1 - ignore_ratio) / 2 + all_loc_targets = [] + all_loc_weights = [] + all_ignore_map = [] + for lvl_id in range(num_lvls): + h, w = featmap_sizes[lvl_id] + loc_targets = torch.zeros( + img_per_gpu, + 1, + h, + w, + device=gt_bboxes_list[0].device, + dtype=torch.float32) + loc_weights = torch.full_like(loc_targets, -1) + ignore_map = torch.zeros_like(loc_targets) + all_loc_targets.append(loc_targets) + all_loc_weights.append(loc_weights) + all_ignore_map.append(ignore_map) + for img_id in range(img_per_gpu): + gt_bboxes = gt_bboxes_list[img_id] + scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) * + (gt_bboxes[:, 3] - gt_bboxes[:, 1])) + min_anchor_size = scale.new_full( + (1, ), float(anchor_scale * anchor_strides[0])) + # assign gt bboxes to different feature levels w.r.t. 
their scales + target_lvls = torch.floor( + torch.log2(scale) - torch.log2(min_anchor_size) + 0.5) + target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long() + for gt_id in range(gt_bboxes.size(0)): + lvl = target_lvls[gt_id].item() + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl] + # calculate ignore regions + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[lvl]) + # calculate positive (center) regions + ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region( + gt_, r1, featmap_sizes[lvl]) + all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 0 + all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1, + ctr_x1:ctr_x2 + 1] = 1 + # calculate ignore map on nearby low level feature + if lvl > 0: + d_lvl = lvl - 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[d_lvl]) + all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + # calculate ignore map on nearby high level feature + if lvl < num_lvls - 1: + u_lvl = lvl + 1 + # rescaled to corresponding feature map + gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl] + ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region( + gt_, r2, featmap_sizes[u_lvl]) + all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1, + ignore_x1:ignore_x2 + 1] = 1 + for lvl_id in range(num_lvls): + # ignore negative regions w.r.t. ignore map + all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0) + & (all_ignore_map[lvl_id] > 0)] = 0 + # set negative regions with weight 0.1 + all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1 + # loc average factor to balance loss + loc_avg_factor = sum( + [t.size(0) * t.size(-1) * t.size(-2) + for t in all_loc_targets]) / 200 + return all_loc_targets, all_loc_weights, loc_avg_factor + + def _ga_shape_target_single(self, + flat_approxs, + inside_flags, + flat_squares, + gt_bboxes, + gt_bboxes_ignore, + img_meta, + unmap_outputs=True): + """Compute guided anchoring targets. + + This function returns sampled anchors and gt bboxes directly + rather than calculates regression targets. + + Args: + flat_approxs (Tensor): flat approxs of a single image, + shape (n, 4) + inside_flags (Tensor): inside flags of a single image, + shape (n, ). + flat_squares (Tensor): flat squares of a single image, + shape (approxs_per_octave * n, 4) + gt_bboxes (Tensor): Ground truth bboxes of a single image. + img_meta (dict): Meta info of a single image. + approxs_per_octave (int): number of approxs per octave + cfg (dict): RPN train configs. + unmap_outputs (bool): unmap outputs or not. 
+ + Returns: + tuple + """ + if not inside_flags.any(): + return (None, ) * 5 + # assign gt and sample anchors + expand_inside_flags = inside_flags[:, None].expand( + -1, self.approxs_per_octave).reshape(-1) + approxs = flat_approxs[expand_inside_flags, :] + squares = flat_squares[inside_flags, :] + + assign_result = self.ga_assigner.assign(approxs, squares, + self.approxs_per_octave, + gt_bboxes, gt_bboxes_ignore) + sampling_result = self.ga_sampler.sample(assign_result, squares, + gt_bboxes) + + bbox_anchors = torch.zeros_like(squares) + bbox_gts = torch.zeros_like(squares) + bbox_weights = torch.zeros_like(squares) + + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + if len(pos_inds) > 0: + bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes + bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes + bbox_weights[pos_inds, :] = 1.0 + + # map up to original set of anchors + if unmap_outputs: + num_total_anchors = flat_squares.size(0) + bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags) + bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags) + bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags) + + return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds) + + def ga_shape_targets(self, + approx_list, + inside_flag_list, + square_list, + gt_bboxes_list, + img_metas, + gt_bboxes_ignore_list=None, + unmap_outputs=True): + """Compute guided anchoring targets. + + Args: + approx_list (list[list]): Multi level approxs of each image. + inside_flag_list (list[list]): Multi level inside flags of each + image. + square_list (list[list]): Multi level squares of each image. + gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image. + img_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (list[Tensor]): ignore list of gt bboxes. + unmap_outputs (bool): unmap outputs or not. + + Returns: + tuple + """ + num_imgs = len(img_metas) + assert len(approx_list) == len(inside_flag_list) == len( + square_list) == num_imgs + # anchor number of multi levels + num_level_squares = [squares.size(0) for squares in square_list[0]] + # concat all level anchors and flags to a single tensor + inside_flag_flat_list = [] + approx_flat_list = [] + square_flat_list = [] + for i in range(num_imgs): + assert len(square_list[i]) == len(inside_flag_list[i]) + inside_flag_flat_list.append(torch.cat(inside_flag_list[i])) + approx_flat_list.append(torch.cat(approx_list[i])) + square_flat_list.append(torch.cat(square_list[i])) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list, + neg_inds_list) = multi_apply( + self._ga_shape_target_single, + approx_flat_list, + inside_flag_flat_list, + square_flat_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + img_metas, + unmap_outputs=unmap_outputs) + # no valid anchors + if any([bbox_anchors is None for bbox_anchors in all_bbox_anchors]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. 
multiple levels + bbox_anchors_list = images_to_levels(all_bbox_anchors, + num_level_squares) + bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_squares) + return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, + num_total_pos, num_total_neg) + + def loss_shape_single(self, shape_pred, bbox_anchors, bbox_gts, + anchor_weights, anchor_total_num): + shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2) + bbox_anchors = bbox_anchors.contiguous().view(-1, 4) + bbox_gts = bbox_gts.contiguous().view(-1, 4) + anchor_weights = anchor_weights.contiguous().view(-1, 4) + bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0) + bbox_deltas[:, 2:] += shape_pred + # filter out negative samples to speed-up weighted_bounded_iou_loss + inds = torch.nonzero( + anchor_weights[:, 0] > 0, as_tuple=False).squeeze(1) + bbox_deltas_ = bbox_deltas[inds] + bbox_anchors_ = bbox_anchors[inds] + bbox_gts_ = bbox_gts[inds] + anchor_weights_ = anchor_weights[inds] + pred_anchors_ = self.anchor_coder.decode( + bbox_anchors_, bbox_deltas_, wh_ratio_clip=1e-6) + loss_shape = self.loss_shape( + pred_anchors_, + bbox_gts_, + anchor_weights_, + avg_factor=anchor_total_num) + return loss_shape + + def loss_loc_single(self, loc_pred, loc_target, loc_weight, + loc_avg_factor): + loss_loc = self.loss_loc( + loc_pred.reshape(-1, 1), + loc_target.reshape(-1).long(), + loc_weight.reshape(-1), + avg_factor=loc_avg_factor) + return loss_loc + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds')) + def loss(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + gt_bboxes, + gt_labels, + img_metas, + gt_bboxes_ignore=None): + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + assert len(featmap_sizes) == self.approx_anchor_generator.num_levels + + device = cls_scores[0].device + + # get loc targets + loc_targets, loc_weights, loc_avg_factor = self.ga_loc_targets( + gt_bboxes, featmap_sizes) + + # get sampled approxes + approxs_list, inside_flag_list = self.get_sampled_approxs( + featmap_sizes, img_metas, device=device) + # get squares and guided anchors + squares_list, guided_anchors_list, _ = self.get_anchors( + featmap_sizes, shape_preds, loc_preds, img_metas, device=device) + + # get shape targets + shape_targets = self.ga_shape_targets(approxs_list, inside_flag_list, + squares_list, gt_bboxes, + img_metas) + if shape_targets is None: + return None + (bbox_anchors_list, bbox_gts_list, anchor_weights_list, anchor_fg_num, + anchor_bg_num) = shape_targets + anchor_total_num = ( + anchor_fg_num if not self.ga_sampling else anchor_fg_num + + anchor_bg_num) + + # get anchor targets + label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 + cls_reg_targets = self.get_targets( + guided_anchors_list, + inside_flag_list, + gt_bboxes, + img_metas, + gt_bboxes_ignore_list=gt_bboxes_ignore, + gt_labels_list=gt_labels, + label_channels=label_channels) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + num_total_samples = ( + num_total_pos + num_total_neg if self.sampling else num_total_pos) + + # anchor number of multi levels + num_level_anchors = [ + anchors.size(0) for anchors in guided_anchors_list[0] + ] + # concat all level anchors to a single tensor + concat_anchor_list = [] + for i in range(len(guided_anchors_list)): + 
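+            # flatten the multi-level guided anchors of image i into a single
+            # tensor; `images_to_levels` below regroups them per feature level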
concat_anchor_list.append(torch.cat(guided_anchors_list[i])) + all_anchor_list = images_to_levels(concat_anchor_list, + num_level_anchors) + + # get classification and bbox regression losses + losses_cls, losses_bbox = multi_apply( + self.loss_single, + cls_scores, + bbox_preds, + all_anchor_list, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples) + + # get anchor location loss + losses_loc = [] + for i in range(len(loc_preds)): + loss_loc = self.loss_loc_single( + loc_preds[i], + loc_targets[i], + loc_weights[i], + loc_avg_factor=loc_avg_factor) + losses_loc.append(loss_loc) + + # get anchor shape loss + losses_shape = [] + for i in range(len(shape_preds)): + loss_shape = self.loss_shape_single( + shape_preds[i], + bbox_anchors_list[i], + bbox_gts_list[i], + anchor_weights_list[i], + anchor_total_num=anchor_total_num) + losses_shape.append(loss_shape) + + return dict( + loss_cls=losses_cls, + loss_bbox=losses_bbox, + loss_shape=losses_shape, + loss_loc=losses_loc) + + @force_fp32( + apply_to=('cls_scores', 'bbox_preds', 'shape_preds', 'loc_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + shape_preds, + loc_preds, + img_metas, + cfg=None, + rescale=False): + assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len( + loc_preds) + num_levels = len(cls_scores) + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + device = cls_scores[0].device + # get guided anchors + _, guided_anchors, loc_masks = self.get_anchors( + featmap_sizes, + shape_preds, + loc_preds, + img_metas, + use_loc_filter=not self.training, + device=device) + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + guided_anchor_list = [ + guided_anchors[img_id][i].detach() for i in range(num_levels) + ] + loc_mask_list = [ + loc_masks[img_id][i].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list, + guided_anchor_list, + loc_mask_list, img_shape, + scale_factor, cfg, rescale) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + mlvl_masks, + img_shape, + scale_factor, + cfg, + rescale=False): + cfg = self.test_cfg if cfg is None else cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) + mlvl_bboxes = [] + mlvl_scores = [] + for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds, + mlvl_anchors, + mlvl_masks): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + # if no location is kept, end. + if mask.sum() == 0: + continue + # reshape scores and bbox_pred + cls_score = cls_score.permute(1, 2, + 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + scores = cls_score.softmax(-1) + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + # filter scores, bbox_pred w.r.t. mask. + # anchors are filtered in get_anchors() beforehand. + scores = scores[mask, :] + bbox_pred = bbox_pred[mask, :] + if scores.dim() == 0: + anchors = anchors.unsqueeze(0) + scores = scores.unsqueeze(0) + bbox_pred = bbox_pred.unsqueeze(0) + # filter anchors, bbox_pred, scores w.r.t. 
scores + nms_pre = cfg.get('nms_pre', -1) + if nms_pre > 0 and scores.shape[0] > nms_pre: + if self.use_sigmoid_cls: + max_scores, _ = scores.max(dim=1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmcv v2.0 + # BG cat_id: num_class + max_scores, _ = scores[:, :-1].max(dim=1) + _, topk_inds = max_scores.topk(nms_pre) + anchors = anchors[topk_inds, :] + bbox_pred = bbox_pred[topk_inds, :] + scores = scores[topk_inds, :] + bboxes = self.bbox_coder.decode( + anchors, bbox_pred, max_shape=img_shape) + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + if self.use_sigmoid_cls: + # Add a dummy background class to the backend when using sigmoid + # remind that we set FG labels to [0, num_class-1] since mmcv v2.0 + # BG cat_id: num_class + padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) + mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) + # multi class NMS + det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores, + cfg.score_thr, cfg.nms, + cfg.max_per_img) + return det_bboxes, det_labels diff --git a/mmcv/models/dense_heads/motion_head.py b/mmcv/models/dense_heads/motion_head.py new file mode 100644 index 0000000..859c3ff --- /dev/null +++ b/mmcv/models/dense_heads/motion_head.py @@ -0,0 +1,560 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import copy +from mmcv.models import HEADS +from mmcv.utils import force_fp32, auto_fp16 +from mmcv.models.utils.functional import ( + bivariate_gaussian_activation, + norm_points, + pos2posemb2d, + anchor_coordinate_transform +) +from .motion_head_plugin.motion_utils import nonlinear_smoother +from .motion_head_plugin.base_motion_head import BaseMotionHead + + +@HEADS.register_module() +class MotionHead(BaseMotionHead): + """ + MotionHead module for a neural network, which predicts motion trajectories and is used in an autonomous driving task. + + Args: + *args: Variable length argument list. + predict_steps (int): The number of steps to predict motion trajectories. + transformerlayers (dict): A dictionary defining the configuration of transformer layers. + bbox_coder: An instance of a bbox coder to be used for encoding/decoding boxes. + num_cls_fcs (int): The number of fully-connected layers in the classification branch. + bev_h (int): The height of the bird's-eye-view map. + bev_w (int): The width of the bird's-eye-view map. + embed_dims (int): The number of dimensions to use for the query and key vectors in transformer layers. + num_anchor (int): The number of anchor points. + det_layer_num (int): The number of layers in the transformer model. + group_id_list (list): A list of group IDs to use for grouping the classes. + pc_range: The range of the point cloud. + use_nonlinear_optimizer (bool): A boolean indicating whether to use a non-linear optimizer for training. + anchor_info_path (str): The path to the file containing the anchor information. 
+ vehicle_id_list(list[int]): class id of vehicle class, used for filtering out non-vehicle objects + """ + def __init__(self, + *args, + predict_steps=12, + transformerlayers=None, + bbox_coder=None, + num_cls_fcs=2, + bev_h=30, + bev_w=30, + embed_dims=256, + num_anchor=6, + det_layer_num=6, + group_id_list=[], + pc_range=None, + use_nonlinear_optimizer=False, + anchor_info_path=None, + loss_traj=dict(), + num_classes=0, + vehicle_id_list=[0, 1, 2, 3, 4, 6, 7], + **kwargs): + super(MotionHead, self).__init__() + + self.bev_h = bev_h + self.bev_w = bev_w + self.num_cls_fcs = num_cls_fcs - 1 + self.num_reg_fcs = num_cls_fcs - 1 + self.embed_dims = embed_dims + self.num_anchor = num_anchor + self.num_anchor_group = len(group_id_list) + + # we merge the classes into groups for anchor assignment + self.cls2group = [0 for i in range(num_classes)] + for i, grouped_ids in enumerate(group_id_list): + for gid in grouped_ids: + self.cls2group[gid] = i + self.cls2group = torch.tensor(self.cls2group) + self.pc_range = pc_range + self.predict_steps = predict_steps + self.vehicle_id_list = vehicle_id_list + + self.use_nonlinear_optimizer = use_nonlinear_optimizer + self._load_anchors(anchor_info_path) + self._build_loss(loss_traj) + self._build_layers(transformerlayers, det_layer_num) + self._init_layers() + + def forward_train(self, + bev_embed, + gt_bboxes_3d, + gt_labels_3d, + gt_fut_traj=None, + gt_fut_traj_mask=None, + gt_sdc_fut_traj=None, + gt_sdc_fut_traj_mask=None, + outs_track={}, + outs_seg={} + ): + """Forward function + Args: + bev_embed (Tensor): BEV feature map with the shape of [B, C, H, W]. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \ + bboxes of each sample. + gt_labels_3d (list[torch.Tensor]): Labels of each sample. + img_metas (list[dict]): Meta information of each sample. + gt_fut_traj (list[torch.Tensor]): Ground truth future trajectory of each sample. + gt_fut_traj_mask (list[torch.Tensor]): Ground truth future trajectory mask of each sample. + gt_sdc_fut_traj (list[torch.Tensor]): Ground truth future trajectory of each sample. + gt_sdc_fut_traj_mask (list[torch.Tensor]): Ground truth future trajectory mask of each sample. + outs_track (dict): Outputs of track head. + outs_seg (dict): Outputs of seg head. + future_states (list[torch.Tensor]): Ground truth future states of each sample. + Returns: + dict: Losses of each branch. + """ + track_query = outs_track['track_query_embeddings'][None, None, ...] 
# num_dec, B, A_track, D + all_matched_idxes = [outs_track['track_query_matched_idxes']] #BxN + track_boxes = outs_track['track_bbox_results'] + + # cat sdc query/gt to the last + sdc_match_index = torch.zeros((1,), dtype=all_matched_idxes[0].dtype, device=all_matched_idxes[0].device) + sdc_match_index[0] = gt_fut_traj[0].shape[0] + all_matched_idxes = [torch.cat([all_matched_idxes[0], sdc_match_index], dim=0)] + gt_fut_traj[0] = torch.cat([gt_fut_traj[0], gt_sdc_fut_traj[0]], dim=0) + gt_fut_traj_mask[0] = torch.cat([gt_fut_traj_mask[0], gt_sdc_fut_traj_mask[0]], dim=0) + track_query = torch.cat([track_query, outs_track['sdc_embedding'][None, None, None, :]], dim=2) + sdc_track_boxes = outs_track['sdc_track_bbox_results'] + track_boxes[0][0].tensor = torch.cat([track_boxes[0][0].tensor, sdc_track_boxes[0][0].tensor], dim=0) + track_boxes[0][1] = torch.cat([track_boxes[0][1], sdc_track_boxes[0][1]], dim=0) + track_boxes[0][2] = torch.cat([track_boxes[0][2], sdc_track_boxes[0][2]], dim=0) + track_boxes[0][3] = torch.cat([track_boxes[0][3], sdc_track_boxes[0][3]], dim=0) + + memory, memory_mask, memory_pos, lane_query, _, lane_query_pos, hw_lvl = outs_seg['args_tuple'] + + outs_motion = self(bev_embed, track_query, lane_query, lane_query_pos, track_boxes) + loss_inputs = [gt_bboxes_3d, gt_fut_traj, gt_fut_traj_mask, outs_motion, all_matched_idxes, track_boxes] + losses = self.loss(*loss_inputs) + + def filter_vehicle_query(outs_motion, all_matched_idxes, gt_labels_3d, vehicle_id_list): + query_label = gt_labels_3d[0][-1][all_matched_idxes[0]] + # select vehicle query according to vehicle_id_list + vehicle_mask = torch.zeros_like(query_label) + for veh_id in vehicle_id_list: + vehicle_mask |= query_label == veh_id + outs_motion['traj_query'] = outs_motion['traj_query'][:, :, vehicle_mask>0] + outs_motion['track_query'] = outs_motion['track_query'][:, vehicle_mask>0] + outs_motion['track_query_pos'] = outs_motion['track_query_pos'][:, vehicle_mask>0] + all_matched_idxes[0] = all_matched_idxes[0][vehicle_mask>0] + return outs_motion, all_matched_idxes + + all_matched_idxes[0] = all_matched_idxes[0][:-1] + outs_motion['sdc_traj_query'] = outs_motion['traj_query'][:, :, -1] # [3, 1, 6, 256] [n_dec, b, n_mode, d] + outs_motion['sdc_track_query'] = outs_motion['track_query'][:, -1] # [1, 256] [b, d] + outs_motion['sdc_track_query_pos'] = outs_motion['track_query_pos'][:, -1] # [1, 256] [b, d] + outs_motion['traj_query'] = outs_motion['traj_query'][:, :, :-1] # [3, 1, 3, 6, 256] [n_dec, b, nq, n_mode, d] + outs_motion['track_query'] = outs_motion['track_query'][:, :-1] # [1, 3, 256] [b, nq, d] + outs_motion['track_query_pos'] = outs_motion['track_query_pos'][:, :-1] # [1, 3, 256] [b, nq, d] + + + outs_motion, all_matched_idxes = filter_vehicle_query(outs_motion, all_matched_idxes, gt_labels_3d, self.vehicle_id_list) + outs_motion['all_matched_idxes'] = all_matched_idxes + + ret_dict = dict(losses=losses, outs_motion=outs_motion, track_boxes=track_boxes) + return ret_dict + + def forward_test(self, bev_embed, outs_track={}, outs_seg={}): + """Test function""" + track_query = outs_track['track_query_embeddings'][None, None, ...] 
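+        # The test-time path mirrors forward_train: the ego (SDC) query and
+        # its box are appended as the last entry before the forward pass.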
+ track_boxes = outs_track['track_bbox_results'] + + track_query = torch.cat([track_query, outs_track['sdc_embedding'][None, None, None, :]], dim=2) + sdc_track_boxes = outs_track['sdc_track_bbox_results'] + + track_boxes[0][0].tensor = torch.cat([track_boxes[0][0].tensor, sdc_track_boxes[0][0].tensor], dim=0) + track_boxes[0][1] = torch.cat([track_boxes[0][1], sdc_track_boxes[0][1]], dim=0) + track_boxes[0][2] = torch.cat([track_boxes[0][2], sdc_track_boxes[0][2]], dim=0) + track_boxes[0][3] = torch.cat([track_boxes[0][3], sdc_track_boxes[0][3]], dim=0) + memory, memory_mask, memory_pos, lane_query, _, lane_query_pos, hw_lvl = outs_seg['args_tuple'] + outs_motion = self(bev_embed, track_query, lane_query, lane_query_pos, track_boxes) + traj_results = self.get_trajs(outs_motion, track_boxes) + bboxes, scores, labels, bbox_index, mask = track_boxes[0] + outs_motion['track_scores'] = scores[None, :] + labels[-1] = 0 + def filter_vehicle_query(outs_motion, labels, vehicle_id_list): + if len(labels) < 1: # No other obj query except sdc query. + return None + + # select vehicle query according to vehicle_id_list + vehicle_mask = torch.zeros_like(labels) + for veh_id in vehicle_id_list: + vehicle_mask |= labels == veh_id + outs_motion['traj_query'] = outs_motion['traj_query'][:, :, vehicle_mask>0] + outs_motion['track_query'] = outs_motion['track_query'][:, vehicle_mask>0] + outs_motion['track_query_pos'] = outs_motion['track_query_pos'][:, vehicle_mask>0] + outs_motion['track_scores'] = outs_motion['track_scores'][:, vehicle_mask>0] + return outs_motion + + outs_motion = filter_vehicle_query(outs_motion, labels, self.vehicle_id_list) + + # filter sdc query + outs_motion['sdc_traj_query'] = outs_motion['traj_query'][:, :, -1] + outs_motion['sdc_track_query'] = outs_motion['track_query'][:, -1] + outs_motion['sdc_track_query_pos'] = outs_motion['track_query_pos'][:, -1] + outs_motion['traj_query'] = outs_motion['traj_query'][:, :, :-1] + outs_motion['track_query'] = outs_motion['track_query'][:, :-1] + outs_motion['track_query_pos'] = outs_motion['track_query_pos'][:, :-1] + outs_motion['track_scores'] = outs_motion['track_scores'][:, :-1] + + return traj_results, outs_motion + + @auto_fp16(apply_to=('bev_embed', 'track_query', 'lane_query', 'lane_query_pos', 'lane_query_embed', 'prev_bev')) + def forward(self, + bev_embed, + track_query, + lane_query, + lane_query_pos, + track_bbox_results): + """ + Applies forward pass on the model for motion prediction using bird's eye view (BEV) embedding, track query, lane query, and track bounding box results. + + Args: + bev_embed (torch.Tensor): A tensor of shape (h*w, B, D) representing the bird's eye view embedding. + track_query (torch.Tensor): A tensor of shape (B, num_dec, A_track, D) representing the track query. + lane_query (torch.Tensor): A tensor of shape (N, M_thing, D) representing the lane query. + lane_query_pos (torch.Tensor): A tensor of shape (N, M_thing, D) representing the position of the lane query. + track_bbox_results (List[torch.Tensor]): A list of tensors containing the tracking bounding box results for each image in the batch. + + Returns: + dict: A dictionary containing the following keys and values: + - 'all_traj_scores': A tensor of shape (num_levels, B, A_track, num_points) with trajectory scores for each level. + - 'all_traj_preds': A tensor of shape (num_levels, B, A_track, num_points, num_future_steps, 2) with predicted trajectories for each level. 
+ - 'valid_traj_masks': A tensor of shape (B, A_track) indicating the validity of trajectory masks. + - 'traj_query': A tensor containing intermediate states of the trajectory queries. + - 'track_query': A tensor containing the input track queries. + - 'track_query_pos': A tensor containing the positional embeddings of the track queries. + """ + + dtype = track_query.dtype + device = track_query.device + num_groups = self.kmeans_anchors.shape[0] + + # extract the last frame of the track query + track_query = track_query[:, -1] + + # encode the center point of the track query + reference_points_track = self._extract_tracking_centers( + track_bbox_results, self.pc_range) + track_query_pos = self.boxes_query_embedding_layer(pos2posemb2d(reference_points_track.to(device))) # B, A, D + + # construct the learnable query positional embedding + # split and stack according to groups + learnable_query_pos = self.learnable_motion_query_embedding.weight.to(dtype) # latent anchor (P*G, D) + learnable_query_pos = torch.stack(torch.split(learnable_query_pos, self.num_anchor, dim=0)) + + # construct the agent level/scene-level query positional embedding + # (num_groups, num_anchor, 12, 2) + # to incorporate the information of different groups and coordinates, and embed the headding and location information + agent_level_anchors = self.kmeans_anchors.to(dtype).to(device).view(num_groups, self.num_anchor, self.predict_steps, 2).detach() + scene_level_ego_anchors = anchor_coordinate_transform(agent_level_anchors, track_bbox_results, with_translation_transform=True) # B, A, G, P ,12 ,2 + scene_level_offset_anchors = anchor_coordinate_transform(agent_level_anchors, track_bbox_results, with_translation_transform=False) # B, A, G, P ,12 ,2 + + agent_level_norm = norm_points(agent_level_anchors, self.pc_range) + scene_level_ego_norm = norm_points(scene_level_ego_anchors, self.pc_range) + scene_level_offset_norm = norm_points(scene_level_offset_anchors, self.pc_range) + + # we only use the last point of the anchor + agent_level_embedding = self.agent_level_embedding_layer( + pos2posemb2d(agent_level_norm[..., -1, :])) # G, P, D + scene_level_ego_embedding = self.scene_level_ego_embedding_layer( + pos2posemb2d(scene_level_ego_norm[..., -1, :])) # B, A, G, P , D + scene_level_offset_embedding = self.scene_level_offset_embedding_layer( + pos2posemb2d(scene_level_offset_norm[..., -1, :])) # B, A, G, P , D + + batch_size, num_agents = scene_level_ego_embedding.shape[:2] + agent_level_embedding = agent_level_embedding[None,None, ...].expand(batch_size, num_agents, -1, -1, -1) + learnable_embed = learnable_query_pos[None, None, ...].expand(batch_size, num_agents, -1, -1, -1) + + + # save for latter, anchors + # B, A, G, P ,12 ,2 -> B, A, P ,12 ,2 + scene_level_offset_anchors = self.group_mode_query_pos(track_bbox_results, scene_level_offset_anchors) + + # select class embedding + # B, A, G, P , D-> B, A, P , D + agent_level_embedding = self.group_mode_query_pos( + track_bbox_results, agent_level_embedding) + scene_level_ego_embedding = self.group_mode_query_pos( + track_bbox_results, scene_level_ego_embedding) # B, A, G, P , D-> B, A, P , D + + # B, A, G, P , D -> B, A, P , D + scene_level_offset_embedding = self.group_mode_query_pos( + track_bbox_results, scene_level_offset_embedding) + learnable_embed = self.group_mode_query_pos( + track_bbox_results, learnable_embed) + + init_reference = scene_level_offset_anchors.detach() + + outputs_traj_scores = [] + outputs_trajs = [] + + inter_states, inter_references = 
self.motionformer( + track_query, # B, A_track, D + lane_query, # B, M, D + track_query_pos=track_query_pos, + lane_query_pos=lane_query_pos, + track_bbox_results=track_bbox_results, + bev_embed=bev_embed, + reference_trajs=init_reference, + traj_reg_branches=self.traj_reg_branches, + traj_cls_branches=self.traj_cls_branches, + # anchor embeddings + agent_level_embedding=agent_level_embedding, + scene_level_ego_embedding=scene_level_ego_embedding, + scene_level_offset_embedding=scene_level_offset_embedding, + learnable_embed=learnable_embed, + # anchor positional embeddings layers + agent_level_embedding_layer=self.agent_level_embedding_layer, + scene_level_ego_embedding_layer=self.scene_level_ego_embedding_layer, + scene_level_offset_embedding_layer=self.scene_level_offset_embedding_layer, + spatial_shapes=torch.tensor( + [[self.bev_h, self.bev_w]], device=device), + level_start_index=torch.tensor([0], device=device)) + + for lvl in range(inter_states.shape[0]): + outputs_class = self.traj_cls_branches[lvl](inter_states[lvl]) + tmp = self.traj_reg_branches[lvl](inter_states[lvl]) + tmp = self.unflatten_traj(tmp) + + # we use cumsum trick here to get the trajectory + tmp[..., :2] = torch.cumsum(tmp[..., :2], dim=3) + + outputs_class = self.log_softmax(outputs_class.squeeze(3)) + outputs_traj_scores.append(outputs_class) + + for bs in range(tmp.shape[0]): + tmp[bs] = bivariate_gaussian_activation(tmp[bs]) + outputs_trajs.append(tmp) + outputs_traj_scores = torch.stack(outputs_traj_scores) + outputs_trajs = torch.stack(outputs_trajs) + + B, A_track, D = track_query.shape + valid_traj_masks = track_query.new_ones((B, A_track)) > 0 + outs = { + 'all_traj_scores': outputs_traj_scores, + 'all_traj_preds': outputs_trajs, + 'valid_traj_masks': valid_traj_masks, + 'traj_query': inter_states, + 'track_query': track_query, + 'track_query_pos': track_query_pos, + } + + return outs + + def group_mode_query_pos(self, bbox_results, mode_query_pos): + """ + Group mode query positions based on the input bounding box results. + + Args: + bbox_results (List[Tuple[torch.Tensor]]): A list of tuples containing the bounding box results for each image in the batch. + mode_query_pos (torch.Tensor): A tensor of shape (B, A, G, P, D) representing the mode query positions. + + Returns: + torch.Tensor: A tensor of shape (B, A, P, D) representing the classified mode query positions. + """ + batch_size = len(bbox_results) + agent_num = mode_query_pos.shape[1] + batched_mode_query_pos = [] + self.cls2group = self.cls2group.to(mode_query_pos.device) + # TODO: vectorize this + # group the embeddings based on the class + for i in range(batch_size): + bboxes, scores, labels, bbox_index, mask = bbox_results[i] + label = labels.to(mode_query_pos.device) + grouped_label = self.cls2group[label] + grouped_mode_query_pos = [] + for j in range(agent_num): + grouped_mode_query_pos.append( + mode_query_pos[i, j, grouped_label[j]]) + batched_mode_query_pos.append(torch.stack(grouped_mode_query_pos)) + return torch.stack(batched_mode_query_pos) + + @force_fp32(apply_to=('preds_dicts_motion')) + def loss(self, + gt_bboxes_3d, + gt_fut_traj, + gt_fut_traj_mask, + preds_dicts_motion, + all_matched_idxes, + track_bbox_results): + """ + Computes the loss function for the given ground truth and prediction dictionaries. + + Args: + gt_bboxes_3d (List[torch.Tensor]): A list of tensors representing ground truth 3D bounding boxes for each image. + gt_fut_traj (torch.Tensor): A tensor representing the ground truth future trajectories. 
+ gt_fut_traj_mask (torch.Tensor): A tensor representing the ground truth future trajectory masks. + preds_dicts_motion (Dict[str, torch.Tensor]): A dictionary containing motion-related prediction tensors. + all_matched_idxes (List[torch.Tensor]): A list of tensors containing the matched ground truth indices for each image in the batch. + track_bbox_results (List[Tuple[torch.Tensor]]): A list of tuples containing the tracking bounding box results for each image in the batch. + + Returns: + dict[str, torch.Tensor]: A dictionary of loss components. + """ + + # motion related predictions + all_traj_scores = preds_dicts_motion['all_traj_scores'] + all_traj_preds = preds_dicts_motion['all_traj_preds'] + + num_dec_layers = len(all_traj_scores) + + all_gt_fut_traj = [gt_fut_traj for _ in range(num_dec_layers)] + all_gt_fut_traj_mask = [ + gt_fut_traj_mask for _ in range(num_dec_layers)] + + losses_traj = [] + gt_fut_traj_all, gt_fut_traj_mask_all = self.compute_matched_gt_traj( + all_gt_fut_traj[0], all_gt_fut_traj_mask[0], all_matched_idxes, track_bbox_results, gt_bboxes_3d) + for i in range(num_dec_layers): + loss_traj, l_class, l_reg, l_mindae, l_minfde, l_mr = self.compute_loss_traj(all_traj_scores[i], all_traj_preds[i], + gt_fut_traj_all, gt_fut_traj_mask_all, all_matched_idxes) + losses_traj.append( + (loss_traj, l_class, l_reg, l_mindae, l_minfde, l_mr)) + + loss_dict = dict() + loss_dict['loss_traj'] = losses_traj[-1][0] + loss_dict['l_class'] = losses_traj[-1][1] + loss_dict['l_reg'] = losses_traj[-1][2] + loss_dict['min_ade'] = losses_traj[-1][3] + loss_dict['min_fde'] = losses_traj[-1][4] + loss_dict['mr'] = losses_traj[-1][5] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_traj_i in losses_traj[:-1]: + loss_dict[f'd{num_dec_layer}.loss_traj'] = loss_traj_i[0] + loss_dict[f'd{num_dec_layer}.l_class'] = loss_traj_i[1] + loss_dict[f'd{num_dec_layer}.l_reg'] = loss_traj_i[2] + loss_dict[f'd{num_dec_layer}.min_ade'] = loss_traj_i[3] + loss_dict[f'd{num_dec_layer}.min_fde'] = loss_traj_i[4] + loss_dict[f'd{num_dec_layer}.mr'] = loss_traj_i[5] + num_dec_layer += 1 + + return loss_dict + + def compute_matched_gt_traj(self, + gt_fut_traj, + gt_fut_traj_mask, + all_matched_idxes, + track_bbox_results, + gt_bboxes_3d): + """ + Computes the matched ground truth trajectories for a batch of images based on matched indexes. + + Args: + gt_fut_traj (torch.Tensor): Ground truth future trajectories of shape (num_imgs, num_objects, num_future_steps, 2). + gt_fut_traj_mask (torch.Tensor): Ground truth future trajectory masks of shape (num_imgs, num_objects, num_future_steps, 2). + all_matched_idxes (List[torch.Tensor]): A list of tensors containing the matched indexes for each image in the batch. + track_bbox_results (List[torch.Tensor]): A list of tensors containing the tracking bounding box results for each image in the batch. + gt_bboxes_3d (List[torch.Tensor]): A list of tensors containing the ground truth 3D bounding boxes for each image in the batch. + + Returns: + torch.Tensor: A concatenated tensor of the matched ground truth future trajectories. + torch.Tensor: A concatenated tensor of the matched ground truth future trajectory masks. 
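+            For example (illustrative only): if two images in the batch contribute 3 and 5
+            matched agents respectively, the returned tensors have shapes
+            (8, num_future_steps, 2) and (8, num_future_steps).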
+ """ + num_imgs = len(all_matched_idxes) + gt_fut_traj_all = [] + gt_fut_traj_mask_all = [] + for i in range(num_imgs): + matched_gt_idx = all_matched_idxes[i] + valid_traj_masks = matched_gt_idx >= 0 + matched_gt_fut_traj = gt_fut_traj[i][matched_gt_idx][valid_traj_masks] + matched_gt_fut_traj_mask = gt_fut_traj_mask[i][matched_gt_idx][valid_traj_masks] + if self.use_nonlinear_optimizer: + # TODO: sdc query is not supported non-linear optimizer + bboxes = track_bbox_results[i][0].to(matched_gt_idx.device).tensor[valid_traj_masks] + matched_gt_bboxes_3d = gt_bboxes_3d[i][-1].to(matched_gt_idx.device).tensor[matched_gt_idx[:-1] + ][valid_traj_masks[:-1]] + sdc_gt_fut_traj = matched_gt_fut_traj[-1:] + sdc_gt_fut_traj_mask = matched_gt_fut_traj_mask[-1:] + matched_gt_fut_traj = matched_gt_fut_traj[:-1] + matched_gt_fut_traj_mask = matched_gt_fut_traj_mask[:-1] + bboxes = bboxes[:-1] + matched_gt_fut_traj, matched_gt_fut_traj_mask = nonlinear_smoother( + matched_gt_bboxes_3d, matched_gt_fut_traj, matched_gt_fut_traj_mask, bboxes) + matched_gt_fut_traj = torch.cat( + [matched_gt_fut_traj, sdc_gt_fut_traj], dim=0) + matched_gt_fut_traj_mask = torch.cat( + [matched_gt_fut_traj_mask, sdc_gt_fut_traj_mask], dim=0) + matched_gt_fut_traj_mask = torch.all( + matched_gt_fut_traj_mask > 0, dim=-1) + gt_fut_traj_all.append(matched_gt_fut_traj) + gt_fut_traj_mask_all.append(matched_gt_fut_traj_mask) + gt_fut_traj_all = torch.cat(gt_fut_traj_all, dim=0) + gt_fut_traj_mask_all = torch.cat(gt_fut_traj_mask_all, dim=0) + return gt_fut_traj_all, gt_fut_traj_mask_all + + def compute_loss_traj(self, + traj_scores, + traj_preds, + gt_fut_traj_all, + gt_fut_traj_mask_all, + all_matched_idxes): + """ + Computes the trajectory loss given the predicted trajectories, ground truth trajectories, and other relevant information. + + Args: + traj_scores (torch.Tensor): A tensor representing the trajectory scores. + traj_preds (torch.Tensor): A tensor representing the predicted trajectories. + gt_fut_traj_all (torch.Tensor): A tensor representing the ground truth future trajectories. + gt_fut_traj_mask_all (torch.Tensor): A tensor representing the ground truth future trajectory masks. + all_matched_idxes (List[torch.Tensor]): A list of tensors containing the matched ground truth indices for each image in the batch. + + Returns: + tuple: A tuple containing the total trajectory loss, classification loss, regression loss, minimum average displacement error, minimum final displacement error, and miss rate. + """ + num_imgs = traj_scores.size(0) + traj_prob_all = [] + traj_preds_all = [] + for i in range(num_imgs): + matched_gt_idx = all_matched_idxes[i] + valid_traj_masks = matched_gt_idx >= 0 + # select valid and matched + batch_traj_prob = traj_scores[i, valid_traj_masks, :] + # (n_objs, n_modes, step, 5) + batch_traj_preds = traj_preds[i, valid_traj_masks, ...] + traj_prob_all.append(batch_traj_prob) + traj_preds_all.append(batch_traj_preds) + traj_prob_all = torch.cat(traj_prob_all, dim=0) + traj_preds_all = torch.cat(traj_preds_all, dim=0) + traj_loss, l_class, l_reg, l_minade, l_minfde, l_mr = self.loss_traj( + traj_prob_all, traj_preds_all, gt_fut_traj_all, gt_fut_traj_mask_all) + return traj_loss, l_class, l_reg, l_minade, l_minfde, l_mr + + @force_fp32(apply_to=('preds_dicts')) + def get_trajs(self, preds_dicts, bbox_results): + """ + Generates trajectories from the prediction results, bounding box results. + + Args: + preds_dicts (tuple[list[dict]]): A tuple containing lists of dictionaries with prediction results. 
+ bbox_results (List[Tuple[torch.Tensor]]): A list of tuples containing the bounding box results for each image in the batch. + + Returns: + List[dict]: A list of dictionaries containing decoded bounding boxes, scores, and labels after non-maximum suppression. + """ + num_samples = len(bbox_results) + num_layers = preds_dicts['all_traj_preds'].shape[0] + ret_list = [] + for i in range(num_samples): + preds = dict() + for j in range(num_layers): + subfix = '_' + str(j) if j < (num_layers - 1) else '' + traj = preds_dicts['all_traj_preds'][j, i] + traj_scores = preds_dicts['all_traj_scores'][j, i] + + traj_scores, traj = traj_scores.cpu(), traj.cpu() + preds['traj' + subfix] = traj + preds['traj_scores' + subfix] = traj_scores + ret_list.append(preds) + return ret_list diff --git a/mmcv/models/dense_heads/motion_head_plugin/__init__.py b/mmcv/models/dense_heads/motion_head_plugin/__init__.py new file mode 100644 index 0000000..79b6df5 --- /dev/null +++ b/mmcv/models/dense_heads/motion_head_plugin/__init__.py @@ -0,0 +1,4 @@ +from .motion_optimization import MotionNonlinearSmoother +from .modules import MotionTransformerDecoder +from .motion_deformable_attn import MotionTransformerAttentionLayer, MotionDeformableAttention +from .motion_utils import * \ No newline at end of file diff --git a/mmcv/models/dense_heads/motion_head_plugin/base_motion_head.py b/mmcv/models/dense_heads/motion_head_plugin/base_motion_head.py new file mode 100644 index 0000000..d71c0ee --- /dev/null +++ b/mmcv/models/dense_heads/motion_head_plugin/base_motion_head.py @@ -0,0 +1,140 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import copy +import pickle +import torch.nn as nn +from mmcv.models import build_loss +from mmcv.models.bricks.transformer import build_transformer_layer_sequence + +class BaseMotionHead(nn.Module): + def __init__(self, *args, **kwargs): + super(BaseMotionHead, self).__init__() + pass + + def _build_loss(self, loss_traj): + """ + Build the loss function for the motion prediction task. + + Args: + loss_traj (dict): A dictionary containing the parameters for the loss function. + + Returns: + None + """ + self.loss_traj = build_loss(loss_traj) + self.unflatten_traj = nn.Unflatten(3, (self.predict_steps, 5)) + self.log_softmax = nn.LogSoftmax(dim=2) + + def _load_anchors(self, anchor_info_path): + """ + Load the anchor information from a file. + + Args: + anchor_info_path (str): The path to the file containing the anchor information. + + Returns: + None + """ + anchor_infos = pickle.load(open(anchor_info_path, 'rb')) + self.kmeans_anchors = torch.stack( + [torch.from_numpy(a) for a in anchor_infos["anchors_all"]]) # Nc, Pc, steps, 2 + + def _build_layers(self, transformerlayers, det_layer_num): + """ + Build the layers of the motion prediction module. + + Args: + transformerlayers (dict): A dictionary containing the parameters for the transformer layers. + det_layer_num (int): The number of detection layers. 
+ + Returns: + None + """ + self.learnable_motion_query_embedding = nn.Embedding( + self.num_anchor * self.num_anchor_group, self.embed_dims) + self.motionformer = build_transformer_layer_sequence( + transformerlayers) + self.layer_track_query_fuser = nn.Sequential( + nn.Linear(self.embed_dims * det_layer_num, self.embed_dims), + nn.LayerNorm(self.embed_dims), + nn.ReLU(inplace=True) + ) + + self.agent_level_embedding_layer = nn.Sequential( + nn.Linear(self.embed_dims, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + self.scene_level_ego_embedding_layer = nn.Sequential( + nn.Linear(self.embed_dims, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + self.scene_level_offset_embedding_layer = nn.Sequential( + nn.Linear(self.embed_dims, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + self.boxes_query_embedding_layer = nn.Sequential( + nn.Linear(self.embed_dims, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + traj_cls_branch = [] + traj_cls_branch.append(nn.Linear(self.embed_dims, self.embed_dims)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + for _ in range(self.num_reg_fcs-1): + traj_cls_branch.append(nn.Linear(self.embed_dims, self.embed_dims)) + traj_cls_branch.append(nn.LayerNorm(self.embed_dims)) + traj_cls_branch.append(nn.ReLU(inplace=True)) + traj_cls_branch.append(nn.Linear(self.embed_dims, 1)) + traj_cls_branch = nn.Sequential(*traj_cls_branch) + + traj_reg_branch = [] + traj_reg_branch.append(nn.Linear(self.embed_dims, self.embed_dims)) + traj_reg_branch.append(nn.ReLU()) + for _ in range(self.num_reg_fcs-1): + traj_reg_branch.append(nn.Linear(self.embed_dims, self.embed_dims)) + traj_reg_branch.append(nn.ReLU()) + traj_reg_branch.append(nn.Linear(self.embed_dims, self.predict_steps * 5)) + traj_reg_branch = nn.Sequential(*traj_reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + num_pred = self.motionformer.num_layers + self.traj_cls_branches = _get_clones(traj_cls_branch, num_pred) + self.traj_reg_branches = _get_clones(traj_reg_branch, num_pred) + + def _extract_tracking_centers(self, bbox_results, bev_range): + """ + extract the bboxes centers and normized according to the bev range + + Args: + bbox_results (List[Tuple[torch.Tensor]]): A list of tuples containing the bounding box results for each image in the batch. + bev_range (List[float]): A list of float values representing the bird's eye view range. + + Returns: + torch.Tensor: A tensor representing normized centers of the detection bounding boxes. 
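+            For example (illustrative), with bev_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+            (the default used elsewhere in this plugin), a box center at x = 0.0 m maps to
+            x_norm = (0.0 - (-51.2)) / (51.2 - (-51.2)) = 0.5.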
+ """ + batch_size = len(bbox_results) + det_bbox_posembed = [] + for i in range(batch_size): + bboxes, scores, labels, bbox_index, mask = bbox_results[i] + xy = bboxes.gravity_center[:, :2] + x_norm = (xy[:, 0] - bev_range[0]) / \ + (bev_range[3] - bev_range[0]) + y_norm = (xy[:, 1] - bev_range[1]) / \ + (bev_range[4] - bev_range[1]) + det_bbox_posembed.append( + torch.cat([x_norm[:, None], y_norm[:, None]], dim=-1)) + return torch.stack(det_bbox_posembed) \ No newline at end of file diff --git a/mmcv/models/dense_heads/motion_head_plugin/modules.py b/mmcv/models/dense_heads/motion_head_plugin/modules.py new file mode 100644 index 0000000..372528c --- /dev/null +++ b/mmcv/models/dense_heads/motion_head_plugin/modules.py @@ -0,0 +1,280 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +from mmcv.models.bricks.registry import TRANSFORMER_LAYER_SEQUENCE +from mmcv.models.bricks.transformer import build_transformer_layer +from mmcv.models.backbones.base_module import BaseModule +from mmcv.models.utils.functional import ( + norm_points, + pos2posemb2d, + trajectory_coordinate_transform +) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class MotionTransformerDecoder(BaseModule): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, pc_range=None, embed_dims=256, transformerlayers=None, num_layers=3, **kwargs): + super(MotionTransformerDecoder, self).__init__() + self.pc_range = pc_range + self.embed_dims = embed_dims + self.num_layers = num_layers + self.intention_interaction_layers = IntentionInteraction() + self.track_agent_interaction_layers = nn.ModuleList( + [TrackAgentInteraction() for i in range(self.num_layers)]) + self.map_interaction_layers = nn.ModuleList( + [MapInteraction() for i in range(self.num_layers)]) + self.bev_interaction_layers = nn.ModuleList( + [build_transformer_layer(transformerlayers) for i in range(self.num_layers)]) + + self.static_dynamic_fuser = nn.Sequential( + nn.Linear(self.embed_dims*2, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + self.dynamic_embed_fuser = nn.Sequential( + nn.Linear(self.embed_dims*3, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + self.in_query_fuser = nn.Sequential( + nn.Linear(self.embed_dims*2, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + self.out_query_fuser = nn.Sequential( + nn.Linear(self.embed_dims*4, self.embed_dims*2), + nn.ReLU(), + nn.Linear(self.embed_dims*2, self.embed_dims), + ) + + def forward(self, + track_query, + lane_query, + track_query_pos=None, + lane_query_pos=None, + track_bbox_results=None, + bev_embed=None, + reference_trajs=None, + traj_reg_branches=None, + agent_level_embedding=None, + scene_level_ego_embedding=None, + scene_level_offset_embedding=None, + learnable_embed=None, + agent_level_embedding_layer=None, + scene_level_ego_embedding_layer=None, + scene_level_offset_embedding_layer=None, + **kwargs): + """Forward function for 
`MotionTransformerDecoder`.
+        Args:
+            track_query (Tensor): agent queries from the track head, shape (B, A, D).
+            lane_query (Tensor): map (lane) queries, shape (B, M, D).
+            track_query_pos / lane_query_pos (Tensor): positional embeddings of the queries above.
+            agent_level_embedding, scene_level_ego_embedding,
+                scene_level_offset_embedding, learnable_embed (Tensor):
+                intention (anchor) embeddings, shape (B, A, P, D).
+        Returns:
+            tuple(Tensor, Tensor): the stacked per-layer query embeddings and the
+                stacked per-layer reference trajectories.
+        """
+        intermediate = []
+        intermediate_reference_trajs = []
+
+        B, _, P, D = agent_level_embedding.shape
+        track_query_bc = track_query.unsqueeze(2).expand(-1, -1, P, -1)  # (B, A, P, D)
+        track_query_pos_bc = track_query_pos.unsqueeze(2).expand(-1, -1, P, -1)  # (B, A, P, D)
+
+        # static intention embedding, which is immutable throughout all layers
+        agent_level_embedding = self.intention_interaction_layers(agent_level_embedding)
+        static_intention_embed = agent_level_embedding + scene_level_offset_embedding + learnable_embed
+        reference_trajs_input = reference_trajs.unsqueeze(4).detach()
+
+        query_embed = torch.zeros_like(static_intention_embed)
+        for lid in range(self.num_layers):
+            # compute the dynamic intention embedding by fusing the anchor embeddings;
+            # these are the output of the previous layer and are initialized with the anchor embedding
+            dynamic_query_embed = self.dynamic_embed_fuser(torch.cat(
+                [agent_level_embedding, scene_level_offset_embedding, scene_level_ego_embedding], dim=-1))
+
+            # fuse static and dynamic intention embedding
+            query_embed_intention = self.static_dynamic_fuser(torch.cat(
+                [static_intention_embed, dynamic_query_embed], dim=-1))  # (B, A, P, D)
+
+            # fuse intention embedding with query embedding
+            query_embed = self.in_query_fuser(torch.cat([query_embed, query_embed_intention], dim=-1))
+
+            # interaction between agents
+            track_query_embed = self.track_agent_interaction_layers[lid](
+                query_embed, track_query, query_pos=track_query_pos_bc, key_pos=track_query_pos)
+
+            # interaction between agents and map
+            map_query_embed = self.map_interaction_layers[lid](
+                query_embed, lane_query, query_pos=track_query_pos_bc, key_pos=lane_query_pos)
+
+            # interaction between agents and bev, i.e. interaction between agents and goals,
+            # implemented with deformable transformer
+            bev_query_embed = self.bev_interaction_layers[lid](
+                query_embed,
+                value=bev_embed,
+                query_pos=track_query_pos_bc,
+                bbox_results=track_bbox_results,
+                reference_trajs=reference_trajs_input,
+                **kwargs)
+
+            # fusing the embeddings from different interaction layers
+            query_embed = [track_query_embed, map_query_embed, bev_query_embed, track_query_bc+track_query_pos_bc]
+            query_embed = torch.cat(query_embed, dim=-1)
+            query_embed = self.out_query_fuser(query_embed)
+
+            if traj_reg_branches is not None:
+                # update reference trajectory
+                tmp = traj_reg_branches[lid](query_embed)
+                bs, n_agent, n_modes, n_steps, _ = reference_trajs.shape
+                tmp = tmp.view(bs, n_agent, n_modes, n_steps, -1)
+
+                # the regression branch predicts per-step offsets; a cumsum accumulates them into the trajectory
+                tmp[..., :2] = torch.cumsum(tmp[..., :2], dim=3)
+                new_reference_trajs = torch.zeros_like(reference_trajs)
+                new_reference_trajs = tmp[..., :2]
+                reference_trajs = new_reference_trajs.detach()
+                reference_trajs_input = reference_trajs.unsqueeze(4)  # BS NUM_AGENT NUM_MODE 12 NUM_LEVEL 2
+
+                # update embedding, which is used in the next layer
+                # only update the embedding of the last step, i.e.
the goal + ep_offset_embed = reference_trajs.detach() + ep_ego_embed = trajectory_coordinate_transform(reference_trajs.unsqueeze( + 2), track_bbox_results, with_translation_transform=True, with_rotation_transform=False).squeeze(2).detach() + ep_agent_embed = trajectory_coordinate_transform(reference_trajs.unsqueeze( + 2), track_bbox_results, with_translation_transform=False, with_rotation_transform=True).squeeze(2).detach() + + agent_level_embedding = agent_level_embedding_layer(pos2posemb2d( + norm_points(ep_agent_embed[..., -1, :], self.pc_range))) + scene_level_ego_embedding = scene_level_ego_embedding_layer(pos2posemb2d( + norm_points(ep_ego_embed[..., -1, :], self.pc_range))) + scene_level_offset_embedding = scene_level_offset_embedding_layer(pos2posemb2d( + norm_points(ep_offset_embed[..., -1, :], self.pc_range))) + + intermediate.append(query_embed) + intermediate_reference_trajs.append(reference_trajs) + + return torch.stack(intermediate), torch.stack(intermediate_reference_trajs) + + +class TrackAgentInteraction(BaseModule): + """ + Modeling the interaction between the agents + """ + def __init__(self, + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + + self.batch_first = batch_first + self.interaction_transformer = nn.TransformerDecoderLayer(d_model=embed_dims, + nhead=num_heads, + dropout=dropout, + dim_feedforward=embed_dims*2, + batch_first=batch_first) + + def forward(self, query, key, query_pos=None, key_pos=None): + ''' + query: context query (B, A, P, D) + query_pos: mode pos embedding (B, A, P, D) + key: (B, A, D) + key_pos: (B, A, D) + ''' + B, A, P, D = query.shape + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + mem = key.expand(B*A, -1, -1) + # N, A, P, D -> N*A, P, D + query = torch.flatten(query, start_dim=0, end_dim=1) + query = self.interaction_transformer(query, mem) + query = query.view(B, A, P, D) + return query + + +class MapInteraction(BaseModule): + """ + Modeling the interaction between the agent and the map + """ + def __init__(self, + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + + self.batch_first = batch_first + self.interaction_transformer = nn.TransformerDecoderLayer(d_model=embed_dims, + nhead=num_heads, + dropout=dropout, + dim_feedforward=embed_dims*2, + batch_first=batch_first) + + def forward(self, query, key, query_pos=None, key_pos=None): + ''' + x: context query (B, A, P, D) + query_pos: mode pos embedding (B, A, P, D) + ''' + B, A, P, D = query.shape + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # N, A, P, D -> N*A, P, D + query = torch.flatten(query, start_dim=0, end_dim=1) + mem = key.expand(B*A, -1, -1) + query = self.interaction_transformer(query, mem) + query = query.view(B, A, P, D) + return query + + +class IntentionInteraction(BaseModule): + """ + Modeling the interaction between anchors + """ + def __init__(self, + embed_dims=256, + num_heads=8, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + + self.batch_first = batch_first + self.interaction_transformer = nn.TransformerEncoderLayer(d_model=embed_dims, + nhead=num_heads, + dropout=dropout, + dim_feedforward=embed_dims*2, + batch_first=batch_first) + + def forward(self, query): + B, A, P, D = query.shape + # B, A, P, D -> B*A,P, D + rebatch_x = 
torch.flatten(query, start_dim=0, end_dim=1) + rebatch_x = self.interaction_transformer(rebatch_x) + out = rebatch_x.view(B, A, P, D) + return out diff --git a/mmcv/models/dense_heads/motion_head_plugin/motion_deformable_attn.py b/mmcv/models/dense_heads/motion_head_plugin/motion_deformable_attn.py new file mode 100644 index 0000000..c91a7c1 --- /dev/null +++ b/mmcv/models/dense_heads/motion_head_plugin/motion_deformable_attn.py @@ -0,0 +1,632 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import copy +import warnings +import torch +import math +import torch.nn as nn + +from einops import rearrange, repeat +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +from mmcv.models.utils import xavier_init, constant_init +from mmcv.models.bricks.registry import ATTENTION, TRANSFORMER_LAYER +from mmcv.models.bricks.transformer import build_attention, build_feedforward_network, build_norm_layer +from mmcv.models.bricks.drop import build_dropout +from mmcv.models.backbones.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import ConfigDict, deprecated_api_warning +from mmcv.models.modules.multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 + + +@TRANSFORMER_LAYER.register_module() +class MotionTransformerAttentionLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. 
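+    Example (illustrative only; the dims and counts below are assumptions, not
+    taken from any released config):
+        >>> layer_cfg = dict(
+        ...     type='MotionTransformerAttentionLayer',
+        ...     batch_first=True,
+        ...     attn_cfgs=[dict(type='MotionDeformableAttention',
+        ...                     embed_dims=256, num_levels=1, num_points=4)],
+        ...     ffn_cfgs=dict(type='FFN', embed_dims=256,
+        ...                   feedforward_channels=512, num_fcs=2),
+        ...     operation_order=('cross_attn', 'norm', 'ffn', 'norm'))
+        >>> # build_transformer_layer(layer_cfg) would then return this layer type.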
+ """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ', DeprecationWarning) + ffn_cfgs[new_name] = kwargs[ori_name] + + super().__init__(init_cfg) + + self.batch_first = batch_first + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index], + dict(type='FFN'))) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. 
+ attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. + """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + +@ATTENTION.register_module() +class MotionDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
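+    Note:
+        Unlike the standard multi-scale deformable attention, the sampling
+        reference here is a set of per-agent reference trajectories
+        (`reference_trajs` in :meth:`forward`); the trajectory points are shifted
+        into ego/BEV coordinates and normalized by `bev_range` before the learned
+        sampling offsets are added.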
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_steps=1, + sample_index=-1, + im2col_step=64, + dropout=0.1, + bev_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + voxel_size=[0.2, 0.2, 8], + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + self.bev_range = bev_range + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_steps = num_steps + self.sample_index = sample_index + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_steps * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_steps * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = Sequential(nn.Linear(num_steps*embed_dims, embed_dims), + nn.LayerNorm(embed_dims), + nn.ReLU(inplace=True) + ) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, 1, + 2).repeat(1, self.num_steps, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + spatial_shapes=None, + level_start_index=None, + bbox_results=None, + reference_trajs=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. 
Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + bs, num_agent, num_mode, _ = query.shape + num_query = num_agent * num_mode + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + query = torch.flatten(query, start_dim=1, end_dim=2) + + value = value.permute(1, 0, 2) + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_steps, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_steps, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_steps, + self.num_levels, + self.num_points) + # bs, n_query, n_head, n_steps, N_level, N_points, 2 + # BS NUM_AGENT NUM_MODE 12 NUM_LEVEL 2 + if reference_trajs.shape[-1] == 2: + reference_trajs = reference_trajs[:, :, :, [self.sample_index], :, :] + reference_trajs_ego = self.agent_coords_to_ego_coords(copy.deepcopy(reference_trajs), bbox_results).detach() + reference_trajs_ego = torch.flatten(reference_trajs_ego, start_dim=1, end_dim=2) + reference_trajs_ego = reference_trajs_ego[:, :, None, :, :, None, :] + reference_trajs_ego[..., 0] -= self.bev_range[0] + reference_trajs_ego[..., 1] -= self.bev_range[1] + reference_trajs_ego[..., 0] /= (self.bev_range[3] - self.bev_range[0]) + reference_trajs_ego[..., 1] /= (self.bev_range[4] - self.bev_range[1]) + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_trajs_ego \ + + sampling_offsets \ + / offset_normalizer[None, None, None, None, :, None, :] + + sampling_locations = rearrange(sampling_locations, 'bs nq nh ns nl np c -> bs nq ns nh nl np c') # permute([0,1,3,2,4,5,6]) + attention_weights = rearrange(attention_weights, 'bs nq nh ns nl np -> bs nq ns nh nl np') #.permute([0,1,3,2,4,5]) + sampling_locations = sampling_locations.reshape(bs, num_query*self.num_steps, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = attention_weights.reshape(bs, num_query*self.num_steps, self.num_heads, self.num_levels, self.num_points) + + else: + raise ValueError( + f'Last dim of reference_trajs must be' + f' 2 or 4, but get {reference_trajs.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because 
it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + output = output.view(bs, num_query, self.num_steps, -1) + output = torch.flatten(output, start_dim=2, end_dim=3) + output = self.output_proj(output) + output = output.view(bs, num_agent, num_mode, -1) + return self.dropout(output) + identity + + def agent_coords_to_ego_coords(self, reference_trajs, bbox_results): + batch_size = len(bbox_results) + reference_trajs_ego = [] + for i in range(batch_size): + boxes_3d, scores, labels, bbox_index, mask = bbox_results[i] + det_centers = boxes_3d.gravity_center.to(reference_trajs.device) + batch_reference_trajs = reference_trajs[i] + batch_reference_trajs += det_centers[:, None, None, None, :2] + reference_trajs_ego.append(batch_reference_trajs) + return torch.stack(reference_trajs_ego) + + def rot_2d(self, yaw): + sy, cy = torch.sin(yaw), torch.cos(yaw) + out = torch.stack([torch.stack([cy, -sy]), torch.stack([sy, cy])]).permute([2,0,1]) + return out + +@ATTENTION.register_module() +class CustomModeMultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + **kwargs): + super().__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiheadAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. 
+ Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. + """ + query_pos = query_pos.unsqueeze(1) + key_pos = key_pos.unsqueeze(1) + bs, n_agent, n_query, D = query.shape + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. 
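+        # In addition, the agent dimension is folded into the batch:
+        # (bs, n_agent, n_query, D) -> (bs * n_agent, n_query, D), so attention
+        # runs over the modes of each agent independently.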
+ query = torch.flatten(query, start_dim=0, end_dim=1) + key = torch.flatten(key, start_dim=0, end_dim=1) + value = torch.flatten(value, start_dim=0, end_dim=1) + identity = torch.flatten(identity, start_dim=0, end_dim=1) + + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + + out = out.transpose(0, 1) + out = identity + self.dropout_layer(self.proj_drop(out)) + + return out.view(bs, n_agent, n_query, D) \ No newline at end of file diff --git a/mmcv/models/dense_heads/motion_head_plugin/motion_optimization.py b/mmcv/models/dense_heads/motion_head_plugin/motion_optimization.py new file mode 100644 index 0000000..0dced26 --- /dev/null +++ b/mmcv/models/dense_heads/motion_head_plugin/motion_optimization.py @@ -0,0 +1,218 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +from dataclasses import dataclass +from enum import Enum, auto +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import numpy.typing as npt +from casadi import DM, Opti, OptiSol, cos, diff, sin, sumsqr, vertcat +Pose = Tuple[float, float, float] # (x, y, yaw) + + +class MotionNonlinearSmoother: + """ + Smoothing a set of xy observations with a vehicle dynamics model. + Solved with direct multiple-shooting. + modified from https://github.com/motional/nuplan-devkit + :param trajectory_len: trajectory length + :param dt: timestep (sec) + """ + + def __init__(self, trajectory_len: int, dt: float): + """ + :param trajectory_len: the length of trajectory to be optimized. + :param dt: the time interval between trajectory points. + """ + self.dt = dt + self.trajectory_len = trajectory_len + self.current_index = 0 + # Use a array of dts to make it compatible to situations with varying dts across different time steps. + self._dts: npt.NDArray[np.float32] = np.asarray( + [[dt] * trajectory_len]) + self._init_optimization() + + def _init_optimization(self) -> None: + """ + Initialize related variables and constraints for optimization. + """ + self.nx = 4 # state dim + self.nu = 2 # control dim + + self._optimizer = Opti() # Optimization problem + self._create_decision_variables() + self._create_parameters() + self._set_dynamic_constraints() + self._set_state_constraints() + self._set_control_constraints() + self._set_objective() + + # Set default solver options (quiet) + self._optimizer.solver( + "ipopt", {"ipopt.print_level": 0, "print_time": 0, "ipopt.sb": "yes"}) + + def set_reference_trajectory(self, x_curr: Sequence[float], reference_trajectory: Sequence[Pose]) -> None: + """ + Set the reference trajectory that the smoother is trying to loosely track. 
+        :param x_curr: current state of size nx (x, y, yaw, speed)
+        :param reference_trajectory: N+1 x 3 reference, where the second dim is for (x, y, yaw)
+        """
+        self._check_inputs(x_curr, reference_trajectory)
+
+        self._optimizer.set_value(self.x_curr, DM(x_curr))
+        self._optimizer.set_value(self.ref_traj, DM(reference_trajectory).T)
+        self._set_initial_guess(x_curr, reference_trajectory)
+
+    def set_solver_optimizerons(self, options: Dict[str, Any]) -> None:
+        """
+        Control solver options including verbosity.
+        :param options: Dictionary containing optimization criteria
+        """
+        self._optimizer.solver("ipopt", options)
+
+    def solve(self) -> OptiSol:
+        """
+        Solve the optimization problem. Assumes the reference trajectory was already set.
+        :return: the CasADi optimization solution
+        """
+        return self._optimizer.solve()
+
+    def _create_decision_variables(self) -> None:
+        """
+        Define the decision variables for the trajectory optimization.
+        """
+        # State trajectory (x, y, yaw, speed)
+        self.state = self._optimizer.variable(self.nx, self.trajectory_len + 1)
+        self.position_x = self.state[0, :]
+        self.position_y = self.state[1, :]
+        self.yaw = self.state[2, :]
+        self.speed = self.state[3, :]
+
+        # Control trajectory (curvature, accel)
+        self.control = self._optimizer.variable(self.nu, self.trajectory_len)
+        self.curvature = self.control[0, :]
+        self.accel = self.control[1, :]
+
+        # Derived control and state variables; dt[:, 1:] because the state vector is one step longer than the action sequence.
+        self.curvature_rate = diff(self.curvature) / self._dts[:, 1:]
+        self.jerk = diff(self.accel) / self._dts[:, 1:]
+        self.lateral_accel = self.speed[: self.trajectory_len] ** 2 * \
+            self.curvature
+
+    def _create_parameters(self) -> None:
+        """
+        Define the expert trajectory and current position for the trajectory optimization.
+ """ + self.ref_traj = self._optimizer.parameter( + 3, self.trajectory_len + 1) # (x, y, yaw) + self.x_curr = self._optimizer.parameter(self.nx, 1) + + def _set_dynamic_constraints(self) -> None: + r""" + Set the system dynamics constraints as following: + dx/dt = f(x,u) + \dot{x} = speed * cos(yaw) + \dot{y} = speed * sin(yaw) + \dot{yaw} = speed * curvature + \dot{speed} = accel + """ + state = self.state + control = self.control + dt = self.dt + + def process(x: Sequence[float], u: Sequence[float]) -> Any: + """Process for state propagation.""" + return vertcat(x[3] * cos(x[2]), x[3] * sin(x[2]), x[3] * u[0], u[1]) + + for k in range(self.trajectory_len): # loop over control intervals + # Runge-Kutta 4 integration + k1 = process(state[:, k], control[:, k]) + k2 = process(state[:, k] + dt / 2 * k1, control[:, k]) + k3 = process(state[:, k] + dt / 2 * k2, control[:, k]) + k4 = process(state[:, k] + dt * k3, control[:, k]) + next_state = state[:, k] + dt / 6 * (k1 + 2 * k2 + 2 * k3 + k4) + self._optimizer.subject_to( + state[:, k + 1] == next_state) # close the gaps + + def _set_control_constraints(self) -> None: + """Set the hard control constraints.""" + curvature_limit = 1.0 / 5.0 # 1/m + self._optimizer.subject_to( + self._optimizer.bounded(-curvature_limit, self.curvature, curvature_limit)) + accel_limit = 4.0 # m/s^2 + self._optimizer.subject_to( + self._optimizer.bounded(-accel_limit, self.accel, accel_limit)) + + def _set_state_constraints(self) -> None: + """Set the hard state constraints.""" + # Constrain the current time -- NOT start of history + # initial boundary condition + self._optimizer.subject_to( + self.state[:, self.current_index] == self.x_curr) + + max_speed = 35.0 # m/s + self._optimizer.subject_to(self._optimizer.bounded( + 0.0, self.speed, max_speed)) # only forward + max_yaw_rate = 1.75 # rad/s + self._optimizer.subject_to( + self._optimizer.bounded(-max_yaw_rate, diff(self.yaw) / self._dts, max_yaw_rate)) + max_lateral_accel = 4.0 # m/s^2, assumes circular motion acc_lat = speed^2 * curvature + self._optimizer.subject_to( + self._optimizer.bounded( + -max_lateral_accel, self.speed[:, : self.trajectory_len] ** 2 * + self.curvature, max_lateral_accel + ) + ) + + def _set_objective(self) -> None: + """Set the objective function. 
Use care when modifying these weights.""" + # Follow reference, minimize control rates and absolute inputs + alpha_xy = 1.0 + alpha_yaw = 0.1 + alpha_rate = 0.08 + alpha_abs = 0.08 + alpha_lat_accel = 0.06 + cost_stage = ( + alpha_xy * + sumsqr(self.ref_traj[:2, :] - + vertcat(self.position_x, self.position_y)) + + alpha_yaw * sumsqr(self.ref_traj[2, :] - self.yaw) + + alpha_rate * (sumsqr(self.curvature_rate) + sumsqr(self.jerk)) + + alpha_abs * (sumsqr(self.curvature) + sumsqr(self.accel)) + + alpha_lat_accel * sumsqr(self.lateral_accel) + ) + + # Take special care with the final state + alpha_terminal_xy = 1.0 + alpha_terminal_yaw = 40.0 # really care about final heading to help with lane changes + cost_terminal = alpha_terminal_xy * sumsqr( + self.ref_traj[:2, -1] - + vertcat(self.position_x[-1], self.position_y[-1]) + ) + alpha_terminal_yaw * sumsqr(self.ref_traj[2, -1] - self.yaw[-1]) + + self._optimizer.minimize( + cost_stage + self.trajectory_len / 4.0 * cost_terminal) + + def _set_initial_guess(self, x_curr: Sequence[float], reference_trajectory: Sequence[Pose]) -> None: + """Set a warm-start for the solver based on the reference trajectory.""" + self._check_inputs(x_curr, reference_trajectory) + + # Initialize state guess based on reference + self._optimizer.set_initial(self.state[:3, :], DM( + reference_trajectory).T) # (x, y, yaw) + self._optimizer.set_initial(self.state[3, :], DM(x_curr[3])) # speed + + def _check_inputs(self, x_curr: Sequence[float], reference_trajectory: Sequence[Pose]) -> None: + """Raise ValueError if inputs are not of proper size.""" + if len(x_curr) != self.nx: + raise ValueError( + f"x_curr length {len(x_curr)} must be equal to state dim {self.nx}") + + if len(reference_trajectory) != self.trajectory_len + 1: + raise ValueError( + f"reference traj length {len(reference_trajectory)} must be equal to {self.trajectory_len + 1}" + ) diff --git a/mmcv/models/dense_heads/motion_head_plugin/motion_utils.py b/mmcv/models/dense_heads/motion_head_plugin/motion_utils.py new file mode 100644 index 0000000..48ef857 --- /dev/null +++ b/mmcv/models/dense_heads/motion_head_plugin/motion_utils.py @@ -0,0 +1,99 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import random +import numpy as np +from .motion_optimization import MotionNonlinearSmoother + + +def nonlinear_smoother(gt_bboxes_3d, gt_fut_traj, gt_fut_traj_mask, bbox_tensor): + """ + This function applies a nonlinear smoother to the ground truth future trajectories of 3D bounding boxes. + It takes into account the vehicle's yaw and velocity to generate smooth, realistic trajectories. + + Args: + gt_bboxes_3d (torch.Tensor): Ground truth 3D bounding boxes of shape (batch_size, 7). + gt_fut_traj (torch.Tensor): Ground truth future trajectories of shape (batch_size, 12, 2). + gt_fut_traj_mask (torch.Tensor): A mask indicating valid timesteps in the ground truth future trajectories of shape (batch_size, 12). + bbox_tensor (torch.Tensor): A tensor representing the bounding box properties of shape (batch_size, 9). + + Returns: + torch.Tensor: The perturbed trajectories of shape (batch_size, 12, 2). 
+ torch.Tensor: The updated mask indicating valid timesteps in the perturbed trajectories of the same shape as gt_fut_traj_mask. + """ + device = gt_fut_traj.device + dtype = gt_fut_traj.dtype + gt_bboxes_3d = gt_bboxes_3d.cpu().detach().numpy() + gt_fut_traj = gt_fut_traj.cpu().detach().numpy() + gt_fut_traj_xy_diff = np.zeros((gt_fut_traj.shape[0], 13, 2)) + gt_fut_traj_xy_diff[:, 1:, :] = gt_fut_traj + gt_fut_traj_xy_diff = np.diff(gt_fut_traj_xy_diff, axis=1) + gt_fut_traj_yaw = np.arctan2( + gt_fut_traj_xy_diff[:, :, 1], gt_fut_traj_xy_diff[:, :, 0]) + gt_fut_traj_yaw = np.concatenate( + [-np.pi/2 - gt_bboxes_3d[:, None, 6:7], gt_fut_traj_yaw[:, :, None]], axis=1) + gt_fut_traj = np.concatenate( + [gt_bboxes_3d[:, None, :2], gt_fut_traj], axis=1) + + gt_fut_traj_mask = gt_fut_traj_mask.cpu().detach().numpy() + bbox_tensor = bbox_tensor.cpu().detach().numpy() + ts_limit = gt_fut_traj_mask.sum(1)[:, 0] + yaw_preds = bbox_tensor[:, 6] + vel_preds = bbox_tensor[:, -2:] + speed_preds = np.sqrt(np.sum(vel_preds**2, axis=-1)) + traj_perturb_all = [] + + # we set some constraints here to avoid perturbing the trajectories that are not dynamic, + # or have large differences with the ground truth + def _is_dynamic(traj, ts, dist_thres): + return np.sqrt(np.sum((traj[ts, :2] - traj[0, :2])**2)) > dist_thres + + def _check_diff(x_curr, ref_traj): + if np.sqrt((x_curr[0] - ref_traj[0, 0]) ** 2 + (x_curr[1] - ref_traj[0, 1])**2) > 2: + return False + a = np.array([np.cos(x_curr[2]), np.sin(x_curr[2])]) + b = np.array([np.cos(ref_traj[0, 2]), np.sin(ref_traj[0, 2])]) + diff_theta = np.arccos( + np.sum(a*b)/(np.sqrt(np.sum(a**2)) * np.sqrt(np.sum(b**2)))) + if diff_theta > np.pi/180 * 30: + return False + return True + + def _check_ade(traj_pert, traj_ref, thres): + return np.mean(np.sqrt(np.sum((traj_pert[:, :2] - traj_ref[:, :2])**2, axis=-1))) < thres + + perturb_count = 0 + perturb_used_count = 0 + for i in range(gt_fut_traj.shape[0]): + ts = ts_limit[i] + x_curr = [bbox_tensor[i, 0], bbox_tensor[i, 1], - + np.pi/2 - yaw_preds[i], speed_preds[i]] + reference_trajectory = np.concatenate( + [gt_fut_traj[i], gt_fut_traj_yaw[i]], axis=-1) + if ts > 1 and _is_dynamic(gt_fut_traj[i], int(ts), 2) and _check_diff(x_curr, reference_trajectory): + smoother = MotionNonlinearSmoother( + trajectory_len=int(ts), dt=0.5) + reference_trajectory = reference_trajectory[:int(ts)+1, :] + smoother.set_reference_trajectory(x_curr, reference_trajectory) + sol = smoother.solve() + traj_perturb = np.stack( + [sol.value(smoother.position_x), sol.value(smoother.position_y)], axis=-1) + perturb_used_count += 1 + if not _check_ade(traj_perturb, reference_trajectory, thres=1.5): + traj_perturb = gt_fut_traj[i, 1:, + :2] - gt_fut_traj[i, 0:1, :2] + else: + traj_perturb_tmp = traj_perturb[1:, + :2] - traj_perturb[0:1, :2] + traj_perturb = np.zeros((12, 2)) + traj_perturb[:traj_perturb_tmp.shape[0], + :] = traj_perturb_tmp[:, :2] + perturb_count += 1 + else: + traj_perturb = gt_fut_traj[i, 1:, :2] - gt_fut_traj[i, 0:1, :2] + traj_perturb_all.append(traj_perturb) + return torch.tensor(traj_perturb_all, device=device, dtype=dtype), torch.tensor(gt_fut_traj_mask > 0, device=device) diff --git a/mmcv/models/dense_heads/occ_head.py b/mmcv/models/dense_heads/occ_head.py new file mode 100644 index 0000000..5ad8a9a --- /dev/null +++ b/mmcv/models/dense_heads/occ_head.py @@ -0,0 +1,482 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving 
(https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.builder import HEADS, build_loss +from mmcv.models.backbones.base_module import BaseModule +from einops import rearrange +from mmcv.core.utils import reduce_mean +from mmcv.models.bricks.transformer import build_transformer_layer_sequence +import copy +from .occ_head_plugin import MLP, BevFeatureSlicer, SimpleConv2d, CVT_Decoder, Bottleneck, UpsamplingAdd, \ + predict_instance_segmentation_and_trajectories + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + +@HEADS.register_module() +class OccHead(BaseModule): + def __init__(self, + # General + receptive_field=3, + n_future=4, + spatial_extent=(50, 50), + ignore_index=255, + + # BEV + grid_conf = None, + + bev_size=(200, 200), + bev_emb_dim=256, + bev_proj_dim=64, + bev_proj_nlayers=1, + + # Query + query_dim=256, + query_mlp_layers=3, + detach_query_pos=True, + temporal_mlp_layer=2, + + # Transformer + transformer_decoder=None, + + attn_mask_thresh=0.5, + # Loss + sample_ignore_mode='all_valid', + aux_loss_weight=1., + + loss_mask=None, + loss_dice=None, + + # Cfgs + init_cfg=None, + + # Eval + pan_eval=False, + test_seg_thresh:float=0.5, + test_with_track_score=False, + ): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg) + self.receptive_field = receptive_field # NOTE: Used by prepare_future_labels in E2EPredTransformer + self.n_future = n_future + self.spatial_extent = spatial_extent + self.ignore_index = ignore_index + + bevformer_bev_conf = { + 'xbound': [-51.2, 51.2, 0.512], + 'ybound': [-51.2, 51.2, 0.512], + 'zbound': [-10.0, 10.0, 20.0], + } + self.bev_sampler = BevFeatureSlicer(bevformer_bev_conf, grid_conf) + + self.bev_size = bev_size + self.bev_proj_dim = bev_proj_dim + + if bev_proj_nlayers == 0: + self.bev_light_proj = nn.Sequential() + else: + self.bev_light_proj = SimpleConv2d( + in_channels=bev_emb_dim, + conv_channels=bev_emb_dim, + out_channels=bev_proj_dim, + num_conv=bev_proj_nlayers, + ) + + # Downscale bev_feat -> /4 + self.base_downscale = nn.Sequential( + Bottleneck(in_channels=bev_proj_dim, downsample=True), + Bottleneck(in_channels=bev_proj_dim, downsample=True) + ) + + # Future blocks with transformer + self.n_future_blocks = self.n_future + 1 + + # - transformer + self.attn_mask_thresh = attn_mask_thresh + + self.num_trans_layers = transformer_decoder.num_layers + assert self.num_trans_layers % self.n_future_blocks == 0 + + self.num_heads = transformer_decoder.transformerlayers.\ + attn_cfgs.num_heads + self.transformer_decoder = build_transformer_layer_sequence( + transformer_decoder) + + # - temporal-mlps + # query_out_dim = bev_proj_dim + + temporal_mlp = MLP(query_dim, query_dim, bev_proj_dim, num_layers=temporal_mlp_layer) + self.temporal_mlps = _get_clones(temporal_mlp, self.n_future_blocks) + + # - downscale-convs + downscale_conv = Bottleneck(in_channels=bev_proj_dim, downsample=True) + self.downscale_convs = _get_clones(downscale_conv, self.n_future_blocks) + + # - upsampleAdds + upsample_add = UpsamplingAdd(in_channels=bev_proj_dim, out_channels=bev_proj_dim) + self.upsample_adds = _get_clones(upsample_add, self.n_future_blocks) + + # Decoder + 
self.dense_decoder = CVT_Decoder( + dim=bev_proj_dim, + blocks=[bev_proj_dim, bev_proj_dim], + ) + + # Query + self.mode_fuser = nn.Sequential( + nn.Linear(query_dim, bev_proj_dim), + nn.LayerNorm(bev_proj_dim), + nn.ReLU(inplace=True) + ) + self.multi_query_fuser = nn.Sequential( + nn.Linear(query_dim * 3, query_dim * 2), + nn.LayerNorm(query_dim * 2), + nn.ReLU(inplace=True), + nn.Linear(query_dim * 2, bev_proj_dim), + ) + + self.detach_query_pos = detach_query_pos + + self.query_to_occ_feat = MLP( + query_dim, query_dim, bev_proj_dim, num_layers=query_mlp_layers + ) + self.temporal_mlp_for_mask = copy.deepcopy(self.query_to_occ_feat) + + # Loss + # For matching + self.sample_ignore_mode = sample_ignore_mode + assert self.sample_ignore_mode in ['all_valid', 'past_valid', 'none'] + + self.aux_loss_weight = aux_loss_weight + + self.loss_dice = build_loss(loss_dice) + self.loss_mask = build_loss(loss_mask) + + self.pan_eval = pan_eval + self.test_seg_thresh = test_seg_thresh + + self.test_with_track_score = test_with_track_score + self.init_weights() + + def init_weights(self): + for p in self.transformer_decoder.parameters(): + if p.dim() > 1: + nn.init.xavier_normal_(p) + + def get_attn_mask(self, state, ins_query): + # state: b, c, h, w + # ins_query: b, q, c + ins_embed = self.temporal_mlp_for_mask( + ins_query + ) + mask_pred = torch.einsum("bqc,bchw->bqhw", ins_embed, state) + attn_mask = mask_pred.sigmoid() < self.attn_mask_thresh + attn_mask = rearrange(attn_mask, 'b q h w -> b (h w) q').unsqueeze(1).repeat( + 1, self.num_heads, 1, 1).flatten(0, 1) + attn_mask = attn_mask.detach() + + # if a mask is all True(all background), then set it all False. + attn_mask[torch.where( + attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + upsampled_mask_pred = F.interpolate( + mask_pred, + self.bev_size, + mode='bilinear', + align_corners=False + ) # Supervised by gt + + return attn_mask, upsampled_mask_pred, ins_embed + + def forward(self, x, ins_query): + base_state = rearrange(x, '(h w) b d -> b d h w', h=self.bev_size[0]) + + base_state = self.bev_sampler(base_state) + base_state = self.bev_light_proj(base_state) + base_state = self.base_downscale(base_state) + base_ins_query = ins_query + + last_state = base_state + last_ins_query = base_ins_query + future_states = [] + mask_preds = [] + temporal_query = [] + temporal_embed_for_mask_attn = [] + n_trans_layer_each_block = self.num_trans_layers // self.n_future_blocks + assert n_trans_layer_each_block >= 1 + + for i in range(self.n_future_blocks): + # Downscale + cur_state = self.downscale_convs[i](last_state) # /4 -> /8 + + # Attention + # temporal_aware ins_query + cur_ins_query = self.temporal_mlps[i](last_ins_query) # [b, q, d] + temporal_query.append(cur_ins_query) + + # Generate attn mask + attn_mask, mask_pred, cur_ins_emb_for_mask_attn = self.get_attn_mask(cur_state, cur_ins_query) + attn_masks = [None, attn_mask] + + mask_preds.append(mask_pred) # /1 + temporal_embed_for_mask_attn.append(cur_ins_emb_for_mask_attn) + + cur_state = rearrange(cur_state, 'b c h w -> (h w) b c') + cur_ins_query = rearrange(cur_ins_query, 'b q c -> q b c') + + for j in range(n_trans_layer_each_block): + trans_layer_ind = i * n_trans_layer_each_block + j + trans_layer = self.transformer_decoder.layers[trans_layer_ind] + cur_state = trans_layer( + query=cur_state, # [h'*w', b, c] + key=cur_ins_query, # [nq, b, c] + value=cur_ins_query, # [nq, b, c] + query_pos=None, + key_pos=None, + attn_masks=attn_masks, + query_key_padding_mask=None, + 
key_padding_mask=None + ) # out size: [h'*w', b, c] + + cur_state = rearrange(cur_state, '(h w) b c -> b c h w', h=self.bev_size[0]//8) + + # Upscale to /4 + cur_state = self.upsample_adds[i](cur_state, last_state) + + # Out + future_states.append(cur_state) # [b, d, h/4, w/4] + last_state = cur_state + + future_states = torch.stack(future_states, dim=1) # [b, t, d, h/4, w/4] + temporal_query = torch.stack(temporal_query, dim=1) # [b, t, q, d] + mask_preds = torch.stack(mask_preds, dim=2) # [b, q, t, h, w] + ins_query = torch.stack(temporal_embed_for_mask_attn, dim=1) # [b, t, q, d] + + # Decode future states to larger resolution + future_states = self.dense_decoder(future_states) + ins_occ_query = self.query_to_occ_feat(ins_query) # [b, t, q, query_out_dim] + + # Generate final outputs + ins_occ_logits = torch.einsum("btqc,btchw->bqthw", ins_occ_query, future_states) + + return mask_preds, ins_occ_logits + + def merge_queries(self, outs_dict, detach_query_pos=True): + ins_query = outs_dict.get('traj_query', None) # [n_dec, b, nq, n_modes, dim] + track_query = outs_dict['track_query'] # [b, nq, d] + track_query_pos = outs_dict['track_query_pos'] # [b, nq, d] + + if detach_query_pos: + track_query_pos = track_query_pos.detach() + + ins_query = ins_query[-1] + ins_query = self.mode_fuser(ins_query).max(2)[0] + ins_query = self.multi_query_fuser(torch.cat([ins_query, track_query, track_query_pos], dim=-1)) + + return ins_query + + # With matched queries [a small part of all queries] and matched_gt results + def forward_train( + self, + bev_feat, + outs_dict, + gt_inds_list=None, + gt_segmentation=None, + gt_instance=None, + gt_img_is_valid=None, + ): + # Generate warpped gt and related inputs + gt_segmentation, gt_instance, gt_img_is_valid = self.get_occ_labels(gt_segmentation, gt_instance, gt_img_is_valid) + + all_matched_gt_ids = outs_dict['all_matched_idxes'] # list of tensor, length bs + + ins_query = self.merge_queries(outs_dict, self.detach_query_pos) + + # Forward the occ-flow model + mask_preds_batch, ins_seg_preds_batch = self(bev_feat, ins_query=ins_query) + + # Get pred and gt + ins_seg_targets_batch = gt_instance # [1, 5, 200, 200] [b, t, h, w] # ins targets of a batch + + # img_valid flag, for filtering out invalid samples in sequence when calculating loss + img_is_valid = gt_img_is_valid # [1, 7] + assert img_is_valid.size(1) == self.receptive_field + self.n_future, \ + f"Img_is_valid can only be 7 as for loss calculation and evaluation!!! 
Don't change it" + frame_valid_mask = img_is_valid.bool() + past_valid_mask = frame_valid_mask[:, :self.receptive_field] + future_frame_mask = frame_valid_mask[:, (self.receptive_field-1):] # [1, 5] including current frame + + # only supervise when all 3 past frames are valid + past_valid = past_valid_mask.all(dim=1) + future_frame_mask[~past_valid] = False + + # Calculate loss in the batch + loss_dict = dict() + loss_dice = ins_seg_preds_batch.new_zeros(1)[0].float() + loss_mask = ins_seg_preds_batch.new_zeros(1)[0].float() + loss_aux_dice = ins_seg_preds_batch.new_zeros(1)[0].float() + loss_aux_mask = ins_seg_preds_batch.new_zeros(1)[0].float() + + bs = ins_query.size(0) + assert bs == 1 + for ind in range(bs): + # Each gt_bboxes contains 3 frames, we only use the last one + cur_gt_inds = gt_inds_list[ind][-1] + + cur_matched_gt = all_matched_gt_ids[ind] # [n_gt] + + # Re-order gt according to matched_gt_inds + cur_gt_inds = cur_gt_inds[cur_matched_gt] + + # Deal matched_gt: -1, its actually background(unmatched) + cur_gt_inds[cur_matched_gt == -1] = -1 # Bugfixed + cur_gt_inds[cur_matched_gt == -2] = -2 + + frame_mask = future_frame_mask[ind] # [t] + + # Prediction + ins_seg_preds = ins_seg_preds_batch[ind] # [q(n_gt for matched), t, h, w] + ins_seg_targets = ins_seg_targets_batch[ind] # [t, h, w] + mask_preds = mask_preds_batch[ind] + + # Assigned-gt + ins_seg_targets_ordered = [] + for ins_id in cur_gt_inds: + # -1 for unmatched query + # If ins_seg_targets is all 255, ignore (directly append occ-and-flow gt to list) + # 255 for special object --> change to -20 (same as in occ_label.py) + # -2 for no_query situation + if (ins_seg_targets == self.ignore_index).all().item() is True: + ins_tgt = ins_seg_targets.long() + elif ins_id.item() in [-1, -2] : # false positive query (unmatched) + ins_tgt = torch.ones_like(ins_seg_targets).long() * self.ignore_index + else: + SPECIAL_INDEX = -20 + if ins_id.item() == self.ignore_index: + ins_id = torch.ones_like(ins_id) * SPECIAL_INDEX + ins_tgt = (ins_seg_targets == ins_id).long() # [t, h, w], 0 or 1 + + ins_seg_targets_ordered.append(ins_tgt) + + ins_seg_targets_ordered = torch.stack(ins_seg_targets_ordered, dim=0) # [n_gt, t, h, w] + + # Sanity check + t, h, w = ins_seg_preds.shape[-3:] + assert t == 1+self.n_future, f"{ins_seg_preds.size()}" + assert ins_seg_preds.size() == ins_seg_targets_ordered.size(), \ + f"{ins_seg_preds.size()}, {ins_seg_targets_ordered.size()}" + + num_total_pos = ins_seg_preds.size(0) # Check this line + + # loss for a sample in batch + num_total_pos = ins_seg_preds.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + cur_dice_loss = self.loss_dice( + ins_seg_preds, ins_seg_targets_ordered, avg_factor=num_total_pos, frame_mask=frame_mask) + + cur_mask_loss = self.loss_mask( + ins_seg_preds, ins_seg_targets_ordered, frame_mask=frame_mask + ) + + cur_aux_dice_loss = self.loss_dice( + mask_preds, ins_seg_targets_ordered, avg_factor=num_total_pos, frame_mask=frame_mask + ) + cur_aux_mask_loss = self.loss_mask( + mask_preds, ins_seg_targets_ordered, frame_mask=frame_mask + ) + + loss_dice += cur_dice_loss + loss_mask += cur_mask_loss + loss_aux_dice += cur_aux_dice_loss * self.aux_loss_weight + loss_aux_mask += cur_aux_mask_loss * self.aux_loss_weight + + loss_dict['loss_dice'] = loss_dice / bs + loss_dict['loss_mask'] = loss_mask / bs + loss_dict['loss_aux_dice'] = loss_aux_dice / bs + loss_dict['loss_aux_mask'] = loss_aux_mask / bs + + return loss_dict + + def 
forward_test( + self, + bev_feat, + outs_dict, + no_query=False, + gt_segmentation=None, + gt_instance=None, + gt_img_is_valid=None, + ): + out_dict = dict() + + #import pdb;pdb.set_trace() + + if gt_segmentation is not None and gt_instance is not None: + gt_segmentation, gt_instance, gt_img_is_valid = self.get_occ_labels(gt_segmentation, gt_instance, gt_img_is_valid) + + + out_dict['seg_gt'] = gt_segmentation[:, :1+self.n_future] # [1, 5, 1, 200, 200] + out_dict['ins_seg_gt'] = self.get_ins_seg_gt(gt_instance[:, :1+self.n_future]) # [1, 5, 200, 200] + if no_query: + # output all zero results + out_dict['seg_out'] = torch.zeros((1, 5, 1, 200, 200),device=bev_feat.device).long() # [1, 5, 1, 200, 200] + out_dict['ins_seg_out'] = torch.zeros((1, 5, 1, 200, 200),device=bev_feat.device).long() # [1, 5, 200, 200] + return out_dict + + + ins_query = self.merge_queries(outs_dict, self.detach_query_pos) + + _, pred_ins_logits = self(bev_feat, ins_query=ins_query) + + out_dict['pred_ins_logits'] = pred_ins_logits + + pred_ins_logits = pred_ins_logits[:,:,:1+self.n_future] # [b, q, t, h, w] + pred_ins_sigmoid = pred_ins_logits.sigmoid() # [b, q, t, h, w] + + if self.test_with_track_score: + track_scores = outs_dict['track_scores'].to(pred_ins_sigmoid) # [b, q] + track_scores = track_scores[:, :, None, None, None] + pred_ins_sigmoid = pred_ins_sigmoid * track_scores # [b, q, t, h, w] + + out_dict['pred_ins_sigmoid'] = pred_ins_sigmoid + pred_seg_scores = pred_ins_sigmoid.max(1)[0] + seg_out = (pred_seg_scores > self.test_seg_thresh).long().unsqueeze(2) # [b, t, 1, h, w] + out_dict['seg_out'] = seg_out + if self.pan_eval: + # ins_pred + pred_consistent_instance_seg = \ + predict_instance_segmentation_and_trajectories(seg_out, pred_ins_sigmoid) # bg is 0, fg starts with 1, consecutive + + out_dict['ins_seg_out'] = pred_consistent_instance_seg # [1, 5, 200, 200] + + return out_dict + + def get_ins_seg_gt(self, gt_instance): + ins_gt_old = gt_instance # Not consecutive, 0 for bg, otherwise ins_ind(start from 1) + ins_gt_new = torch.zeros_like(ins_gt_old).to(ins_gt_old) # Make it consecutive + ins_inds_unique = torch.unique(ins_gt_old) + new_id = 1 + for uni_id in ins_inds_unique: + if uni_id.item() in [0, self.ignore_index]: # ignore background_id + continue + ins_gt_new[ins_gt_old == uni_id] = new_id + new_id += 1 + return ins_gt_new # Consecutive + + def get_occ_labels(self, gt_segmentation, gt_instance, gt_img_is_valid): + if not self.training: + gt_segmentation = gt_segmentation[0] + gt_instance = gt_instance[0] + gt_img_is_valid = gt_img_is_valid[0] + + gt_segmentation = gt_segmentation[:, :self.n_future+1].long().unsqueeze(2) + gt_instance = gt_instance[:, :self.n_future+1].long() + gt_img_is_valid = gt_img_is_valid[:, :self.receptive_field + self.n_future] + return gt_segmentation, gt_instance, gt_img_is_valid diff --git a/mmcv/models/dense_heads/occ_head_plugin/__init__.py b/mmcv/models/dense_heads/occ_head_plugin/__init__.py new file mode 100644 index 0000000..00abc5d --- /dev/null +++ b/mmcv/models/dense_heads/occ_head_plugin/__init__.py @@ -0,0 +1,3 @@ +from .utils import * +from .metrics import * +from .modules import * \ No newline at end of file diff --git a/mmcv/models/dense_heads/occ_head_plugin/metrics.py b/mmcv/models/dense_heads/occ_head_plugin/metrics.py new file mode 100644 index 0000000..680a45f --- /dev/null +++ b/mmcv/models/dense_heads/occ_head_plugin/metrics.py @@ -0,0 +1,258 @@ +#---------------------------------------------------------------------------------# +# UniAD: 
Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +from typing import Optional + +import torch +from mmcv.metrics.metric import Metric +from mmcv.metrics.classification import stat_scores_multiple_classes +from mmcv.metrics.reduction import reduce + +class IntersectionOverUnion(Metric): + """Computes intersection-over-union.""" + def __init__( + self, + n_classes: int, + ignore_index: Optional[int] = None, + absent_score: float = 0.0, + reduction: str = 'none', + ): + super().__init__() + + self.n_classes = n_classes + self.ignore_index = ignore_index + self.absent_score = absent_score + self.reduction = reduction + + self.add_state('true_positive', default=torch.zeros(n_classes), dist_reduce_fx='sum') + self.add_state('false_positive', default=torch.zeros(n_classes), dist_reduce_fx='sum') + self.add_state('false_negative', default=torch.zeros(n_classes), dist_reduce_fx='sum') + self.add_state('support', default=torch.zeros(n_classes), dist_reduce_fx='sum') + + def update(self, prediction: torch.Tensor, target: torch.Tensor): + tps, fps, _, fns, sups = stat_scores_multiple_classes(prediction, target, self.n_classes) + + self.true_positive += tps + self.false_positive += fps + self.false_negative += fns + self.support += sups + + def compute(self): + scores = torch.zeros(self.n_classes, device=self.true_positive.device, dtype=torch.float32) + + for class_idx in range(self.n_classes): + if class_idx == self.ignore_index: + continue + + tp = self.true_positive[class_idx] + fp = self.false_positive[class_idx] + fn = self.false_negative[class_idx] + sup = self.support[class_idx] + + # If this class is absent in the target (no support) AND absent in the pred (no true or false + # positives), then use the absent_score for this class. + if sup + tp + fp == 0: + scores[class_idx] = self.absent_score + continue + + denominator = tp + fp + fn + score = tp.to(torch.float) / denominator + scores[class_idx] = score + + # Remove the ignored class index from the scores. + if (self.ignore_index is not None) and (0 <= self.ignore_index < self.n_classes): + scores = torch.cat([scores[:self.ignore_index], scores[self.ignore_index+1:]]) + + return reduce(scores, reduction=self.reduction) + + +class PanopticMetric(Metric): + def __init__( + self, + n_classes: int, + temporally_consistent: bool = True, + vehicles_id: int = 1, + ): + super().__init__() + + self.n_classes = n_classes + self.temporally_consistent = temporally_consistent + self.vehicles_id = vehicles_id + self.keys = ['iou', 'true_positive', 'false_positive', 'false_negative'] + + self.add_state('iou', default=torch.zeros(n_classes), dist_reduce_fx='sum') + self.add_state('true_positive', default=torch.zeros(n_classes), dist_reduce_fx='sum') + self.add_state('false_positive', default=torch.zeros(n_classes), dist_reduce_fx='sum') + self.add_state('false_negative', default=torch.zeros(n_classes), dist_reduce_fx='sum') + + def update(self, pred_instance, gt_instance): + """ + Update state with predictions and targets. + + Parameters + ---------- + pred_instance: (b, s, h, w) + Temporally consistent instance segmentation prediction. + gt_instance: (b, s, h, w) + Ground truth instance segmentation. 
+ """ + batch_size, sequence_length = gt_instance.shape[:2] + # Process labels + assert gt_instance.min() == 0, 'ID 0 of gt_instance must be background' + pred_segmentation = (pred_instance > 0).long() + gt_segmentation = (gt_instance > 0).long() + + for b in range(batch_size): + unique_id_mapping = {} + for t in range(sequence_length): + result = self.panoptic_metrics( + pred_segmentation[b, t].detach(), + pred_instance[b, t].detach(), + gt_segmentation[b, t], + gt_instance[b, t], + unique_id_mapping, + ) + + self.iou += result['iou'] + self.true_positive += result['true_positive'] + self.false_positive += result['false_positive'] + self.false_negative += result['false_negative'] + + def compute(self): + denominator = torch.maximum( + (self.true_positive + self.false_positive / 2 + self.false_negative / 2), + torch.ones_like(self.true_positive) + ) + pq = self.iou / denominator + sq = self.iou / torch.maximum(self.true_positive, torch.ones_like(self.true_positive)) + rq = self.true_positive / denominator + + return {'pq': pq, + 'sq': sq, + 'rq': rq, + # If 0, it means there wasn't any detection. + 'denominator': (self.true_positive + self.false_positive / 2 + self.false_negative / 2), + } + + def panoptic_metrics(self, pred_segmentation, pred_instance, gt_segmentation, gt_instance, unique_id_mapping): + """ + Computes panoptic quality metric components. + + Parameters + ---------- + pred_segmentation: [H, W] range {0, ..., n_classes-1} (>= n_classes is void) + pred_instance: [H, W] range {0, ..., n_instances} (zero means background) + gt_segmentation: [H, W] range {0, ..., n_classes-1} (>= n_classes is void) + gt_instance: [H, W] range {0, ..., n_instances} (zero means background) + unique_id_mapping: instance id mapping to check consistency + """ + n_classes = self.n_classes + + result = {key: torch.zeros(n_classes, dtype=torch.float32, device=gt_instance.device) for key in self.keys} + + assert pred_segmentation.dim() == 2 + assert pred_segmentation.shape == pred_instance.shape == gt_segmentation.shape == gt_instance.shape + + n_instances = int(torch.cat([pred_instance, gt_instance]).max().item()) + n_all_things = n_instances + n_classes # Classes + instances. + n_things_and_void = n_all_things + 1 + + # Now 1 is background; 0 is void (not used). 2 is vehicle semantic class but since it overlaps with + # instances, it is not present. + # and the rest are instance ids starting from 3 + prediction, pred_to_cls = self.combine_mask(pred_segmentation, pred_instance, n_classes, n_all_things) + target, target_to_cls = self.combine_mask(gt_segmentation, gt_instance, n_classes, n_all_things) + + # Compute ious between all stuff and things + # hack for bincounting 2 arrays together + x = prediction + n_things_and_void * target + bincount_2d = torch.bincount(x.long(), minlength=n_things_and_void ** 2) + if bincount_2d.shape[0] != n_things_and_void ** 2: + raise ValueError('Incorrect bincount size.') + conf = bincount_2d.reshape((n_things_and_void, n_things_and_void)) + # Drop void class + conf = conf[1:, 1:] + + # Confusion matrix contains intersections between all combinations of classes + union = conf.sum(0).unsqueeze(0) + conf.sum(1).unsqueeze(1) - conf + iou = torch.where(union > 0, (conf.float() + 1e-9) / (union.float() + 1e-9), torch.zeros_like(union).float()) + + # In the iou matrix, first dimension is target idx, second dimension is pred idx. + # Mapping will contain a tuple that maps prediction idx to target idx for segments matched by iou. 
+ mapping = (iou > 0.5).nonzero(as_tuple=False) + + # Check that classes match. + is_matching = pred_to_cls[mapping[:, 1]] == target_to_cls[mapping[:, 0]] + mapping = mapping[is_matching] + tp_mask = torch.zeros_like(conf, dtype=torch.bool) + tp_mask[mapping[:, 0], mapping[:, 1]] = True + + # First ids correspond to "stuff" i.e. semantic seg. + # Instance ids are offset accordingly + for target_id, pred_id in mapping: + cls_id = pred_to_cls[pred_id] + + if self.temporally_consistent and cls_id == self.vehicles_id: + if target_id.item() in unique_id_mapping and unique_id_mapping[target_id.item()] != pred_id.item(): + # Not temporally consistent + result['false_negative'][target_to_cls[target_id]] += 1 + result['false_positive'][pred_to_cls[pred_id]] += 1 + unique_id_mapping[target_id.item()] = pred_id.item() + continue + + result['true_positive'][cls_id] += 1 + result['iou'][cls_id] += iou[target_id][pred_id] + unique_id_mapping[target_id.item()] = pred_id.item() + + for target_id in range(n_classes, n_all_things): + # If this is a true positive do nothing. + if tp_mask[target_id, n_classes:].any(): + continue + # If this target instance didn't match with any predictions and was present set it as false negative. + if target_to_cls[target_id] != -1: + result['false_negative'][target_to_cls[target_id]] += 1 + + for pred_id in range(n_classes, n_all_things): + # If this is a true positive do nothing. + if tp_mask[n_classes:, pred_id].any(): + continue + # If this predicted instance didn't match with any prediction, set that predictions as false positive. + if pred_to_cls[pred_id] != -1 and (conf[:, pred_id] > 0).any(): + result['false_positive'][pred_to_cls[pred_id]] += 1 + + return result + + def combine_mask(self, segmentation: torch.Tensor, instance: torch.Tensor, n_classes: int, n_all_things: int): + """Shifts all things ids by num_classes and combines things and stuff into a single mask + + Returns a combined mask + a mapping from id to segmentation class. + """ + instance = instance.view(-1) + instance_mask = instance > 0 + instance = instance - 1 + n_classes + + segmentation = segmentation.clone().view(-1) + segmentation_mask = segmentation < n_classes # Remove void pixels. + + # Build an index from instance id to class id. + instance_id_to_class_tuples = torch.cat( + ( + instance[instance_mask & segmentation_mask].unsqueeze(1), + segmentation[instance_mask & segmentation_mask].unsqueeze(1), + ), + dim=1, + ) + instance_id_to_class = -instance_id_to_class_tuples.new_ones((n_all_things,)) + instance_id_to_class[instance_id_to_class_tuples[:, 0]] = instance_id_to_class_tuples[:, 1] + instance_id_to_class[torch.arange(n_classes, device=segmentation.device)] = torch.arange( + n_classes, device=segmentation.device + ) + + segmentation[instance_mask] = instance[instance_mask] + segmentation += 1 # Shift all legit classes by 1. + segmentation[~segmentation_mask] = 0 # Shift void class to zero. + + return segmentation, instance_id_to_class \ No newline at end of file diff --git a/mmcv/models/dense_heads/occ_head_plugin/modules.py b/mmcv/models/dense_heads/occ_head_plugin/modules.py new file mode 100644 index 0000000..0942210 --- /dev/null +++ b/mmcv/models/dense_heads/occ_head_plugin/modules.py @@ -0,0 +1,342 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. 
# +#---------------------------------------------------------------------------------# + +import torch +from torch import nn +import torch.utils.checkpoint as checkpoint +from .utils import calculate_birds_eye_view_parameters +import torch.nn.functional as F +from mmcv.models.backbones.base_module import BaseModule +from mmcv.models.bricks import ConvModule, build_conv_layer +from einops import rearrange +from collections import OrderedDict + +# Grid sampler +# Sample a smaller receptive-field bev from larger one +class BevFeatureSlicer(nn.Module): + def __init__(self, grid_conf, map_grid_conf): + super().__init__() + if grid_conf == map_grid_conf: + self.identity_mapping = True + else: + self.identity_mapping = False + + bev_resolution, bev_start_position, bev_dimension= calculate_birds_eye_view_parameters( + grid_conf['xbound'], grid_conf['ybound'], grid_conf['zbound'] + ) + + map_bev_resolution, map_bev_start_position, map_bev_dimension = calculate_birds_eye_view_parameters( + map_grid_conf['xbound'], map_grid_conf['ybound'], map_grid_conf['zbound'] + ) + + self.map_x = torch.arange( + map_bev_start_position[0], map_grid_conf['xbound'][1], map_bev_resolution[0]) + + self.map_y = torch.arange( + map_bev_start_position[1], map_grid_conf['ybound'][1], map_bev_resolution[1]) + + # convert to normalized coords + self.norm_map_x = self.map_x / (- bev_start_position[0]) + self.norm_map_y = self.map_y / (- bev_start_position[1]) + + tmp_m, tmp_n = torch.meshgrid( + self.norm_map_x, self.norm_map_y) # indexing 'ij' + tmp_m, tmp_n = tmp_m.T, tmp_n.T # change it to the 'xy' mode results + + self.map_grid = torch.stack([tmp_m, tmp_n], dim=2) + + def forward(self, x): + # x: bev feature map tensor of shape (b, c, h, w) + if self.identity_mapping: + return x + else: + grid = self.map_grid.unsqueeze(0).type_as( + x).repeat(x.shape[0], 1, 1, 1) # (b, h, w, 2) + + return F.grid_sample(x, grid=grid, mode='bilinear', align_corners=True) + +# General layers +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + +class SimpleConv2d(BaseModule): + def __init__(self, in_channels, + out_channels, + + conv_channels=64, + num_conv=1, + conv_cfg=dict(type='Conv2d'), + norm_cfg=dict(type='BN2d'), + bias='auto', + init_cfg=None, + ): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + self.out_channels = out_channels + if num_conv == 1: + conv_channels = in_channels + + conv_layers = [] + c_in = in_channels + for i in range(num_conv-1): + conv_layers.append( + ConvModule( + c_in, + conv_channels, + kernel_size=3, + stride=1, + padding=1, + bias=bias, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + ) + ) + c_in = conv_channels + # No norm and relu in last conv + conv_layers.append( + build_conv_layer( + conv_cfg, + conv_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=True + ) + ) + self.conv_layers = nn.Sequential(*conv_layers) + + if init_cfg is None: + self.init_cfg = dict(type='Kaiming', layer='Conv2d') + + def forward(self, x): + b, c_in, h_in, w_in = 
x.size() + out = self.conv_layers(x) + assert out.size() == (b, self.out_channels, h_in, w_in) # sanity check + return out + +# Decoder +class CVT_DecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, skip_dim, residual, factor, upsample, with_relu=True): + super().__init__() + + dim = out_channels // factor + + if upsample: + self.conv = nn.Sequential( + nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True), + nn.Conv2d(in_channels, dim, 3, padding=1, bias=False), + nn.BatchNorm2d(dim), + nn.ReLU(inplace=True), + nn.Conv2d(dim, out_channels, 1, padding=0, bias=False), + nn.BatchNorm2d(out_channels)) + else: + self.conv = nn.Sequential( + nn.Conv2d(in_channels, dim, 3, padding=1, bias=False), + nn.BatchNorm2d(dim), + nn.ReLU(inplace=True), + nn.Conv2d(dim, out_channels, 1, padding=0, bias=False), + nn.BatchNorm2d(out_channels)) + + if residual: + self.up = nn.Conv2d(skip_dim, out_channels, 1) + else: + self.up = None + + self.with_relu = with_relu + if self.with_relu: + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, skip): + x = self.conv(x) + + if self.up is not None: + up = self.up(skip) + up = F.interpolate(up, x.shape[-2:]) + + x = x + up + if self.with_relu: + return self.relu(x) + return x + +class CVT_Decoder(BaseModule): + def __init__(self, dim, blocks, residual=True, factor=2, upsample=True, use_checkpoint=False, init_cfg=None): + assert init_cfg is None, 'To prevent abnormal initialization ' \ + 'behavior, init_cfg is not allowed to be set' + super().__init__(init_cfg=init_cfg) + + layers = [] + channels = dim + + for i, out_channels in enumerate(blocks): + with_relu = i < len(blocks) - 1 # if not last block, with relu + layer = CVT_DecoderBlock(channels, out_channels, dim, residual, factor, upsample, with_relu=with_relu) + layers.append(layer) + + channels = out_channels + + self.layers = nn.Sequential(*layers) + self.out_channels = channels + self.use_checkpoint = use_checkpoint + + if init_cfg is None: + self.init_cfg = dict(type='Kaiming', layer='Conv2d') + + def forward(self, x): + b, t = x.size(0), x.size(1) + x = rearrange(x, 'b t c h w -> (b t) c h w') + y = x + for layer in self.layers: + if self.use_checkpoint: + y = checkpoint(layer, y, x) + else: + y = layer(y, x) + + y = rearrange(y, '(b t) c h w -> b t c h w', b=b, t=t) + return y + + +# Conv modules +class UpsamplingAdd(nn.Module): + def __init__(self, in_channels, out_channels, scale_factor=2): + super().__init__() + self.upsample_layer = nn.Sequential( + nn.Upsample(scale_factor=scale_factor, mode='bilinear', align_corners=False), + nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0, bias=False), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x, x_skip): + x = self.upsample_layer(x) + return x + x_skip + +class Interpolate(nn.Module): + def __init__(self, scale_factor: int = 2): + super().__init__() + self._interpolate = nn.functional.interpolate + self._scale_factor = scale_factor + + def forward(self, x): + return self._interpolate(x, scale_factor=self._scale_factor, mode='bilinear', align_corners=False) + +class Bottleneck(nn.Module): + """ + Defines a bottleneck module with a residual connection + """ + + def __init__( + self, + in_channels, + out_channels=None, + kernel_size=3, + dilation=1, + groups=1, + upsample=False, + downsample=False, + dropout=0.0, + ): + super().__init__() + self._downsample = downsample + bottleneck_channels = int(in_channels / 2) + out_channels = out_channels or in_channels + padding_size = ((kernel_size - 1) * 
dilation + 1) // 2 + + # Define the main conv operation + assert dilation == 1 + if upsample: + assert not downsample, 'downsample and upsample not possible simultaneously.' + bottleneck_conv = nn.ConvTranspose2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=1, + stride=2, + output_padding=padding_size, + padding=padding_size, + groups=groups, + ) + elif downsample: + bottleneck_conv = nn.Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=dilation, + stride=2, + padding=padding_size, + groups=groups, + ) + else: + bottleneck_conv = nn.Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=kernel_size, + bias=False, + dilation=dilation, + padding=padding_size, + groups=groups, + ) + + self.layers = nn.Sequential( + OrderedDict( + [ + # First projection with 1x1 kernel + ('conv_down_project', nn.Conv2d(in_channels, bottleneck_channels, kernel_size=1, bias=False)), + ('abn_down_project', nn.Sequential(nn.BatchNorm2d(bottleneck_channels), + nn.ReLU(inplace=True))), + # Second conv block + ('conv', bottleneck_conv), + ('abn', nn.Sequential(nn.BatchNorm2d(bottleneck_channels), nn.ReLU(inplace=True))), + # Final projection with 1x1 kernel + ('conv_up_project', nn.Conv2d(bottleneck_channels, out_channels, kernel_size=1, bias=False)), + ('abn_up_project', nn.Sequential(nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True))), + # Regulariser + ('dropout', nn.Dropout2d(p=dropout)), + ] + ) + ) + + if out_channels == in_channels and not downsample and not upsample: + self.projection = None + else: + projection = OrderedDict() + if upsample: + projection.update({'upsample_skip_proj': Interpolate(scale_factor=2)}) + elif downsample: + projection.update({'upsample_skip_proj': nn.MaxPool2d(kernel_size=2, stride=2)}) + projection.update( + { + 'conv_skip_proj': nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False), + 'bn_skip_proj': nn.BatchNorm2d(out_channels), + } + ) + self.projection = nn.Sequential(projection) + + def forward(self, *args): + (x,) = args + x_residual = self.layers(x) + if self.projection is not None: + if self._downsample: + # pad h/w dimensions if they are odd to prevent shape mismatch with residual layer + x = nn.functional.pad(x, (0, x.shape[-1] % 2, 0, x.shape[-2] % 2), value=0) + return x_residual + self.projection(x) + return x_residual + x diff --git a/mmcv/models/dense_heads/occ_head_plugin/utils.py b/mmcv/models/dense_heads/occ_head_plugin/utils.py new file mode 100644 index 0000000..da71cb4 --- /dev/null +++ b/mmcv/models/dense_heads/occ_head_plugin/utils.py @@ -0,0 +1,87 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np + + +def calculate_birds_eye_view_parameters(x_bounds, y_bounds, z_bounds): + """ + Parameters + ---------- + x_bounds: Forward direction in the ego-car. 
+ y_bounds: Sides + z_bounds: Height + + Returns + ------- + bev_resolution: Bird's-eye view bev_resolution + bev_start_position Bird's-eye view first element + bev_dimension Bird's-eye view tensor spatial dimension + """ + bev_resolution = torch.tensor( + [row[2] for row in [x_bounds, y_bounds, z_bounds]]) + bev_start_position = torch.tensor( + [row[0] + row[2] / 2.0 for row in [x_bounds, y_bounds, z_bounds]]) + bev_dimension = torch.tensor([(row[1] - row[0]) / row[2] + for row in [x_bounds, y_bounds, z_bounds]], dtype=torch.long) + + return bev_resolution, bev_start_position, bev_dimension + + +def gen_dx_bx(xbound, ybound, zbound): + dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) + bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) + nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) + + return dx, bx, nx + +# Instance utils +def update_instance_ids(instance_seg, old_ids, new_ids): + """ + Parameters + ---------- + instance_seg: torch.Tensor arbitrary shape + old_ids: 1D tensor containing the list of old ids, must be all present in instance_seg. + new_ids: 1D tensor with the new ids, aligned with old_ids + + Returns + new_instance_seg: torch.Tensor same shape as instance_seg with new ids + """ + indices = torch.arange(old_ids.max() + 1, device=instance_seg.device) + for old_id, new_id in zip(old_ids, new_ids): + indices[old_id] = new_id + + return indices[instance_seg].long() + + +def make_instance_seg_consecutive(instance_seg): + # Make the indices of instance_seg consecutive + unique_ids = torch.unique(instance_seg) # include background + new_ids = torch.arange(len(unique_ids), device=instance_seg.device) + instance_seg = update_instance_ids(instance_seg, unique_ids, new_ids) + return instance_seg + + +def predict_instance_segmentation_and_trajectories( + foreground_masks, + ins_sigmoid, + vehicles_id=1, + ): + if foreground_masks.dim() == 5 and foreground_masks.shape[2] == 1: + foreground_masks = foreground_masks.squeeze(2) # [b, t, h, w] + foreground_masks = foreground_masks == vehicles_id # [b, t, h, w] Only these places have foreground id + + argmax_ins = ins_sigmoid.argmax(dim=1) # long, [b, t, h, w], ins_id starts from 0 + argmax_ins = argmax_ins + 1 # [b, t, h, w], ins_id starts from 1 + instance_seg = (argmax_ins * foreground_masks.float()).long() # bg is 0, fg starts with 1 + + # Make the indices of instance_seg consecutive + instance_seg = make_instance_seg_consecutive(instance_seg).long() + + return instance_seg diff --git a/mmcv/models/dense_heads/panseg_head.py b/mmcv/models/dense_heads/panseg_head.py new file mode 100644 index 0000000..8210e59 --- /dev/null +++ b/mmcv/models/dense_heads/panseg_head.py @@ -0,0 +1,1327 @@ +#----------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. 
# +# Modified from panoptic_segformer (https://github.com/zhiqi-li/Panoptic-SegFormer)# +#--------------------------------------------------------------------------------- # + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.bricks import Linear +from mmcv.models.utils import bias_init_with_prob, constant_init +from mmcv.utils import force_fp32, auto_fp16 +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.models.builder import HEADS, build_loss +from mmcv.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmcv.core.bbox.builder import build_assigner, build_sampler +from mmcv.core.utils import multi_apply, reduce_mean +from mmcv.models.utils import build_transformer +from .seg_head_plugin import SegDETRHead, IOU + +@HEADS.register_module() +class PansegformerHead(SegDETRHead): + """ + Head of Panoptic SegFormer + + Code is modified from the `official github repo + `_. + + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + """ + + def __init__( + self, + *args, + bev_h, + bev_w, + canvas_size, + pc_range, + with_box_refine=False, + as_two_stage=False, + transformer=None, + quality_threshold_things=0.25, + quality_threshold_stuff=0.25, + overlap_threshold_things=0.4, + overlap_threshold_stuff=0.2, + thing_transformer_head=dict( + type='TransformerHead', # mask decoder for things + d_model=256, + nhead=8, + num_decoder_layers=6), + stuff_transformer_head=dict( + type='TransformerHead', # mask decoder for stuff + d_model=256, + nhead=8, + num_decoder_layers=6), + loss_mask=dict(type='DiceLoss', weight=2.0), + train_cfg=dict( + assigner=dict(type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', + weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict(type='IoUCost', + iou_mode='giou', + weight=2.0)), + sampler=dict(type='PseudoSampler'), + ), + **kwargs): + self.bev_h = bev_h + self.bev_w = bev_w + self.canvas_size = canvas_size + self.pc_range = pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + + self.with_box_refine = with_box_refine + self.as_two_stage = as_two_stage + self.quality_threshold_things = 0.1 + self.quality_threshold_stuff = quality_threshold_stuff + self.overlap_threshold_things = overlap_threshold_things + self.overlap_threshold_stuff = overlap_threshold_stuff + self.fp16_enabled = False + + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + self.num_dec_things = thing_transformer_head['num_decoder_layers'] + self.num_dec_stuff = stuff_transformer_head['num_decoder_layers'] + super(PansegformerHead, self).__init__(*args, + transformer=transformer, + train_cfg=train_cfg, + **kwargs) + if train_cfg: + sampler_cfg = train_cfg['sampler_with_mask'] + self.sampler_with_mask = build_sampler(sampler_cfg, context=self) + assigner_cfg = train_cfg['assigner_with_mask'] + self.assigner_with_mask = build_assigner(assigner_cfg) + self.assigner_filter = build_assigner( + dict( + type='HungarianAssigner_filter', + cls_cost=dict(type='FocalLossCost', weight=2.0), + reg_cost=dict(type='BBoxL1Cost', + weight=5.0, + box_format='xywh'), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0), + max_pos= + 3 # Depends on GPU memory, setting it 
to 1, model can be trained on 1080Ti + ), ) + + self.loss_mask = build_loss(loss_mask) + self.things_mask_head = build_transformer(thing_transformer_head) + self.stuff_mask_head = build_transformer(stuff_transformer_head) + self.count = 0 + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + if not self.as_two_stage: + self.bev_embedding = nn.Embedding(self.bev_h * self.bev_w, self.embed_dims) + + fc_cls = Linear(self.embed_dims, self.cls_out_channels) + fc_cls_stuff = Linear(self.embed_dims, 1) + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, 4)) + reg_branch = nn.Sequential(*reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. + num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + if not self.as_two_stage: + self.query_embedding = nn.Embedding(self.num_query, + self.embed_dims * 2) + self.stuff_query = nn.Embedding(self.num_stuff_classes, + self.embed_dims * 2) + self.reg_branches2 = _get_clones(reg_branch, self.num_dec_things) # used in mask decoder + self.cls_thing_branches = _get_clones(fc_cls, self.num_dec_things) # used in mask decoder + self.cls_stuff_branches = _get_clones(fc_cls_stuff, self.num_dec_stuff) # used in mask deocder + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m.bias, bias_init) + for m in self.cls_thing_branches: + nn.init.constant_(m.bias, bias_init) + for m in self.cls_stuff_branches: + nn.init.constant_(m.bias, bias_init) + for m in self.reg_branches: + constant_init(m[-1], 0, bias=0) + for m in self.reg_branches2: + constant_init(m[-1], 0, bias=0) + nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0) + + if self.as_two_stage: + for m in self.reg_branches: + nn.init.constant_(m[-1].bias.data[2:], 0.0) + + @force_fp32(apply_to=('bev_embed', )) + def forward(self, bev_embed): + """Forward function. + + Args: + bev_embed (tuple[Tensor]): Features from the upstream + network, each is a 4D-tensor with shape + (N, C, H, W). + img_metas (list[dict]): List of image information. + + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, h). \ + Shape [nb_dec, bs, num_query, 4]. + enc_outputs_class (Tensor): The score of each point on encode \ + feature map, has shape (N, h*w, num_class). Only when \ + as_two_stage is True it would be returned, otherwise \ + `None` would be returned. + enc_outputs_coord (Tensor): The proposal generate from the \ + encode feature map, has shape (N, h*w, 4). 
Only when \ + as_two_stage is True it would be returned, otherwise \ + `None` would be returned. + """ + _, bs, _ = bev_embed.shape + + mlvl_feats = [torch.reshape(bev_embed, (bs, self.bev_h, self.bev_w ,-1)).permute(0, 3, 1, 2)] + img_masks = mlvl_feats[0].new_zeros((bs, self.bev_h, self.bev_w)) + + hw_lvl = [feat_lvl.shape[-2:] for feat_lvl in mlvl_feats] + mlvl_masks = [] + mlvl_positional_encodings = [] + for feat in mlvl_feats: + mlvl_masks.append( + F.interpolate(img_masks[None], + size=feat.shape[-2:]).to(torch.bool).squeeze(0)) + mlvl_positional_encodings.append( + self.positional_encoding(mlvl_masks[-1])) + + query_embeds = None + if not self.as_two_stage: + query_embeds = self.query_embedding.weight + (memory, memory_pos, memory_mask, query_pos), hs, init_reference, inter_references, \ + enc_outputs_class, enc_outputs_coord = self.transformer( + mlvl_feats, + mlvl_masks, + query_embeds, + mlvl_positional_encodings, + reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501 + cls_branches=self.cls_branches if self.as_two_stage else None # noqa:E501 + ) + + memory = memory.permute(1, 0, 2) + query = hs[-1].permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + memory_pos = memory_pos.permute(1, 0, 2) + + # we should feed these to mask deocder. + args_tuple = [memory, memory_mask, memory_pos, query, None, query_pos, hw_lvl] + + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + reference = init_reference + else: + reference = inter_references[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) + + if reference.shape[-1] == 4: + tmp += reference + else: + assert reference.shape[-1] == 2 + tmp[..., :2] += reference + outputs_coord = tmp.sigmoid() + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + + outs = { + 'bev_embed': None if self.as_two_stage else bev_embed, + 'outputs_classes': outputs_classes, + 'outputs_coords': outputs_coords, + 'enc_outputs_class': enc_outputs_class if self.as_two_stage else None, + 'enc_outputs_coord': enc_outputs_coord.sigmoid() if self.as_two_stage else None, + 'args_tuple': args_tuple, + 'reference': reference, + } + + return outs + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list', + 'args_tuple', 'reference')) + def loss( + self, + all_cls_scores, + all_bbox_preds, + enc_cls_scores, + enc_bbox_preds, + args_tuple, + reference, + gt_labels_list, + gt_bboxes_list, + gt_masks_list, + img_metas=None, + gt_bboxes_ignore=None, + ): + """"Loss function. + + Args: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. 
+ args_tuple (Tuple) several args + reference (Tensor) reference from location decoder + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + img_metas[0]['img_shape'] = (self.canvas_size[0], self.canvas_size[1], 3) + + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' + + ### seprate things and stuff + gt_things_lables_list = [] + gt_things_bboxes_list = [] + gt_things_masks_list = [] + gt_stuff_labels_list = [] + gt_stuff_masks_list = [] + for i, each in enumerate(gt_labels_list): + # MDS: for coco, id<80 (Continuous id) is things. This is not true for other data sets + things_selected = each < self.num_things_classes + + stuff_selected = things_selected == False + + gt_things_lables_list.append(gt_labels_list[i][things_selected]) + gt_things_bboxes_list.append(gt_bboxes_list[i][things_selected]) + gt_things_masks_list.append(gt_masks_list[i][things_selected]) + + gt_stuff_labels_list.append(gt_labels_list[i][stuff_selected]) + gt_stuff_masks_list.append(gt_masks_list[i][stuff_selected]) + + num_dec_layers = len(all_cls_scores) + all_gt_bboxes_list = [ + gt_things_bboxes_list for _ in range(num_dec_layers - 1) + ] + all_gt_labels_list = [ + gt_things_lables_list for _ in range(num_dec_layers - 1) + ] + # all_gt_masks_list = [gt_masks_list for _ in range(num_dec_layers-1)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers - 1) + ] + img_metas_list = [img_metas for _ in range(num_dec_layers - 1)] + + # if the location decoder codntains L layers, we compute the losses of the first L-1 layers + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_single, all_cls_scores[:-1], all_bbox_preds[:-1], + all_gt_bboxes_list, all_gt_labels_list, img_metas_list, + all_gt_bboxes_ignore_list) + + losses_cls_f, losses_bbox_f, losses_iou_f, losses_masks_things_f, losses_masks_stuff_f, loss_mask_things_list_f, loss_mask_stuff_list_f, loss_iou_list_f, loss_bbox_list_f, loss_cls_list_f, loss_cls_stuff_list_f, things_ratio, stuff_ratio = self.loss_single_panoptic( + all_cls_scores[-1], all_bbox_preds[-1], args_tuple, reference, + gt_things_bboxes_list, gt_things_lables_list, gt_things_masks_list, + (gt_stuff_labels_list, gt_stuff_masks_list), img_metas, + gt_bboxes_ignore) + + loss_dict = dict() + # loss of proposal generated from encode feature map. 
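The loss terms assembled in the remainder of this function follow a common deep-supervision pattern: the last decoder layer keeps the bare key, earlier layers are stored under 'd{i}.' prefixes, and each term is scaled by a ratio. A generic standalone sketch of that bookkeeping; the function name, keys, and ratio value here are invented and not this head's exact terms.

import torch

# Generic deep-supervision bookkeeping: the final layer's loss gets the bare
# key, auxiliary layers get 'd{i}.' prefixes, and everything is scaled by a
# weight (a stand-in for the things/stuff ratios used above).
def build_loss_dict(per_layer_cls_losses, ratio=1.0):
    *aux_losses, last_loss = per_layer_cls_losses
    loss_dict = {'loss_cls': last_loss * ratio}
    for i, loss_i in enumerate(aux_losses):
        loss_dict[f'd{i}.loss_cls'] = loss_i * ratio
    return loss_dict

layer_losses = [torch.tensor(0.9), torch.tensor(0.7), torch.tensor(0.5)]
print(build_loss_dict(layer_losses, ratio=0.8))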
+ if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_things_lables_list[i]) + for i in range(len(img_metas)) + ] + enc_loss_cls, enc_losses_bbox, enc_losses_iou = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_things_bboxes_list, binary_labels_list, + img_metas, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls * things_ratio + loss_dict['enc_loss_bbox'] = enc_losses_bbox * things_ratio + loss_dict['enc_loss_iou'] = enc_losses_iou * things_ratio + # loss_dict['enc_loss_mask'] = enc_losses_mask + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls_f * things_ratio + loss_dict['loss_bbox'] = losses_bbox_f * things_ratio + loss_dict['loss_iou'] = losses_iou_f * things_ratio + loss_dict['loss_mask_things'] = losses_masks_things_f * things_ratio + loss_dict['loss_mask_stuff'] = losses_masks_stuff_f * stuff_ratio + # loss from other decoder layers + num_dec_layer = 0 + for i in range(len(loss_mask_things_list_f)): + loss_dict[f'd{i}.loss_mask_things_f'] = loss_mask_things_list_f[ + i] * things_ratio + loss_dict[f'd{i}.loss_iou_f'] = loss_iou_list_f[i] * things_ratio + loss_dict[f'd{i}.loss_bbox_f'] = loss_bbox_list_f[i] * things_ratio + loss_dict[f'd{i}.loss_cls_f'] = loss_cls_list_f[i] * things_ratio + for i in range(len(loss_mask_stuff_list_f)): + loss_dict[f'd{i}.loss_mask_stuff_f'] = loss_mask_stuff_list_f[ + i] * stuff_ratio + loss_dict[f'd{i}.loss_cls_stuff_f'] = loss_cls_stuff_list_f[ + i] * stuff_ratio + for loss_cls_i, loss_bbox_i, loss_iou_i in zip( + losses_cls, + losses_bbox, + losses_iou, + ): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i * things_ratio + loss_dict[ + f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i * things_ratio + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i * things_ratio + + num_dec_layer += 1 + # print(loss_dict) + return loss_dict + + def filter_query(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore_list=None): + ''' + This function aims to using the cost from the location decoder to filter out low-quality queries. + ''' + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' 
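The per-image work in this function is dispatched through multi_apply, and the per-image result tuples are then transposed into lists. A standalone sketch of that map-and-transpose pattern under toy inputs; the helper and the per-image function below are invented for illustration and are not the mmcv implementation itself.

from functools import partial

# Map a per-image function over per-image argument lists, then transpose the
# per-image result tuples into tuples of per-image lists.
def map_and_transpose(func, *arg_lists, **kwargs):
    results = map(partial(func, **kwargs), *arg_lists)
    return tuple(map(list, zip(*results)))

# Toy per-image function: returns a keep-mask and the number of kept entries.
def per_image(scores, _boxes, thresh=0.5):
    keep = [s > thresh for s in scores]
    return keep, sum(keep)

keeps, num_pos = map_and_transpose(
    per_image,
    [[0.9, 0.2], [0.4, 0.8, 0.7]],   # per-image score lists
    [[None] * 2, [None] * 3],        # per-image placeholder "boxes"
)
print(keeps)    # [[True, False], [False, True, True]]
print(num_pos)  # [1, 2]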
+ num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (pos_inds_mask_list, neg_inds_mask_list, labels_list, + label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._filter_query_single, cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + + return pos_inds_mask_list, neg_inds_mask_list, labels_list, label_weights_list, bbox_targets_list, \ + bbox_weights_list, num_total_pos, num_total_neg, pos_inds_list, neg_inds_list + + def _filter_query_single(self, + cls_score, + bbox_pred, + gt_bboxes, + gt_labels, + img_meta, + gt_bboxes_ignore=None): + num_bboxes = bbox_pred.size(0) + pos_ind_mask, neg_ind_mask, assign_result = self.assigner_filter.assign( + bbox_pred, cls_score, gt_bboxes, gt_labels, img_meta, + gt_bboxes_ignore) + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_things_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + img_h, img_w, _ = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + + return (pos_ind_mask, neg_ind_mask, labels, label_weights, + bbox_targets, bbox_weights, pos_inds, neg_inds) + + def get_targets_with_mask(self, + cls_scores_list, + bbox_preds_list, + masks_preds_list_thing, + gt_bboxes_list, + gt_labels_list, + gt_masks_list, + img_metas, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + masks_preds_list_thing (list[Tensor]): + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' 
+ num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + mask_targets_list, mask_weights_list, pos_inds_list, + neg_inds_list) = multi_apply(self._get_target_single_with_mask, + cls_scores_list, bbox_preds_list, + masks_preds_list_thing, gt_bboxes_list, + gt_labels_list, gt_masks_list, img_metas, + gt_bboxes_ignore_list) + num_total_pos_thing = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg_thing = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, mask_targets_list, mask_weights_list, + num_total_pos_thing, num_total_neg_thing, pos_inds_list) + + def _get_target_single_with_mask(self, + cls_score, + bbox_pred, + masks_preds_things, + gt_bboxes, + gt_labels, + gt_masks, + img_meta, + gt_bboxes_ignore=None): + """ + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + + gt_masks = gt_masks.float() + + assign_result = self.assigner_with_mask.assign(bbox_pred, cls_score, + masks_preds_things, + gt_bboxes, gt_labels, + gt_masks, img_meta, + gt_bboxes_ignore) + sampling_result = self.sampler_with_mask.sample( + assign_result, bbox_pred, gt_bboxes, gt_masks) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_things_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + img_h, img_w, _ = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
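+ # Illustrative numbers only (not from any dataset): a GT box
+ # (x1, y1, x2, y2) = (100, 50, 300, 150) on an 800x600 image is divided by
+ # (800, 600, 800, 600) and converted to cxcywh, giving roughly
+ # (0.25, 0.167, 0.25, 0.167) as the regression target.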
+ factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + + mask_weights = masks_preds_things.new_zeros(num_bboxes) + mask_weights[pos_inds] = 1.0 + pos_gt_masks = sampling_result.pos_gt_masks + _, w, h = pos_gt_masks.shape + mask_target = masks_preds_things.new_zeros([num_bboxes, w, h]) + mask_target[pos_inds] = pos_gt_masks + + return (labels, label_weights, bbox_targets, bbox_weights, mask_target, + mask_weights, pos_inds, neg_inds) + + def get_filter_results_and_loss(self, cls_scores, bbox_preds, + cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, img_metas, + gt_bboxes_ignore_list): + + + pos_inds_mask_list, neg_inds_mask_list, labels_list, label_weights_list, bbox_targets_list, \ + bbox_weights_list, num_total_pos_thing, num_total_neg_thing, pos_inds_list, neg_inds_list = self.filter_query( + cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + img_metas, gt_bboxes_ignore_list) + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos_thing * 1.0 + \ + num_total_neg_thing * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + + loss_cls = self.loss_cls(cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + + num_total_pos_thing = loss_cls.new_tensor([num_total_pos_thing]) + num_total_pos_thing = torch.clamp(reduce_mean(num_total_pos_thing), + min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(img_metas, bbox_preds): + img_h, img_w, _ = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou(bboxes, + bboxes_gt, + bbox_weights, + avg_factor=num_total_pos_thing) + + # regression L1 loss + loss_bbox = self.loss_bbox(bbox_preds, + bbox_targets, + bbox_weights, + avg_factor=num_total_pos_thing) + return loss_cls, loss_iou, loss_bbox,\ + pos_inds_mask_list, num_total_pos_thing + + def loss_single_panoptic(self, + cls_scores, + bbox_preds, + args_tuple, + reference, + gt_bboxes_list, + gt_labels_list, + gt_masks_list, + gt_panoptic_list, + img_metas, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. 
+ bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + args_tuple: + reference: + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. + """ + num_imgs = cls_scores.size(0) + gt_stuff_labels_list, gt_stuff_masks_list = gt_panoptic_list + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + loss_cls, loss_iou, loss_bbox, pos_inds_mask_list, num_total_pos_thing = self.get_filter_results_and_loss( + cls_scores, bbox_preds, cls_scores_list, bbox_preds_list, gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list) + + memory, memory_mask, memory_pos, query, _, query_pos, hw_lvl = args_tuple + + BS, _, dim_query = query.shape[0], query.shape[1], query.shape[-1] + + len_query = max([len(pos_ind) for pos_ind in pos_inds_mask_list]) + thing_query = torch.zeros([BS, len_query, dim_query], + device=query.device) + + stuff_query, stuff_query_pos = torch.split(self.stuff_query.weight, + self.embed_dims, + dim=1) + stuff_query_pos = stuff_query_pos.unsqueeze(0).expand(BS, -1, -1) + stuff_query = stuff_query.unsqueeze(0).expand(BS, -1, -1) + + for i in range(BS): + thing_query[i, :len(pos_inds_mask_list[i])] = query[ + i, pos_inds_mask_list[i]] + + mask_preds_things = [] + mask_preds_stuff = [] + # mask_preds_inter = [[],[],[]] + mask_preds_inter_things = [[] for _ in range(self.num_dec_things)] + mask_preds_inter_stuff = [[] for _ in range(self.num_dec_stuff)] + cls_thing_preds = [[] for _ in range(self.num_dec_things)] + cls_stuff_preds = [[] for _ in range(self.num_dec_stuff)] + BS, NQ, L = bbox_preds.shape + new_bbox_preds = [ + torch.zeros([BS, len_query, L]).to(bbox_preds.device) + for _ in range(self.num_dec_things) + ] + + mask_things, mask_inter_things, query_inter_things = self.things_mask_head( + memory, memory_mask, None, thing_query, None, None, hw_lvl=hw_lvl) + + mask_stuff, mask_inter_stuff, query_inter_stuff = self.stuff_mask_head( + memory, + memory_mask, + None, + stuff_query, + None, + stuff_query_pos, + hw_lvl=hw_lvl) + + mask_things = mask_things.squeeze(-1) + mask_inter_things = torch.stack(mask_inter_things, 0).squeeze(-1) + + mask_stuff = mask_stuff.squeeze(-1) + mask_inter_stuff = torch.stack(mask_inter_stuff, 0).squeeze(-1) + + for i in range(BS): + tmp_i = mask_things[i][:len(pos_inds_mask_list[i])].reshape( + -1, *hw_lvl[0]) + mask_preds_things.append(tmp_i) + pos_ind = pos_inds_mask_list[i] + reference_i = reference[i:i + 1, pos_ind, :] + + for j in range(self.num_dec_things): + tmp_i_j = mask_inter_things[j][i][:len(pos_inds_mask_list[i] + )].reshape( + -1, *hw_lvl[0]) + mask_preds_inter_things[j].append(tmp_i_j) + + # mask_preds_inter_things[j].append(mask_inter_things[j].reshape(-1, *hw_lvl[0])) + query_things = query_inter_things[j] + t1, t2, t3 = query_things.shape + tmp = self.reg_branches2[j](query_things.reshape(t1 * t2, t3)).reshape(t1, t2, 4) + if len(pos_ind) == 0: + tmp = tmp.sum( + ) + reference_i # for reply bug of pytorch 
broadcast + elif reference_i.shape[-1] == 4: + tmp += reference_i + else: + assert reference_i.shape[-1] == 2 + tmp[..., :2] += reference_i + + outputs_coord = tmp.sigmoid() + + new_bbox_preds[j][i][:len(pos_inds_mask_list[i])] = outputs_coord + cls_thing_preds[j].append(self.cls_thing_branches[j]( + query_things.reshape(t1 * t2, t3))) + + # stuff + tmp_i = mask_stuff[i].reshape(-1, *hw_lvl[0]) + mask_preds_stuff.append(tmp_i) + for j in range(self.num_dec_stuff): + tmp_i_j = mask_inter_stuff[j][i].reshape(-1, *hw_lvl[0]) + mask_preds_inter_stuff[j].append(tmp_i_j) + + query_stuff = query_inter_stuff[j] + s1, s2, s3 = query_stuff.shape + cls_stuff_preds[j].append(self.cls_stuff_branches[j]( + query_stuff.reshape(s1 * s2, s3))) + + masks_preds_list_thing = [ + mask_preds_things[i] for i in range(num_imgs) + ] + mask_preds_things = torch.cat(mask_preds_things, 0) + mask_preds_inter_things = [ + torch.cat(each, 0) for each in mask_preds_inter_things + ] + cls_thing_preds = [torch.cat(each, 0) for each in cls_thing_preds] + cls_stuff_preds = [torch.cat(each, 0) for each in cls_stuff_preds] + mask_preds_stuff = torch.cat(mask_preds_stuff, 0) + mask_preds_inter_stuff = [ + torch.cat(each, 0) for each in mask_preds_inter_stuff + ] + cls_scores_list = [ + cls_scores_list[i][pos_inds_mask_list[i]] for i in range(num_imgs) + ] + + bbox_preds_list = [ + bbox_preds_list[i][pos_inds_mask_list[i]] for i in range(num_imgs) + ] + + gt_targets = self.get_targets_with_mask(cls_scores_list, + bbox_preds_list, + masks_preds_list_thing, + gt_bboxes_list, gt_labels_list, + gt_masks_list, img_metas, + gt_bboxes_ignore_list) + + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + mask_targets_list, mask_weights_list, _, _, + pos_inds_list) = gt_targets + + thing_labels = torch.cat(labels_list, 0) + things_weights = torch.cat(label_weights_list, 0) + + bboxes_taget = torch.cat(bbox_targets_list) + bboxes_weights = torch.cat(bbox_weights_list) + + factors = [] + for img_meta, bbox_pred in zip(img_metas, bbox_preds_list): + img_h, img_w, _ = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + bboxes_gt = bbox_cxcywh_to_xyxy(bboxes_taget) * factors + + mask_things_gt = torch.cat(mask_targets_list, 0).to(torch.float) + + mask_weight_things = torch.cat(mask_weights_list, + 0).to(thing_labels.device) + + mask_stuff_gt = [] + mask_weight_stuff = [] + stuff_labels = [] + num_total_pos_stuff = 0 + for i in range(BS): + num_total_pos_stuff += len(gt_stuff_labels_list[i]) ## all stuff + + select_stuff_index = gt_stuff_labels_list[ + i] - self.num_things_classes + mask_weight_i_stuff = torch.zeros([self.num_stuff_classes]) + mask_weight_i_stuff[select_stuff_index] = 1 + stuff_masks = torch.zeros( + (self.num_stuff_classes, *mask_targets_list[i].shape[-2:]), + device=mask_targets_list[i].device).to(torch.bool) + stuff_masks[select_stuff_index] = gt_stuff_masks_list[i].to( + torch.bool) + mask_stuff_gt.append(stuff_masks) + select_stuff_index = torch.cat([ + select_stuff_index, + torch.tensor([self.num_stuff_classes], + device=select_stuff_index.device) + ]) + + stuff_labels.append(1 - mask_weight_i_stuff) + mask_weight_stuff.append(mask_weight_i_stuff) + + mask_weight_stuff = torch.cat(mask_weight_stuff, + 0).to(thing_labels.device) + stuff_labels = torch.cat(stuff_labels, 0).to(thing_labels.device) + mask_stuff_gt = torch.cat(mask_stuff_gt, 0).to(torch.float) + + 
num_total_pos_stuff = loss_cls.new_tensor([num_total_pos_stuff]) + num_total_pos_stuff = torch.clamp(reduce_mean(num_total_pos_stuff), + min=1).item() + if mask_preds_things.shape[0] == 0: + loss_mask_things = (0 * mask_preds_things).sum() + else: + mask_preds = F.interpolate(mask_preds_things.unsqueeze(0), + scale_factor=2.0, + mode='bilinear').squeeze(0) + mask_targets_things = F.interpolate(mask_things_gt.unsqueeze(0), + size=mask_preds.shape[-2:], + mode='bilinear').squeeze(0) + loss_mask_things = self.loss_mask(mask_preds, + mask_targets_things, + mask_weight_things, + avg_factor=num_total_pos_thing) + if mask_preds_stuff.shape[0] == 0: + loss_mask_stuff = (0 * mask_preds_stuff).sum() + else: + mask_preds = F.interpolate(mask_preds_stuff.unsqueeze(0), + scale_factor=2.0, + mode='bilinear').squeeze(0) + mask_targets_stuff = F.interpolate(mask_stuff_gt.unsqueeze(0), + size=mask_preds.shape[-2:], + mode='bilinear').squeeze(0) + + loss_mask_stuff = self.loss_mask(mask_preds, + mask_targets_stuff, + mask_weight_stuff, + avg_factor=num_total_pos_stuff) + + loss_mask_things_list = [] + loss_mask_stuff_list = [] + loss_iou_list = [] + loss_bbox_list = [] + for j in range(len(mask_preds_inter_things)): + mask_preds_this_level = mask_preds_inter_things[j] + if mask_preds_this_level.shape[0] == 0: + loss_mask_j = (0 * mask_preds_this_level).sum() + else: + mask_preds_this_level = F.interpolate( + mask_preds_this_level.unsqueeze(0), + scale_factor=2.0, + mode='bilinear').squeeze(0) + loss_mask_j = self.loss_mask(mask_preds_this_level, + mask_targets_things, + mask_weight_things, + avg_factor=num_total_pos_thing) + loss_mask_things_list.append(loss_mask_j) + bbox_preds_this_level = new_bbox_preds[j].reshape(-1, 4) + bboxes_this_level = bbox_cxcywh_to_xyxy( + bbox_preds_this_level) * factors + # We let this loss be 0. We didn't predict bbox in our mask decoder. Predicting bbox in the mask decoder is basically useless + loss_iou_j = self.loss_iou(bboxes_this_level, + bboxes_gt, + bboxes_weights, + avg_factor=num_total_pos_thing) * 0 + if bboxes_taget.shape[0] != 0: + loss_bbox_j = self.loss_bbox( + bbox_preds_this_level, + bboxes_taget, + bboxes_weights, + avg_factor=num_total_pos_thing) * 0 + else: + loss_bbox_j = bbox_preds_this_level.sum() * 0 + loss_iou_list.append(loss_iou_j) + loss_bbox_list.append(loss_bbox_j) + for j in range(len(mask_preds_inter_stuff)): + mask_preds_this_level = mask_preds_inter_stuff[j] + if mask_preds_this_level.shape[0] == 0: + loss_mask_j = (0 * mask_preds_this_level).sum() + else: + mask_preds_this_level = F.interpolate( + mask_preds_this_level.unsqueeze(0), + scale_factor=2.0, + mode='bilinear').squeeze(0) + loss_mask_j = self.loss_mask(mask_preds_this_level, + mask_targets_stuff, + mask_weight_stuff, + avg_factor=num_total_pos_stuff) + loss_mask_stuff_list.append(loss_mask_j) + + loss_cls_thing_list = [] + loss_cls_stuff_list = [] + thing_labels = thing_labels.reshape(-1) + for j in range(len(mask_preds_inter_things)): + # We let this loss be 0. When using "query-filter", only partial thing queries are feed to the mask decoder. This will cause imbalance when supervising these queries. 
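+ # The `* 2 * 0` below therefore zeroes both the loss value and its gradient;
+ # presumably the term is kept only so these classification branches stay in
+ # the autograd graph (e.g. for DDP's unused-parameter check).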
+ cls_scores = cls_thing_preds[j] + + if cls_scores.shape[0] == 0: + loss_cls_thing_j = cls_scores.sum() * 0 + else: + loss_cls_thing_j = self.loss_cls( + cls_scores, + thing_labels, + things_weights, + avg_factor=num_total_pos_thing) * 2 * 0 + loss_cls_thing_list.append(loss_cls_thing_j) + + for j in range(len(mask_preds_inter_stuff)): + cls_scores = cls_stuff_preds[j] + if cls_scores.shape[0] == 0: + loss_cls_stuff_j = cls_stuff_preds[j].sum() * 0 + else: + loss_cls_stuff_j = self.loss_cls( + cls_stuff_preds[j], + stuff_labels.to(torch.long), + avg_factor=num_total_pos_stuff) * 2 + loss_cls_stuff_list.append(loss_cls_stuff_j) + + ## dynamic adjusting the weights + things_ratio, stuff_ratio = num_total_pos_thing / ( + num_total_pos_stuff + num_total_pos_thing), num_total_pos_stuff / ( + num_total_pos_stuff + num_total_pos_thing) + + return loss_cls, loss_bbox, loss_iou, loss_mask_things, loss_mask_stuff, loss_mask_things_list, loss_mask_stuff_list, loss_iou_list, loss_bbox_list, loss_cls_thing_list, loss_cls_stuff_list, things_ratio, stuff_ratio + + def forward_test(self, + pts_feats=None, + gt_lane_labels=None, + gt_lane_masks=None, + img_metas=None, + rescale=False): + bbox_list = [dict() for i in range(len(img_metas))] + + pred_seg_dict = self(pts_feats) + results = self.get_bboxes(pred_seg_dict['outputs_classes'], + pred_seg_dict['outputs_coords'], + pred_seg_dict['enc_outputs_class'], + pred_seg_dict['enc_outputs_coord'], + pred_seg_dict['args_tuple'], + pred_seg_dict['reference'], + img_metas, + rescale=rescale) + + if gt_lane_labels is None or gt_lane_masks is None: + for result_dict, pts_bbox in zip(bbox_list, results): + result_dict['pts_bbox'] = pts_bbox + result_dict['args_tuple'] = pred_seg_dict['args_tuple'] + return bbox_list + + with torch.no_grad(): + drivable_pred = results[0]['drivable'] + drivable_gt = gt_lane_masks[0][0, -1] + drivable_iou, drivable_intersection, drivable_union = IOU(drivable_pred.view(1, -1), drivable_gt.view(1, -1)) + + lane_pred = results[0]['lane'] + lanes_pred = (results[0]['lane'].sum(0) > 0).int() + lanes_gt = (gt_lane_masks[0][0][:-1].sum(0) > 0).int() + lanes_iou, lanes_intersection, lanes_union = IOU(lanes_pred.view(1, -1), lanes_gt.view(1, -1)) + + divider_gt = (gt_lane_masks[0][0][gt_lane_labels[0][0] == 0].sum(0) > 0).int() + crossing_gt = (gt_lane_masks[0][0][gt_lane_labels[0][0] == 1].sum(0) > 0).int() + contour_gt = (gt_lane_masks[0][0][gt_lane_labels[0][0] == 2].sum(0) > 0).int() + divider_iou, divider_intersection, divider_union = IOU(lane_pred[0].view(1, -1), divider_gt.view(1, -1)) + crossing_iou, crossing_intersection, crossing_union = IOU(lane_pred[1].view(1, -1), crossing_gt.view(1, -1)) + contour_iou, contour_intersection, contour_union = IOU(lane_pred[2].view(1, -1), contour_gt.view(1, -1)) + + + ret_iou = {'drivable_intersection': drivable_intersection, + 'drivable_union': drivable_union, + 'lanes_intersection': lanes_intersection, + 'lanes_union': lanes_union, + 'divider_intersection': divider_intersection, + 'divider_union': divider_union, + 'crossing_intersection': crossing_intersection, + 'crossing_union': crossing_union, + 'contour_intersection': contour_intersection, + 'contour_union': contour_union, + 'drivable_iou': drivable_iou, + 'lanes_iou': lanes_iou, + 'divider_iou': divider_iou, + 'crossing_iou': crossing_iou, + 'contour_iou': contour_iou} + for result_dict, pts_bbox in zip(bbox_list, results): + result_dict['pts_bbox'] = pts_bbox + result_dict['ret_iou'] = ret_iou + result_dict['args_tuple'] = 
pred_seg_dict['args_tuple'] + return bbox_list + + + @auto_fp16(apply_to=("bev_feat", "prev_bev")) + def forward_train(self, + bev_feat=None, + img_metas=None, + gt_lane_labels=None, + gt_lane_bboxes=None, + gt_lane_masks=None, + ): + """ + Forward pass of the segmentation model during training. + + Args: + bev_feat (torch.Tensor): Bird's eye view feature maps. Shape [batch_size, channels, height, width]. + img_metas (list[dict]): List of image meta information dictionaries. + gt_lane_labels (list[torch.Tensor]): Ground-truth lane class labels. Shape [batch_size, num_lanes, max_lanes]. + gt_lane_bboxes (list[torch.Tensor]): Ground-truth lane bounding boxes. Shape [batch_size, num_lanes, 4]. + gt_lane_masks (list[torch.Tensor]): Ground-truth lane masks. Shape [batch_size, num_lanes, height, width]. + prev_bev (torch.Tensor): Previous bird's eye view feature map. Shape [batch_size, channels, height, width]. + + Returns: + tuple: + - losses_seg (torch.Tensor): Total segmentation loss. + - pred_seg_dict (dict): Dictionary of predicted segmentation outputs. + """ + pred_seg_dict = self(bev_feat) + loss_inputs = [ + pred_seg_dict['outputs_classes'], + pred_seg_dict['outputs_coords'], + pred_seg_dict['enc_outputs_class'], + pred_seg_dict['enc_outputs_coord'], + pred_seg_dict['args_tuple'], + pred_seg_dict['reference'], + gt_lane_labels, + gt_lane_bboxes, + gt_lane_masks + ] + losses_seg = self.loss(*loss_inputs, img_metas=img_metas) + return losses_seg, pred_seg_dict + + def _get_bboxes_single(self, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False): + """ + """ + assert len(cls_score) == len(bbox_pred) + max_per_img = self.test_cfg.get('max_per_img', self.num_query) + + # exclude background + if self.loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + scores, indexes = cls_score.view(-1).topk(max_per_img) + det_labels = indexes % self.num_things_classes + bbox_index = indexes // self.num_things_classes + bbox_pred = bbox_pred[bbox_index] + else: + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + scores, bbox_index = scores.topk(max_per_img) + bbox_pred = bbox_pred[bbox_index] + det_labels = det_labels[bbox_index] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + det_bboxes /= det_bboxes.new_tensor(scale_factor) + det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1) + + return bbox_index, det_bboxes, det_labels + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list', + 'args_tuple')) + def get_bboxes( + self, + all_cls_scores, + all_bbox_preds, + enc_cls_scores, + enc_bbox_preds, + args_tuple, + reference, + img_metas, + rescale=False, + ): + """ + """ + cls_scores = all_cls_scores[-1] + bbox_preds = all_bbox_preds[-1] + memory, memory_mask, memory_pos, query, _, query_pos, hw_lvl = args_tuple + + seg_list = [] + stuff_score_list = [] + panoptic_list = [] + bbox_list = [] + labels_list = [] + drivable_list = [] + lane_list = [] + lane_score_list = [] + score_list = [] + for img_id in range(len(img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + # img_shape = img_metas[img_id]['img_shape'] + # ori_shape = img_metas[img_id]['ori_shape'] + # scale_factor = img_metas[img_id]['scale_factor'] + img_shape = (self.canvas_size[0], self.canvas_size[1], 3) + ori_shape = 
(self.canvas_size[0], self.canvas_size[1], 3) + scale_factor = 1 + + index, bbox, labels = self._get_bboxes_single( + cls_score, bbox_pred, img_shape, scale_factor, rescale) + + i = img_id + thing_query = query[i:i + 1, index, :] + thing_query_pos = query_pos[i:i + 1, index, :] + joint_query = torch.cat([ + thing_query, self.stuff_query.weight[None, :, :self.embed_dims] + ], 1) + + stuff_query_pos = self.stuff_query.weight[None, :, + self.embed_dims:] + + if self.num_stuff_classes>0: + joint_query = joint_query[:, :-self.num_stuff_classes] + + #import pdb;pdb.set_trace() + + mask_things, mask_inter_things, query_inter_things = self.things_mask_head( + memory[i:i + 1], + memory_mask[i:i + 1], + None, + joint_query, + None, + None, + hw_lvl=hw_lvl) + # mask_stuff, mask_inter_stuff, query_inter_stuff = self.stuff_mask_head( + # memory[i:i + 1], + # memory_mask[i:i + 1], + # None, + # joint_query, + # None, + # stuff_query_pos, + # hw_lvl=hw_lvl) + + #attn_map = torch.cat([mask_things, mask_stuff], 1) + attn_map = mask_things + + attn_map = attn_map.squeeze(-1) # BS, NQ, N_head,LEN + + # stuff_query = query_inter_stuff[-1] + # scores_stuff = self.cls_stuff_branches[-1]( + # stuff_query).sigmoid().reshape(-1) + + mask_pred = attn_map.reshape(-1, *hw_lvl[0]) + + mask_pred = F.interpolate(mask_pred.unsqueeze(0), + size=ori_shape[:2], + mode='bilinear').squeeze(0) + + masks_all = mask_pred + score_list.append(masks_all) + drivable_list.append(masks_all[-1] > 0.5) + if self.num_stuff_classes>0: + masks_all = masks_all[:-self.num_stuff_classes] + seg_all = masks_all > 0.5 + sum_seg_all = seg_all.sum((1, 2)).float() + 1 + # scores_all = torch.cat([bbox[:, -1], scores_stuff], 0) + # bboxes_all = torch.cat([bbox, torch.zeros([self.num_stuff_classes, 5], device=labels.device)], 0) + # labels_all = torch.cat([labels, torch.arange(self.num_things_classes, self.num_things_classes+self.num_stuff_classes).to(labels.device)], 0) + scores_all = bbox[:, -1] + bboxes_all = bbox + labels_all = labels + + ## mask wise merging + seg_scores = (masks_all * seg_all.float()).sum( + (1, 2)) / sum_seg_all + scores_all *= (seg_scores**2) + + scores_all, index = torch.sort(scores_all, descending=True) + + masks_all = masks_all[index] + labels_all = labels_all[index] + bboxes_all = bboxes_all[index] + seg_all = seg_all[index] + + bboxes_all[:, -1] = scores_all + + # MDS: select things for instance segmeantion + things_selected = labels_all < self.num_things_classes + stuff_selected = labels_all >= self.num_things_classes + bbox_th = bboxes_all[things_selected][:100] + labels_th = labels_all[things_selected][:100] + seg_th = seg_all[things_selected][:100] + labels_st = labels_all[stuff_selected] + scores_st = scores_all[stuff_selected] + masks_st = masks_all[stuff_selected] + + stuff_score_list.append(scores_st) + + results = torch.zeros((2, *mask_pred.shape[-2:]), + device=mask_pred.device).to(torch.long) + id_unique = 1 + lane = torch.zeros((self.num_things_classes, *mask_pred.shape[-2:]), device=mask_pred.device).to(torch.long) + lane_score = torch.zeros((self.num_things_classes, *mask_pred.shape[-2:]), device=mask_pred.device).to(mask_pred.dtype) + for i, scores in enumerate(scores_all): + # MDS: things and sutff have different threholds may perform a little bit better + if labels_all[i] < self.num_things_classes and scores < self.quality_threshold_things: + continue + elif labels_all[i] >= self.num_things_classes and scores < self.quality_threshold_stuff: + continue + _mask = masks_all[i] > 0.5 + mask_area = 
_mask.sum().item() + intersect = _mask & (results[0] > 0) + intersect_area = intersect.sum().item() + if labels_all[i] < self.num_things_classes: + if mask_area == 0 or (intersect_area * 1.0 / mask_area + ) > self.overlap_threshold_things: + continue + else: + if mask_area == 0 or (intersect_area * 1.0 / mask_area + ) > self.overlap_threshold_stuff: + continue + if intersect_area > 0: + _mask = _mask & (results[0] == 0) + results[0, _mask] = labels_all[i] + if labels_all[i] < self.num_things_classes: + lane[labels_all[i], _mask] = 1 + lane_score[labels_all[i], _mask] = masks_all[i][_mask] + results[1, _mask] = id_unique + id_unique += 1 + + # file_name = img_metas[img_id]['pts_filename'].split('/')[-1].split('.')[0] + # panoptic_list.append( + # (results.permute(1, 2, 0).cpu().numpy(), file_name, ori_shape)) + panoptic_list.append((results.permute(1, 2, 0).cpu().numpy(),ori_shape)) + + + bbox_list.append(bbox_th) + labels_list.append(labels_th) + seg_list.append(seg_th) + lane_list.append(lane) + lane_score_list.append(lane_score) + results = [] + for i in range(len(img_metas)): + results.append({ + 'bbox': bbox_list[i], + 'segm': seg_list[i], + 'labels': labels_list[i], + 'panoptic': panoptic_list[i], + 'drivable': drivable_list[i], + 'score_list': score_list[i], + 'lane': lane_list[i], + 'lane_score': lane_score_list[i], + 'stuff_score_list' : stuff_score_list[i], + }) + return results diff --git a/mmcv/models/dense_heads/planning_head.py b/mmcv/models/dense_heads/planning_head.py new file mode 100644 index 0000000..6df7afc --- /dev/null +++ b/mmcv/models/dense_heads/planning_head.py @@ -0,0 +1,251 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +from mmcv.models.builder import HEADS, build_loss +from einops import rearrange +from mmcv.models.utils.functional import bivariate_gaussian_activation +from .planning_head_plugin import CollisionNonlinearOptimizer +import numpy as np +import copy + +@HEADS.register_module() +class PlanningHeadSingleMode(nn.Module): + def __init__(self, + bev_h=200, + bev_w=200, + embed_dims=256, + planning_steps=6, + command_dim=3, + loss_planning=None, + loss_collision=None, + planning_eval=False, + use_col_optim=False, + col_optim_args=dict( + occ_filter_range=5.0, + sigma=1.0, + alpha_collision=5.0, + ), + with_adapter=False, + ): + """ + Single Mode Planning Head for Autonomous Driving. + + Args: + embed_dims (int): Embedding dimensions. Default: 256. + planning_steps (int): Number of steps for motion planning. Default: 6. + loss_planning (dict): Configuration for planning loss. Default: None. + loss_collision (dict): Configuration for collision loss. Default: None. + planning_eval (bool): Whether to use planning for evaluation. Default: False. + use_col_optim (bool): Whether to use collision optimization. Default: False. + col_optim_args (dict): Collision optimization arguments. Default: dict(occ_filter_range=5.0, sigma=1.0, alpha_collision=5.0). 
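+ bev_h (int): Height of the BEV feature grid. Default: 200.
+ bev_w (int): Width of the BEV feature grid. Default: 200.
+ command_dim (int): Number of discrete driving commands embedded by `navi_embed`. Default: 3.
+ with_adapter (bool): Whether to pass the BEV feature through a small convolutional adapter before the attention module. Default: False.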
+ """ + super(PlanningHeadSingleMode, self).__init__() + + # Nuscenes + self.bev_h = bev_h + self.bev_w = bev_w + self.navi_embed = nn.Embedding(command_dim, embed_dims) + self.reg_branch = nn.Sequential( + nn.Linear(embed_dims, embed_dims), + nn.ReLU(), + nn.Linear(embed_dims, planning_steps * 2), + ) + self.loss_planning = build_loss(loss_planning) + self.planning_steps = planning_steps + self.planning_eval = planning_eval + + #### planning head + fuser_dim = 3 + attn_module_layer = nn.TransformerDecoderLayer(embed_dims, 8, dim_feedforward=embed_dims*2, dropout=0.1, batch_first=False) + self.attn_module = nn.TransformerDecoder(attn_module_layer, 3) + + self.mlp_fuser = nn.Sequential( + nn.Linear(embed_dims*fuser_dim, embed_dims), + nn.LayerNorm(embed_dims), + nn.ReLU(inplace=True), + ) + + self.pos_embed = nn.Embedding(1, embed_dims) + self.loss_collision = [] + for cfg in loss_collision: + self.loss_collision.append(build_loss(cfg)) + self.loss_collision = nn.ModuleList(self.loss_collision) + + self.use_col_optim = use_col_optim + self.occ_filter_range = col_optim_args['occ_filter_range'] + self.sigma = col_optim_args['sigma'] + self.alpha_collision = col_optim_args['alpha_collision'] + + # TODO: reimplement it with down-scaled feature_map + self.with_adapter = with_adapter + if with_adapter: + bev_adapter_block = nn.Sequential( + nn.Conv2d(embed_dims, embed_dims // 2, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2d(embed_dims // 2, embed_dims, kernel_size=1), + ) + N_Blocks = 3 + bev_adapter = [copy.deepcopy(bev_adapter_block) for _ in range(N_Blocks)] + self.bev_adapter = nn.Sequential(*bev_adapter) + + def forward_train(self, + bev_embed, + outs_motion={}, + sdc_planning=None, + sdc_planning_mask=None, + command=None, + gt_future_boxes=None, + ): + """ + Perform forward planning training with the given inputs. + Args: + bev_embed (torch.Tensor): The input bird's eye view feature map. + outs_motion (dict): A dictionary containing the motion outputs. + outs_occflow (dict): A dictionary containing the occupancy flow outputs. + sdc_planning (torch.Tensor, optional): The self-driving car's planned trajectory. + sdc_planning_mask (torch.Tensor, optional): The mask for the self-driving car's planning. + command (torch.Tensor, optional): The driving command issued to the self-driving car. + gt_future_boxes (torch.Tensor, optional): The ground truth future bounding boxes. + img_metas (list[dict], optional): A list of metadata information about the input images. + + Returns: + ret_dict (dict): A dictionary containing the losses and planning outputs. 
+ """ + sdc_traj_query = outs_motion['sdc_traj_query'] + sdc_track_query = outs_motion['sdc_track_query'] + bev_pos = outs_motion['bev_pos'] + + occ_mask = None + + outs_planning = self(bev_embed, occ_mask, bev_pos, sdc_traj_query, sdc_track_query, command) + loss_inputs = [sdc_planning, sdc_planning_mask, outs_planning, gt_future_boxes] + losses = self.loss(*loss_inputs) + ret_dict = dict(losses=losses, outs_motion=outs_planning) + return ret_dict + + def forward_test(self, bev_embed, outs_motion={}, outs_occflow={}, command=None): + sdc_traj_query = outs_motion['sdc_traj_query'] + sdc_track_query = outs_motion['sdc_track_query'] + bev_pos = outs_motion['bev_pos'] + occ_mask = outs_occflow['seg_out'] + + outs_planning = self(bev_embed, occ_mask, bev_pos, sdc_traj_query, sdc_track_query, command) + return outs_planning + + def forward(self, + bev_embed, + occ_mask, + bev_pos, + sdc_traj_query, + sdc_track_query, + command): + """ + Forward pass for PlanningHeadSingleMode. + + Args: + bev_embed (torch.Tensor): Bird's eye view feature embedding. + occ_mask (torch.Tensor): Instance mask for occupancy. + bev_pos (torch.Tensor): BEV position. + sdc_traj_query (torch.Tensor): SDC trajectory query. + sdc_track_query (torch.Tensor): SDC track query. + command (int): Driving command. + + Returns: + dict: A dictionary containing SDC trajectory and all SDC trajectories. + """ + sdc_track_query = sdc_track_query.detach() + sdc_traj_query = sdc_traj_query[-1] + P = sdc_traj_query.shape[1] + sdc_track_query = sdc_track_query[:, None].expand(-1,P,-1) + + #import pdb;pdb.set_trace() + navi_embed = self.navi_embed.weight[command] + navi_embed = navi_embed[None].expand(-1,P,-1) + plan_query = torch.cat([sdc_traj_query, sdc_track_query, navi_embed], dim=-1) + + plan_query = self.mlp_fuser(plan_query).max(1, keepdim=True)[0] # expand, then fuse # [1, 6, 768] -> [1, 1, 256] + plan_query = rearrange(plan_query, 'b p c -> p b c') + + bev_pos = rearrange(bev_pos, 'b c h w -> (h w) b c') + bev_feat = bev_embed + bev_pos + + ##### Plugin adapter ##### + if self.with_adapter: + bev_feat = rearrange(bev_feat, '(h w) b c -> b c h w', h=self.bev_h, w=self.bev_w) + + bev_feat = bev_feat + self.bev_adapter(bev_feat) # residual connection + bev_feat = rearrange(bev_feat, 'b c h w -> (h w) b c') + ########################## + + pos_embed = self.pos_embed.weight + plan_query = plan_query + pos_embed[None] # [1, 1, 256] + + # plan_query: [1, 1, 256] + # bev_feat: [40000, 1, 256] + plan_query = self.attn_module(plan_query, bev_feat) # [1, 1, 256] + + sdc_traj_all = self.reg_branch(plan_query).view((-1, self.planning_steps, 2)) + sdc_traj_all[...,:2] = torch.cumsum(sdc_traj_all[...,:2], dim=1) + sdc_traj_all[0] = bivariate_gaussian_activation(sdc_traj_all[0]) + if self.use_col_optim and not self.training: + # post process, only used when testing + assert occ_mask is not None + sdc_traj_all = self.collision_optimization(sdc_traj_all, occ_mask) + + return dict( + sdc_traj=sdc_traj_all, + sdc_traj_all=sdc_traj_all, + ) + + def collision_optimization(self, sdc_traj_all, occ_mask): + """ + Optimize SDC trajectory with occupancy instance mask. + + Args: + sdc_traj_all (torch.Tensor): SDC trajectory tensor. + occ_mask (torch.Tensor): Occupancy flow instance mask. + Returns: + torch.Tensor: Optimized SDC trajectory tensor. 
+ """ + pos_xy_t = [] + valid_occupancy_num = 0 + + if occ_mask.shape[2] == 1: + occ_mask = occ_mask.squeeze(2) + occ_horizon = occ_mask.shape[1] + assert occ_horizon == 5 + + for t in range(self.planning_steps): + cur_t = min(t+1, occ_horizon-1) + pos_xy = torch.nonzero(occ_mask[0][cur_t], as_tuple=False) + pos_xy = pos_xy[:, [1, 0]] + pos_xy[:, 0] = (pos_xy[:, 0] - self.bev_h//2) * 0.5 + 0.25 + pos_xy[:, 1] = (pos_xy[:, 1] - self.bev_w//2) * 0.5 + 0.25 + + # filter the occupancy in range + keep_index = torch.sum((sdc_traj_all[0, t, :2][None, :] - pos_xy[:, :2])**2, axis=-1) < self.occ_filter_range**2 + pos_xy_t.append(pos_xy[keep_index].cpu().detach().numpy()) + valid_occupancy_num += torch.sum(keep_index>0) + if valid_occupancy_num == 0: + return sdc_traj_all + + col_optimizer = CollisionNonlinearOptimizer(self.planning_steps, 0.5, self.sigma, self.alpha_collision, pos_xy_t) + col_optimizer.set_reference_trajectory(sdc_traj_all[0].cpu().detach().numpy()) + sol = col_optimizer.solve() + sdc_traj_optim = np.stack([sol.value(col_optimizer.position_x), sol.value(col_optimizer.position_y)], axis=-1) + return torch.tensor(sdc_traj_optim[None], device=sdc_traj_all.device, dtype=sdc_traj_all.dtype) + + def loss(self, sdc_planning, sdc_planning_mask, outs_planning, future_gt_bbox=None): + sdc_traj_all = outs_planning['sdc_traj_all'] # b, p, t, 5 + loss_dict = dict() + for i in range(len(self.loss_collision)): + loss_collision = self.loss_collision[i](sdc_traj_all, sdc_planning[0, :, :self.planning_steps, :3], torch.any(sdc_planning_mask[0, :, :self.planning_steps], dim=-1), future_gt_bbox[0][1:self.planning_steps+1]) + loss_dict[f'loss_collision_{i}'] = loss_collision + loss_ade = self.loss_planning(sdc_traj_all, sdc_planning[0, :, :self.planning_steps, :2], torch.any(sdc_planning_mask[0, :, :self.planning_steps], dim=-1)) + loss_dict.update(dict(loss_ade=loss_ade)) + return loss_dict \ No newline at end of file diff --git a/mmcv/models/dense_heads/planning_head_plugin/__init__.py b/mmcv/models/dense_heads/planning_head_plugin/__init__.py new file mode 100644 index 0000000..57e8627 --- /dev/null +++ b/mmcv/models/dense_heads/planning_head_plugin/__init__.py @@ -0,0 +1,4 @@ +# from .collision_optimization import * +from .planning_metrics import * +from .collision_optimization import * +from .metric_stp3 import * \ No newline at end of file diff --git a/mmcv/models/dense_heads/planning_head_plugin/collision_optimization.py b/mmcv/models/dense_heads/planning_head_plugin/collision_optimization.py new file mode 100644 index 0000000..bf01e49 --- /dev/null +++ b/mmcv/models/dense_heads/planning_head_plugin/collision_optimization.py @@ -0,0 +1,116 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import numpy.typing as npt +from casadi import DM, Opti, OptiSol, cos, diff, sin, sumsqr, vertcat, exp + +Pose = Tuple[float, float, float] # (x, y, yaw) + + +class CollisionNonlinearOptimizer: + """ + Optimize planned trajectory with predicted occupancy + Solved with direct multiple-shooting. 
+ modified from https://github.com/motional/nuplan-devkit + :param trajectory_len: trajectory length + :param dt: timestep (sec) + """ + + def __init__(self, trajectory_len: int, dt: float, sigma, alpha_collision, obj_pixel_pos): + """ + :param trajectory_len: the length of trajectory to be optimized. + :param dt: the time interval between trajectory points. + """ + self.dt = dt + self.trajectory_len = trajectory_len + self.current_index = 0 + self.sigma = sigma + self.alpha_collision = alpha_collision + self.obj_pixel_pos = obj_pixel_pos + # Use a array of dts to make it compatible to situations with varying dts across different time steps. + self._dts: npt.NDArray[np.float32] = np.asarray([[dt] * trajectory_len]) + self._init_optimization() + + def _init_optimization(self) -> None: + """ + Initialize related variables and constraints for optimization. + """ + self.nx = 2 # state dim + + self._optimizer = Opti() # Optimization problem + self._create_decision_variables() + self._create_parameters() + self._set_objective() + + # Set default solver options (quiet) + self._optimizer.solver("ipopt", {"ipopt.print_level": 0, "print_time": 0, "ipopt.sb": "yes"}) + + def set_reference_trajectory(self, reference_trajectory: Sequence[Pose]) -> None: + """ + Set the reference trajectory that the smoother is trying to loosely track. + :param x_curr: current state of size nx (x, y) + :param reference_trajectory: N x 3 reference, where the second dim is for (x, y) + """ + self._optimizer.set_value(self.ref_traj, DM(reference_trajectory).T) + self._set_initial_guess(reference_trajectory) + + def set_solver_optimizerons(self, options: Dict[str, Any]) -> None: + """ + Control solver options including verbosity. + :param options: Dictionary containing optimization criterias + """ + self._optimizer.solver("ipopt", options) + + def solve(self) -> OptiSol: + """ + Solve the optimization problem. Assumes the reference trajectory was already set. + :return Casadi optimization class + """ + return self._optimizer.solve() + + def _create_decision_variables(self) -> None: + """ + Define the decision variables for the trajectory optimization. + """ + # State trajectory (x, y) + self.state = self._optimizer.variable(self.nx, self.trajectory_len) + self.position_x = self.state[0, :] + self.position_y = self.state[1, :] + + def _create_parameters(self) -> None: + """ + Define the expert trjactory and current position for the trajectory optimizaiton. + """ + self.ref_traj = self._optimizer.parameter(2, self.trajectory_len) # (x, y) + + def _set_objective(self) -> None: + """Set the objective function. 
Use care when modifying these weights.""" + # Follow reference, minimize control rates and absolute inputs + alpha_xy = 1.0 + cost_stage = ( + alpha_xy * sumsqr(self.ref_traj[:2, :] - vertcat(self.position_x, self.position_y)) + ) + + alpha_collision = self.alpha_collision + + cost_collision = 0 + normalizer = 1/(2.507*self.sigma) + # TODO: vectorize this + for t in range(len(self.obj_pixel_pos)): + x, y = self.position_x[t], self.position_y[t] + for i in range(len(self.obj_pixel_pos[t])): + col_x, col_y = self.obj_pixel_pos[t][i] + cost_collision += alpha_collision * normalizer * exp(-((x - col_x)**2 + (y - col_y)**2)/2/self.sigma**2) + self._optimizer.minimize(cost_stage + cost_collision) + + def _set_initial_guess(self, reference_trajectory: Sequence[Pose]) -> None: + """Set a warm-start for the solver based on the reference trajectory.""" + # Initialize state guess based on reference + self._optimizer.set_initial(self.state[:2, :], DM(reference_trajectory).T) # (x, y, yaw) + diff --git a/mmcv/models/dense_heads/planning_head_plugin/metric_stp3.py b/mmcv/models/dense_heads/planning_head_plugin/metric_stp3.py new file mode 100644 index 0000000..e70f809 --- /dev/null +++ b/mmcv/models/dense_heads/planning_head_plugin/metric_stp3.py @@ -0,0 +1,337 @@ +''' +calculate planner metric same as stp3 +''' +import numpy as np +import torch +import cv2 +import copy +import matplotlib.pyplot as plt +from skimage.draw import polygon +from nuscenes.utils.data_classes import Box +from scipy.spatial.transform import Rotation as R + +ego_width, ego_length = 1.85, 4.084 + +class PlanningMetric(): + def __init__(self): + super().__init__() + self.X_BOUND = [-50.0, 50.0, 0.5] # Forward + self.Y_BOUND = [-50.0, 50.0, 0.5] # Sides + self.Z_BOUND = [-10.0, 10.0, 20.0] # Height + dx, bx, _ = self.gen_dx_bx(self.X_BOUND, self.Y_BOUND, self.Z_BOUND) + self.dx, self.bx = dx[:2], bx[:2] + + bev_resolution, bev_start_position, bev_dimension = self.calculate_birds_eye_view_parameters( + self.X_BOUND, self.Y_BOUND, self.Z_BOUND + ) + self.bev_resolution = bev_resolution.numpy() + self.bev_start_position = bev_start_position.numpy() + self.bev_dimension = bev_dimension.numpy() + + self.W = ego_width + self.H = ego_length + + self.category_index = { + 'human':[2,3,4,5,6,7,8], + 'vehicle':[14,15,16,17,18,19,20,21,22,23] + } + + # self.n_future = n_future + + # self.add_state("obj_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + # self.add_state("obj_box_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + # self.add_state("L2", default=torch.zeros(self.n_future),dist_reduce_fx="sum") + # self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") + + def gen_dx_bx(self, xbound, ybound, zbound): + dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) + bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) + nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) + + return dx, bx, nx + + def calculate_birds_eye_view_parameters(self, x_bounds, y_bounds, z_bounds): + """ + Parameters + ---------- + x_bounds: Forward direction in the ego-car. 
+ y_bounds: Sides + z_bounds: Height + + Returns + ------- + bev_resolution: Bird's-eye view bev_resolution + bev_start_position Bird's-eye view first element + bev_dimension Bird's-eye view tensor spatial dimension + """ + bev_resolution = torch.tensor([row[2] for row in [x_bounds, y_bounds, z_bounds]]) + bev_start_position = torch.tensor([row[0] + row[2] / 2.0 for row in [x_bounds, y_bounds, z_bounds]]) + bev_dimension = torch.tensor([(row[1] - row[0]) / row[2] for row in [x_bounds, y_bounds, z_bounds]], + dtype=torch.long) + + return bev_resolution, bev_start_position, bev_dimension + + def get_label( + self, + gt_agent_boxes, + gt_agent_feats + ): + segmentation_np, pedestrian_np = self.get_birds_eye_view_label(gt_agent_boxes,gt_agent_feats) + segmentation = torch.from_numpy(segmentation_np).long().unsqueeze(0) + pedestrian = torch.from_numpy(pedestrian_np).long().unsqueeze(0) + + return segmentation, pedestrian + + def get_birds_eye_view_label( + self, + gt_agent_boxes, + gt_agent_feats + ): + ''' + gt_agent_boxes (LiDARInstance3DBoxes): list of GT Bboxs. + dim 9 = (x,y,z)+(w,l,h)+yaw+(vx,vy) + gt_agent_feats: (B, A, 34) + dim 34 = fut_traj(6*2) + fut_mask(6) + goal(1) + lcf_feat(9) + fut_yaw(6) + lcf_feat (x, y, yaw, vx, vy, width, length, height, type) + ego_lcf_feats: (B, 9) + dim 8 = (vx, vy, ax, ay, w, length, width, vel, steer) + ''' + T = 6 + segmentation = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) + pedestrian = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) + agent_num = gt_agent_feats.shape[1] + + gt_agent_boxes = gt_agent_boxes.tensor.cpu().numpy() #(N, 9) + gt_agent_feats = gt_agent_feats.cpu().numpy() + + gt_agent_fut_trajs = gt_agent_feats[..., :T*2].reshape(-1, 6, 2) + gt_agent_fut_mask = gt_agent_feats[..., T*2:T*3].reshape(-1, 6) + # gt_agent_lcf_feat = gt_agent_feats[..., T*3+1:T*3+10].reshape(-1, 9) + gt_agent_fut_yaw = gt_agent_feats[..., T*3+10:T*4+10].reshape(-1, 6, 1) + gt_agent_fut_trajs = np.cumsum(gt_agent_fut_trajs, axis=1) + gt_agent_fut_yaw = np.cumsum(gt_agent_fut_yaw, axis=1) + + gt_agent_boxes[:,6:7] = -1*(gt_agent_boxes[:,6:7] + np.pi/2) # NOTE: convert yaw to lidar frame + gt_agent_fut_trajs = gt_agent_fut_trajs + gt_agent_boxes[:, np.newaxis, 0:2] + gt_agent_fut_yaw = gt_agent_fut_yaw + gt_agent_boxes[:, np.newaxis, 6:7] + + for t in range(T): + for i in range(agent_num): + if gt_agent_fut_mask[i][t] == 1: + # Filter out all non vehicle instances + category_index = int(gt_agent_feats[0,i][27]) + agent_length, agent_width = gt_agent_boxes[i][4], gt_agent_boxes[i][3] + x_a = gt_agent_fut_trajs[i, t, 0] + y_a = gt_agent_fut_trajs[i, t, 1] + yaw_a = gt_agent_fut_yaw[i, t, 0] + param = [x_a,y_a,yaw_a,agent_length, agent_width] + if (category_index in self.category_index['vehicle']): + poly_region = self._get_poly_region_in_image(param) + cv2.fillPoly(segmentation[t], [poly_region], 1.0) + if (category_index in self.category_index['human']): + poly_region = self._get_poly_region_in_image(param) + cv2.fillPoly(pedestrian[t], [poly_region], 1.0) + + # vis for debug + # plt.figure('debug') + # for i in range(T): + # plt.subplot(2,T,i+1) + # plt.imshow(segmentation[i]) + # plt.subplot(2,T,i+1+T) + # plt.imshow(pedestrian[i]) + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/car_ped_occ.jpg') + # plt.close() + + return segmentation, pedestrian + + def _get_poly_region_in_image(self,param): + lidar2cv_rot = np.array([[1,0], [0,-1]]) + x_a,y_a,yaw_a,agent_length, agent_width = param + trans_a = np.array([[x_a,y_a]]).T + 
rot_mat_a = np.array([[np.cos(yaw_a), -np.sin(yaw_a)], + [np.sin(yaw_a), np.cos(yaw_a)]]) + agent_corner = np.array([ + [agent_length/2, -agent_length/2, -agent_length/2, agent_length/2], + [agent_width/2, agent_width/2, -agent_width/2, -agent_width/2]]) #(2,4) + agent_corner_lidar = np.matmul(rot_mat_a, agent_corner) + trans_a #(2,4) + # convert to cv frame + agent_corner_cv2 = (np.matmul(lidar2cv_rot, agent_corner_lidar) \ + - self.bev_start_position[:2,None] + self.bev_resolution[:2,None] / 2.0).T / self.bev_resolution[:2] #(4,2) + agent_corner_cv2 = np.round(agent_corner_cv2).astype(np.int32) + + return agent_corner_cv2 + + + def evaluate_single_coll(self, traj, segmentation, input_gt): + ''' + traj: torch.Tensor (n_future, 2) + 自车lidar系为轨迹参考系 + ^ y + | + | + 0-------> + x + segmentation: torch.Tensor (n_future, 200, 200) + ''' + pts = np.array([ + [-self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, -self.W / 2.], + [-self.H / 2. + 0.5, -self.W / 2.], + ]) + pts = (pts - self.bx.cpu().numpy()) / (self.dx.cpu().numpy()) + pts[:, [0, 1]] = pts[:, [1, 0]] + rr, cc = polygon(pts[:,1], pts[:,0]) + rc = np.concatenate([rr[:,None], cc[:,None]], axis=-1) + + n_future, _ = traj.shape + trajs = traj.view(n_future, 1, 2) + # 轨迹坐标系转换为: + # ^ x + # | + # | + # 0-------> y + trajs_ = copy.deepcopy(trajs) + trajs_[:,:,[0,1]] = trajs_[:,:,[1,0]] # can also change original tensor + trajs_ = trajs_ / self.dx.to(trajs.device) + trajs_ = trajs_.cpu().numpy() + rc # (n_future, 32, 2) + + r = (self.bev_dimension[0] - trajs_[:,:,0]).astype(np.int32) + r = np.clip(r, 0, self.bev_dimension[0] - 1) + + c = trajs_[:,:,1].astype(np.int32) + c = np.clip(c, 0, self.bev_dimension[1] - 1) + + collision = np.full(n_future, False) + for t in range(n_future): + rr = r[t] + cc = c[t] + I = np.logical_and( + np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + ) + collision[t] = np.any(segmentation[t, rr[I], cc[I]].cpu().numpy()) + + # vis for debug + # obs_occ = copy.deepcopy(segmentation) + # ego_occ = torch.zeros_like(obs_occ) + # for t in range(n_future): + # rr = r[t] + # cc = c[t] + # I = np.logical_and( + # np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + # np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + # ) + # ego_occ[t, rr[I], cc[I]]=1 + + # plt.figure() + # for i in range(6): + # plt.subplot(2,6,i+1) + # plt.imshow(obs_occ[i]) + # plt.subplot(2,6,i+7) + # plt.imshow(ego_occ[i]) + # if input_gt: + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/occ_metric_stp3_gt.jpg') + # else: + # plt.savefig('/home/users/qing01.xu/bevformer/debug_figs/occ_metric_stp3_pred.jpg') + # plt.close() + + return torch.from_numpy(collision).to(device=traj.device) + + def evaluate_coll( + self, + trajs, + gt_trajs, + segmentation + ): + ''' + trajs: torch.Tensor (B, n_future, 2) + 自车lidar系为轨迹参考系 + ^ y + | + | + 0-------> + x + gt_trajs: torch.Tensor (B, n_future, 2) + segmentation: torch.Tensor (B, n_future, 200, 200) + + ''' + gt_trajs = gt_trajs.to(device=trajs.device) + B, n_future, _ = trajs.shape + # trajs = trajs * torch.tensor([-1, 1], device=trajs.device) + # gt_trajs = gt_trajs * torch.tensor([-1, 1], device=gt_trajs.device) + + obj_coll_sum = torch.zeros(n_future, device=segmentation.device) + obj_box_coll_sum = torch.zeros(n_future, device=segmentation.device) + + for i in range(B): + gt_box_coll = self.evaluate_single_coll(gt_trajs[i], segmentation[i], input_gt=True) + + xx, yy = trajs[i,:,0], trajs[i, :, 
1] + # lidar系下的轨迹转换到图片坐标系下 + xi = ((-self.bx[0]/2 - yy) / self.dx[0]).long() + yi = ((-self.bx[1]/2 + xx) / self.dx[1]).long() + + m1 = torch.logical_and( + torch.logical_and(xi >= 0, xi < self.bev_dimension[0]), + torch.logical_and(yi >= 0, yi < self.bev_dimension[1]), + ).to(gt_box_coll.device) + m1 = torch.logical_and(m1, torch.logical_not(gt_box_coll)) + #import pdb;pdb.set_trace() + + ti = torch.arange(n_future) + obj_coll_sum[ti[m1]] += segmentation[i, ti[m1], xi[m1], yi[m1]].long() + + m2 = torch.logical_not(gt_box_coll) + box_coll = self.evaluate_single_coll(trajs[i], segmentation[i], input_gt=False).to(ti.device) + obj_box_coll_sum[ti[m2]] += (box_coll[ti[m2]]).long() + + return obj_coll_sum, obj_box_coll_sum + + def compute_L2(self, trajs, gt_trajs): + ''' + trajs: torch.Tensor (n_future, 2) + gt_trajs: torch.Tensor (n_future, 2) + ''' + # return torch.sqrt(((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2).sum(dim=-1)) + pred_len = trajs.shape[0] + ade = float( + sum( + torch.sqrt( + (trajs[i, 0] - gt_trajs[i, 0]) ** 2 + + (trajs[i, 1] - gt_trajs[i, 1]) ** 2 + ) + for i in range(pred_len) + ) + / pred_len + ) + + return ade + + # def update(self, trajs, gt_trajs, segmentation): + # ''' + # trajs: torch.Tensor (B, n_future, 3) + # gt_trajs: torch.Tensor (B, n_future, 3) + # segmentation: torch.Tensor (B, n_future, 200, 200) + # ''' + # assert trajs.shape == gt_trajs.shape + # L2 = self.compute_L2(trajs, gt_trajs) + # obj_coll_sum, obj_box_coll_sum = self.evaluate_coll(trajs[:,:,:2], gt_trajs[:,:,:2], segmentation) + + # if torch.isnan(L2).max().item(): + # debug = 1 + # else: + # self.obj_col += obj_coll_sum + # self.obj_box_col += obj_box_coll_sum + # self.L2 += L2.sum(dim=0) + # if torch.isnan(self.L2).max().item(): + # debug=1 + # self.total +=len(trajs) + + + # def compute(self): + # return { + # 'obj_col': self.obj_col / self.total, + # 'obj_box_col': self.obj_box_col / self.total, + # 'L2' : self.L2 / self.total + # } \ No newline at end of file diff --git a/mmcv/models/dense_heads/planning_head_plugin/planning_metrics.py b/mmcv/models/dense_heads/planning_head_plugin/planning_metrics.py new file mode 100644 index 0000000..d598bb2 --- /dev/null +++ b/mmcv/models/dense_heads/planning_head_plugin/planning_metrics.py @@ -0,0 +1,147 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. 
# +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +import numpy as np +from skimage.draw import polygon +from mmcv.metrics.metric import Metric +from ..occ_head_plugin import calculate_birds_eye_view_parameters, gen_dx_bx + + +class UniADPlanningMetric(Metric): + def __init__( + self, + n_future=6, + ): + super().__init__() + dx, bx, _ = gen_dx_bx([-50.0, 50.0, 0.5], [-50.0, 50.0, 0.5], [-10.0, 10.0, 20.0]) + dx, bx = dx[:2], bx[:2] + self.dx = nn.Parameter(dx, requires_grad=False) + self.bx = nn.Parameter(bx, requires_grad=False) + + _, _, self.bev_dimension = calculate_birds_eye_view_parameters( + [-50.0, 50.0, 0.5], [-50.0, 50.0, 0.5], [-10.0, 10.0, 20.0] + ) + self.bev_dimension = self.bev_dimension.numpy() + + self.W = 1.85 + self.H = 4.084 + + self.n_future = n_future + + self.add_state("obj_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + self.add_state("obj_box_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") + self.add_state("L2", default=torch.zeros(self.n_future),dist_reduce_fx="sum") + self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") + + + def evaluate_single_coll(self, traj, segmentation): + ''' + gt_segmentation + traj: torch.Tensor (n_future, 2) + segmentation: torch.Tensor (n_future, 200, 200) + ''' + pts = np.array([ + [-self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, self.W / 2.], + [self.H / 2. + 0.5, -self.W / 2.], + [-self.H / 2. + 0.5, -self.W / 2.], + ]) + pts = (pts - self.bx.cpu().numpy()) / (self.dx.cpu().numpy()) + pts[:, [0, 1]] = pts[:, [1, 0]] + rr, cc = polygon(pts[:,1], pts[:,0]) + rc = np.concatenate([rr[:,None], cc[:,None]], axis=-1) + + n_future, _ = traj.shape + trajs = traj.view(n_future, 1, 2) + trajs[:,:,[0,1]] = trajs[:,:,[1,0]] # can also change original tensor + trajs = trajs / self.dx + trajs = trajs.cpu().numpy() + rc # (n_future, 32, 2) + + r = trajs[:,:,0].astype(np.int32) + r = np.clip(r, 0, self.bev_dimension[0] - 1) + + c = trajs[:,:,1].astype(np.int32) + c = np.clip(c, 0, self.bev_dimension[1] - 1) + + collision = np.full(n_future, False) + for t in range(n_future): + rr = r[t] + cc = c[t] + I = np.logical_and( + np.logical_and(rr >= 0, rr < self.bev_dimension[0]), + np.logical_and(cc >= 0, cc < self.bev_dimension[1]), + ) + collision[t] = np.any(segmentation[t, rr[I], cc[I]].cpu().numpy()) + + return torch.from_numpy(collision).to(device=traj.device) + + def evaluate_coll(self, trajs, gt_trajs, segmentation): + ''' + trajs: torch.Tensor (B, n_future, 2) + gt_trajs: torch.Tensor (B, n_future, 2) + segmentation: torch.Tensor (B, n_future, 200, 200) + ''' + B, n_future, _ = trajs.shape + trajs = trajs * torch.tensor([-1, 1], device=trajs.device) + gt_trajs = gt_trajs * torch.tensor([-1, 1], device=gt_trajs.device) + + obj_coll_sum = torch.zeros(n_future, device=segmentation.device) + obj_box_coll_sum = torch.zeros(n_future, device=segmentation.device) + + for i in range(B): + gt_box_coll = self.evaluate_single_coll(gt_trajs[i], segmentation[i]) + + xx, yy = trajs[i,:,0], trajs[i, :, 1] + yi = ((yy - self.bx[0]) / self.dx[0]).long() + xi = ((xx - self.bx[1]) / self.dx[1]).long() + + m1 = torch.logical_and( + torch.logical_and(yi >= 0, yi < self.bev_dimension[0]), + torch.logical_and(xi >= 0, xi < self.bev_dimension[1]), + ) + m1 = torch.logical_and(m1, torch.logical_not(gt_box_coll)) + + ti = torch.arange(n_future, device=m1.device) + obj_coll_sum[ti[m1]] += segmentation[i, ti[m1], yi[m1], xi[m1]].long() + + 
m2 = torch.logical_not(gt_box_coll) + box_coll = self.evaluate_single_coll(trajs[i], segmentation[i]) + obj_box_coll_sum[ti[m2]] += (box_coll[ti[m2]]).long() + + return obj_coll_sum, obj_box_coll_sum + + def compute_L2(self, trajs, gt_trajs, gt_trajs_mask): + ''' + trajs: torch.Tensor (B, n_future, 3) + gt_trajs: torch.Tensor (B, n_future, 3) + ''' + return torch.sqrt((((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2) * gt_trajs_mask).sum(dim=-1)) + + def update(self, trajs, gt_trajs, gt_trajs_mask, segmentation): + ''' + trajs: torch.Tensor (B, n_future, 3) + gt_trajs: torch.Tensor (B, n_future, 3) + segmentation: torch.Tensor (B, n_future, 200, 200) + ''' + assert trajs.shape == gt_trajs.shape + trajs[..., 0] = - trajs[..., 0] + gt_trajs[..., 0] = - gt_trajs[..., 0] + L2 = self.compute_L2(trajs, gt_trajs, gt_trajs_mask) + obj_coll_sum, obj_box_coll_sum = self.evaluate_coll(trajs[:,:,:2], gt_trajs[:,:,:2], segmentation) + + self.obj_col += obj_coll_sum + self.obj_box_col += obj_box_coll_sum + self.L2 += L2.sum(dim=0) + self.total +=len(trajs) + + def compute(self): + return { + 'obj_col': self.obj_col / self.total, + 'obj_box_col': self.obj_box_col / self.total, + 'L2' : self.L2 / self.total + } \ No newline at end of file diff --git a/mmcv/models/dense_heads/rpn_head.py b/mmcv/models/dense_heads/rpn_head.py new file mode 100644 index 0000000..aa3f56b --- /dev/null +++ b/mmcv/models/dense_heads/rpn_head.py @@ -0,0 +1,319 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.ops.nms import batched_nms +from mmcv.utils import force_fp32 + +from ..builder import HEADS +from .anchor_head import AnchorHead + + +@HEADS.register_module() +class RPNHead(AnchorHead): + """RPN head. + + Args: + in_channels (int): Number of channels in the input feature map. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ # noqa: W605 + + def __init__(self, + in_channels, + init_cfg=dict(type='Normal', layer='Conv2d', std=0.01), + **kwargs): + super(RPNHead, self).__init__( + 1, in_channels, init_cfg=init_cfg, **kwargs) + + def _init_layers(self): + """Initialize layers of the head.""" + self.rpn_conv = nn.Conv2d( + self.in_channels, self.feat_channels, 3, padding=1) + self.rpn_cls = nn.Conv2d(self.feat_channels, + self.num_anchors * self.cls_out_channels, 1) + self.rpn_reg = nn.Conv2d(self.feat_channels, self.num_anchors * 4, 1) + + def forward_single(self, x): + """Forward feature map of a single scale level.""" + x = self.rpn_conv(x) + x = F.relu(x, inplace=True) + rpn_cls_score = self.rpn_cls(x) + rpn_bbox_pred = self.rpn_reg(x) + return rpn_cls_score, rpn_bbox_pred + + def loss(self, + cls_scores, + bbox_preds, + gt_bboxes, + img_metas, + gt_bboxes_ignore=None): + """Compute losses of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
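+
+        Note:
+            The RPN head is class-agnostic, so no ``gt_labels`` are needed: the
+            parent ``AnchorHead.loss`` is called with the labels argument set to
+            ``None`` and the resulting losses are renamed to ``loss_rpn_cls`` and
+            ``loss_rpn_bbox`` to keep them distinct from the R-CNN head losses.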
+ """ + losses = super(RPNHead, self).loss( + cls_scores, + bbox_preds, + gt_bboxes, + None, + img_metas, + gt_bboxes_ignore=gt_bboxes_ignore) + return dict( + loss_rpn_cls=losses['loss_cls'], loss_rpn_bbox=losses['loss_bbox']) + + @force_fp32(apply_to=('cls_scores', 'bbox_preds')) + def get_bboxes(self, + cls_scores, + bbox_preds, + img_metas, + cfg=None, + rescale=False, + with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + Has shape (N, num_anchors * num_classes, H, W) + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_anchors * 4, H, W) + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + cfg (mmcv.Config | None): Test / postprocessing configuration, + if None, test_cfg would be used + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of the + corresponding box. + """ + assert with_nms, '``with_nms`` in RPNHead should always True' + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + result_list = [] + for img_id in range(len(img_metas)): + cls_score_list = [ + cls_scores[i][img_id].detach() for i in range(num_levels) + ] + bbox_pred_list = [ + bbox_preds[i][img_id].detach() for i in range(num_levels) + ] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score_list, bbox_pred_list, + mlvl_anchors, img_shape, + scale_factor, cfg, rescale) + result_list.append(proposals) + return result_list + + def _get_bboxes_single(self, + cls_scores, + bbox_preds, + mlvl_anchors, + img_shape, + scale_factor, + cfg, + rescale=False): + """Transform outputs for a single batch item into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores of all scale level + each item has shape (num_anchors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas of all + scale level, each item has shape (num_anchors * 4, H, W). + mlvl_anchors (list[Tensor]): Anchors of all scale level + each item has shape (num_total_anchors, 4). + img_shape (tuple[int]): Shape of the input image, + (height, width, 3). + scale_factor (ndarray): Scale factor of the image arrange as + (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + + Returns: + Tensor: Labeled boxes in shape (n, 5), where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. 
+ """ + cfg = self.test_cfg if cfg is None else cfg + cfg = copy.deepcopy(cfg) + # bboxes from different level should be independent during NMS, + # level_ids are used as labels for batched NMS to separate them + level_ids = [] + mlvl_scores = [] + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + rpn_cls_score = rpn_cls_score.permute(1, 2, 0) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(-1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(-1, 2) + # We set FG labels to [0, num_class-1] and BG label to + # num_class in RPN head since mmdet v2.5, which is unified to + # be consistent with other head since mmdet v2.0. In mmdet v2.0 + # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head. + scores = rpn_cls_score.softmax(dim=1)[:, 0] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4) + anchors = mlvl_anchors[idx] + if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre: + # sort is faster than topk + # _, topk_inds = scores.topk(cfg.nms_pre) + ranked_scores, rank_inds = scores.sort(descending=True) + topk_inds = rank_inds[:cfg.nms_pre] + scores = ranked_scores[:cfg.nms_pre] + rpn_bbox_pred = rpn_bbox_pred[topk_inds, :] + anchors = anchors[topk_inds, :] + mlvl_scores.append(scores) + mlvl_bbox_preds.append(rpn_bbox_pred) + mlvl_valid_anchors.append(anchors) + level_ids.append( + scores.new_full((scores.size(0), ), idx, dtype=torch.long)) + + scores = torch.cat(mlvl_scores) + anchors = torch.cat(mlvl_valid_anchors) + rpn_bbox_pred = torch.cat(mlvl_bbox_preds) + proposals = self.bbox_coder.decode( + anchors, rpn_bbox_pred, max_shape=img_shape) + ids = torch.cat(level_ids) + + if cfg.min_bbox_size > 0: + w = proposals[:, 2] - proposals[:, 0] + h = proposals[:, 3] - proposals[:, 1] + valid_mask = (w >= cfg.min_bbox_size) & (h >= cfg.min_bbox_size) + if not valid_mask.all(): + proposals = proposals[valid_mask] + scores = scores[valid_mask] + ids = ids[valid_mask] + if proposals.numel() > 0: + dets, keep = batched_nms(proposals, scores, ids, cfg.nms) + else: + return proposals.new_zeros(0, 5) + + return dets[:cfg.max_per_img] + + def onnx_export(self, x, img_metas): + """Test without augmentation. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + tuple[Tensor, Tensor]: dets of shape [N, num_det, 5] + and class labels of shape [N, num_det]. 
+ """ + cls_scores, bbox_preds = self(x) + + assert len(cls_scores) == len(bbox_preds) + num_levels = len(cls_scores) + + device = cls_scores[0].device + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_anchors = self.anchor_generator.grid_anchors( + featmap_sizes, device=device) + + cls_scores = [cls_scores[i].detach() for i in range(num_levels)] + bbox_preds = [bbox_preds[i].detach() for i in range(num_levels)] + + assert len( + img_metas + ) == 1, 'Only support one input image while in exporting to ONNX' + img_shapes = img_metas[0]['img_shape_for_onnx'] + + cfg = copy.deepcopy(self.test_cfg) + + mlvl_scores = [] + mlvl_bbox_preds = [] + mlvl_valid_anchors = [] + batch_size = cls_scores[0].shape[0] + nms_pre_tensor = torch.tensor( + cfg.nms_pre, device=cls_scores[0].device, dtype=torch.long) + for idx in range(len(cls_scores)): + rpn_cls_score = cls_scores[idx] + rpn_bbox_pred = bbox_preds[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + rpn_cls_score = rpn_cls_score.permute(0, 2, 3, 1) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.reshape(batch_size, -1) + scores = rpn_cls_score.sigmoid() + else: + rpn_cls_score = rpn_cls_score.reshape(batch_size, -1, 2) + # We set FG labels to [0, num_class-1] and BG label to + # num_class in RPN head since mmdet v2.5, which is unified to + # be consistent with other head since mmdet v2.0. In mmdet v2.0 + # to v2.4 we keep BG label as 0 and FG label as 1 in rpn head. + scores = rpn_cls_score.softmax(-1)[..., 0] + rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).reshape( + batch_size, -1, 4) + anchors = mlvl_anchors[idx] + anchors = anchors.expand_as(rpn_bbox_pred) + # Get top-k prediction + from mmcv.core.export import get_k_for_topk + nms_pre = get_k_for_topk(nms_pre_tensor, rpn_bbox_pred.shape[1]) + if nms_pre > 0: + _, topk_inds = scores.topk(nms_pre) + batch_inds = torch.arange(batch_size).view( + -1, 1).expand_as(topk_inds) + # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501 + # Mind k<=3480 in TensorRT for TopK + transformed_inds = scores.shape[1] * batch_inds + topk_inds + scores = scores.reshape(-1, 1)[transformed_inds].reshape( + batch_size, -1) + rpn_bbox_pred = rpn_bbox_pred.reshape( + -1, 4)[transformed_inds, :].reshape(batch_size, -1, 4) + anchors = anchors.reshape(-1, 4)[transformed_inds, :].reshape( + batch_size, -1, 4) + mlvl_scores.append(scores) + mlvl_bbox_preds.append(rpn_bbox_pred) + mlvl_valid_anchors.append(anchors) + + batch_mlvl_scores = torch.cat(mlvl_scores, dim=1) + batch_mlvl_anchors = torch.cat(mlvl_valid_anchors, dim=1) + batch_mlvl_rpn_bbox_pred = torch.cat(mlvl_bbox_preds, dim=1) + batch_mlvl_proposals = self.bbox_coder.decode( + batch_mlvl_anchors, batch_mlvl_rpn_bbox_pred, max_shape=img_shapes) + + # Use ONNX::NonMaxSuppression in deployment + from mmcv.core.export import add_dummy_nms_for_onnx + batch_mlvl_scores = batch_mlvl_scores.unsqueeze(2) + score_threshold = cfg.nms.get('score_thr', 0.0) + nms_pre = cfg.get('deploy_nms_pre', -1) + dets, _ = add_dummy_nms_for_onnx(batch_mlvl_proposals, + batch_mlvl_scores, cfg.max_per_img, + cfg.nms.iou_threshold, + score_threshold, nms_pre, + cfg.max_per_img) + return dets diff --git a/mmcv/models/dense_heads/seg_head_plugin/__init__.py b/mmcv/models/dense_heads/seg_head_plugin/__init__.py new file mode 100644 index 0000000..4e36f7d --- /dev/null +++ b/mmcv/models/dense_heads/seg_head_plugin/__init__.py @@ -0,0 +1,5 @@ +from .seg_detr_head import SegDETRHead +from 
.seg_mask_head import SegMaskHead +from .seg_deformable_transformer import SegDeformableTransformer +from .seg_assigner import * +from .seg_utils import * \ No newline at end of file diff --git a/mmcv/models/dense_heads/seg_head_plugin/seg_assigner.py b/mmcv/models/dense_heads/seg_head_plugin/seg_assigner.py new file mode 100644 index 0000000..ee25dc2 --- /dev/null +++ b/mmcv/models/dense_heads/seg_head_plugin/seg_assigner.py @@ -0,0 +1,446 @@ +from mmcv.core import mask +import torch +from mmcv.core.bbox.assigners.base_assigner import BaseAssigner + +from mmcv.core.bbox.assigners.assign_result import AssignResult +from mmcv.core.bbox.transforms import bbox_cxcywh_to_xyxy +from mmcv.core.bbox.match_costs import build_match_cost +from mmcv.core.bbox.builder import BBOX_ASSIGNERS +try: + from scipy.optimize import linear_sum_assignment +except ImportError: + linear_sum_assignment = None + +from mmcv.core.bbox.samplers.base_sampler import BaseSampler +from mmcv.core.bbox.builder import BBOX_SAMPLERS +from mmcv.core import mask +import torch + +from mmcv.utils import util_mixins + + +INF = 10000000 + + +class SamplingResult_segformer(util_mixins.NiceRepr): + """ + """ + + def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, gt_masks,assign_result, + gt_flags): + self.pos_inds = pos_inds + self.neg_inds = neg_inds + self.pos_bboxes = bboxes[pos_inds] + self.neg_bboxes = bboxes[neg_inds] + self.pos_is_gt = gt_flags[pos_inds] + + self.num_gts = gt_bboxes.shape[0] + self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 + + + if gt_bboxes.numel() == 0: + # hack for index error case + assert self.pos_assigned_gt_inds.numel() == 0 + self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4) + + #print('pos_gt_bboxes',self.pos_gt_bboxes.shape) + #print('gt_mask',gt_masks.shape) + n,h,w = gt_masks.shape + #n = self.pos_gt_bboxes.shape[0] + self.pos_gt_masks = torch.empty_like(gt_masks).view(-1, h,w) + else: + if len(gt_bboxes.shape) < 2: + gt_bboxes = gt_bboxes.view(-1, 4) + + self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds, :] + self.pos_gt_masks = gt_masks[self.pos_assigned_gt_inds, :] + + if assign_result.labels is not None: + self.pos_gt_labels = assign_result.labels[pos_inds] + else: + self.pos_gt_labels = None + + @property + def bboxes(self): + """torch.Tensor: concatenated positive and negative boxes""" + return torch.cat([self.pos_bboxes, self.neg_bboxes]) + + + def to(self, device): + """Change the device of the data inplace. + + Example: + >>> self = SamplingResult.random() + >>> print(f'self = {self.to(None)}') + >>> # xdoctest: +REQUIRES(--gpu) + >>> print(f'self = {self.to(0)}') + """ + _dict = self.__dict__ + for key, value in _dict.items(): + if isinstance(value, torch.Tensor): + _dict[key] = value.to(device) + return self + + def __nice__(self): + data = self.info.copy() + data['pos_bboxes'] = data.pop('pos_bboxes').shape + data['neg_bboxes'] = data.pop('neg_bboxes').shape + parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] + body = ' ' + ',\n '.join(parts) + return '{\n' + body + '\n}' + + @property + def info(self): + """Returns a dictionary of info about the object.""" + return { + 'pos_inds': self.pos_inds, + 'neg_inds': self.neg_inds, + 'pos_bboxes': self.pos_bboxes, + 'neg_bboxes': self.neg_bboxes, + 'pos_is_gt': self.pos_is_gt, + 'num_gts': self.num_gts, + 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, + } + + @classmethod + def random(cls, rng=None, **kwargs): + """ + Args: + rng (None | int | numpy.random.RandomState): seed or state. 
+ kwargs (keyword arguments): + - num_preds: number of predicted boxes + - num_gts: number of true boxes + - p_ignore (float): probability of a predicted box assigned to \ + an ignored truth. + - p_assigned (float): probability of a predicted box not being \ + assigned. + - p_use_label (float | bool): with labels or not. + + Returns: + :obj:`SamplingResult`: Randomly generated sampling result. + + Example: + >>> from mmcv.core.bbox.samplers.sampling_result import * # NOQA + >>> self = SamplingResult.random() + >>> print(self.__dict__) + """ + from mmcv.core.bbox.samplers.random_sampler import RandomSampler + from mmcv.core.bbox.assigners.assign_result import AssignResult + from mmcv.core.bbox import demodata + rng = demodata.ensure_rng(rng) + + # make probabalistic? + num = 32 + pos_fraction = 0.5 + neg_pos_ub = -1 + + assign_result = AssignResult.random(rng=rng, **kwargs) + + # Note we could just compute an assignment + bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng) + gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng) + + if rng.rand() > 0.2: + # sometimes algorithms squeeze their data, be robust to that + gt_bboxes = gt_bboxes.squeeze() + bboxes = bboxes.squeeze() + + if assign_result.labels is None: + gt_labels = None + else: + gt_labels = None # todo + + if gt_labels is None: + add_gt_as_proposals = False + else: + add_gt_as_proposals = True # make probabalistic? + + sampler = RandomSampler( + num, + pos_fraction, + neg_pos_ub=neg_pos_ub, + add_gt_as_proposals=add_gt_as_proposals, + rng=rng) + self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels) + return self + + +@BBOX_SAMPLERS.register_module() +class PseudoSampler_segformer(BaseSampler): + """A pseudo sampler that does not do sampling actually.""" + + def __init__(self, **kwargs): + pass + + def _sample_pos(self, **kwargs): + """Sample positive samples.""" + raise NotImplementedError + + def _sample_neg(self, **kwargs): + """Sample negative samples.""" + raise NotImplementedError + + def sample(self, assign_result, bboxes, gt_bboxes,gt_masks, **kwargs): + """Directly returns the positive and negative indices of samples. + + Args: + assign_result (:obj:`AssignResult`): Assigned results + bboxes (torch.Tensor): Bounding boxes + gt_bboxes (torch.Tensor): Ground truth boxes + + Returns: + :obj:`SamplingResult`: sampler results + """ + pos_inds = torch.nonzero( + assign_result.gt_inds > 0, as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + assign_result.gt_inds == 0, as_tuple=False).squeeze(-1).unique() + gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8) + sampling_result = SamplingResult_segformer(pos_inds, neg_inds, bboxes, gt_bboxes,gt_masks, + assign_result, gt_flags,**kwargs) + return sampling_result + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner_filter(BaseAssigner): + """ + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0), + max_pos = 3 + ): + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.max_pos = max_pos + def assign(self, + bbox_pred, + cls_pred, + gt_bboxes, + gt_labels, + img_meta, + gt_bboxes_ignore=None, + eps=1e-7): + """ + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. 
assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + assigned_labels = bbox_pred.new_full((num_bboxes, ),-1,dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + assigned_gt_inds[:] = 0 + # No ground truth, assign all to background + pos_ind = assigned_gt_inds.gt(0).nonzero().squeeze(1) + neg_ind = assigned_gt_inds.eq(0).nonzero().squeeze(1) + # No ground truth, assign all to background + return pos_ind, neg_ind, AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + img_h, img_w, _ = img_meta['img_shape'] + factor = gt_bboxes.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + + # 2. compute the weighted costs + # classification and bboxcost. + + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + normalize_gt_bboxes = gt_bboxes / factor + reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes) + # regression iou cost, defaultly giou is used in official DETR. + bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor + iou_cost = self.iou_cost(bboxes, gt_bboxes) + # weighted sum of above three cost + + cost = cls_cost + reg_cost + iou_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + + assigned_gt_inds[:] = 0 + #index_set = [] + + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + result=None + for i in range(max(min(self.max_pos, 300//num_gts),1)): + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + #print(matched_row_inds) + + cost[matched_row_inds,:] = INF + #index_set.(matched_row_inds) + #print('this mathed row inds ', len(matched_row_inds), i) + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + if i == 0: + result = AssignResult(num_gts, assigned_gt_inds.clone(), None, labels=assigned_labels.clone()) + if cost[matched_row_inds.cpu(), matched_col_inds.cpu()].max()>=INF: + break + pos_ind = assigned_gt_inds.gt(0).nonzero().squeeze(1) + neg_ind = assigned_gt_inds.eq(0).nonzero().squeeze(1) + + return pos_ind, neg_ind, result + + + +@BBOX_ASSIGNERS.register_module() +class HungarianAssigner_multi_info(BaseAssigner): + """Computes one-to-one matching between predictions and ground truth. + + This class computes an assignment between the targets and the predictions + based on the costs. The costs are weighted sum of three components: + classification cost, regression L1 cost and regression iou cost. The + targets don't include the no_object, so generally there are more + predictions than targets. After the one-to-one matching, the un-matched + are treated as backgrounds. Thus each query prediction will be assigned + with `0` or a positive integer indicating the ground truth index: + + - 0: negative sample, no assigned gt + - positive integer: positive sample, index (1-based) of assigned gt + + Args: + cls_weight (int | float, optional): The scale factor for classification + cost. Default 1.0. + bbox_weight (int | float, optional): The scale factor for regression + L1 cost. Default 1.0. + iou_weight (int | float, optional): The scale factor for regression + iou cost. Default 1.0. + iou_calculator (dict | optional): The config for the iou calculation. 
+ Default type `BboxOverlaps2D`. + iou_mode (str | optional): "iou" (intersection over union), "iof" + (intersection over foreground), or "giou" (generalized + intersection over union). Default "giou". + """ + + def __init__(self, + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=1.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0), + mask_cost=dict(type='DiceCost', weight=1.0) + + ): + cls_cost['weight'] *= 2 + self.cls_cost = build_match_cost(cls_cost) + self.reg_cost = build_match_cost(reg_cost) + self.iou_cost = build_match_cost(iou_cost) + self.mask_cost = build_match_cost(mask_cost) + + + def assign(self, + bbox_pred, + cls_pred, + mask_pred, + gt_bboxes, + gt_labels, + gt_mask, + img_meta, + gt_bboxes_ignore=None, + eps=1e-7): + """Computes one-to-one matching based on the weighted costs. + + This method assign each query prediction to a ground truth or + background. The `assigned_gt_inds` with -1 means don't care, + 0 means negative sample, and positive number is the index (1-based) + of assigned gt. + The assignment is done in the following steps, the order matters. + + 1. assign every prediction to -1 + 2. compute the weighted costs + 3. do Hungarian matching on CPU based on the costs + 4. assign all to 0 (background) first, then for each matched pair + between predictions and gts, treat this prediction as foreground + and assign the corresponding gt index (plus 1) to it. + + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + cls_pred (Tensor): Predicted classification logits, shape + [num_query, num_class]. + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + img_meta (dict): Meta information for current image. + gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are + labelled as `ignored`. Default None. + eps (int | float, optional): A value added to the denominator for + numerical stability. Default 1e-7. + + Returns: + :obj:`AssignResult`: The assigned result. + """ + assert gt_bboxes_ignore is None, \ + 'Only case when gt_bboxes_ignore is None is supported.' + #print(bbox_pred.shape, cls_pred.shape,mask_pred.shape,gt_bboxes.shape,gt_labels.shape,gt_mask.shape) + num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) + + # 1. assign -1 by default + assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + assigned_labels = bbox_pred.new_full((num_bboxes, ), + -1, + dtype=torch.long) + + if num_gts == 0 or num_bboxes == 0: + # No ground truth or boxes, return empty assignment + if num_gts == 0: + # No ground truth, assign all to background + assigned_gt_inds[:] = 0 + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) + img_h, img_w, _ = img_meta['img_shape'] + + factor = bbox_pred.new_tensor([img_w, img_h, img_w,img_h]).unsqueeze(0) + + + # classification and bboxcost. + cls_cost = self.cls_cost(cls_pred, gt_labels) + # regression L1 cost + normalize_gt_bboxes = gt_bboxes / factor + reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes) + # regression iou cost, defaultly giou is used in official DETR. 
+ bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor + iou_cost = self.iou_cost(bboxes, gt_bboxes) + # weighted sum of above three costs + mask_cost = self.mask_cost(mask_pred,gt_mask) + # + cost = cls_cost + reg_cost + iou_cost + mask_cost + + # 3. do Hungarian matching on CPU using linear_sum_assignment + cost = cost.detach().cpu() + if linear_sum_assignment is None: + raise ImportError('Please run "pip install scipy" ' + 'to install scipy first.') + matched_row_inds, matched_col_inds = linear_sum_assignment(cost) + matched_row_inds = torch.from_numpy(matched_row_inds).to( + bbox_pred.device) + matched_col_inds = torch.from_numpy(matched_col_inds).to( + bbox_pred.device) + # 4. assign backgrounds and foregrounds + # assign all indices to backgrounds first + assigned_gt_inds[:] = 0 + # assign foregrounds based on matching results + + assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 + assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] + return AssignResult( + num_gts, assigned_gt_inds, None, labels=assigned_labels) \ No newline at end of file diff --git a/mmcv/models/dense_heads/seg_head_plugin/seg_deformable_transformer.py b/mmcv/models/dense_heads/seg_head_plugin/seg_deformable_transformer.py new file mode 100644 index 0000000..83c06af --- /dev/null +++ b/mmcv/models/dense_heads/seg_head_plugin/seg_deformable_transformer.py @@ -0,0 +1,385 @@ +from mmcv.utils import force_fp32 +from mmcv.models.utils.builder import TRANSFORMER +from mmcv.models.utils import Transformer +import warnings +import math +import copy +import torch +import torch.nn as nn +from mmcv.models.bricks import build_activation_layer, build_norm_layer +from mmcv.models.utils import xavier_init +from mmcv.models.bricks.registry import (TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.models.bricks.transformer import (BaseTransformerLayer, + MultiScaleDeformableAttention, + TransformerLayerSequence, + build_transformer_layer_sequence) +from mmcv.models.backbones.base_module import BaseModule +from torch.nn.init import normal_ + +from mmcv.models.utils.builder import TRANSFORMER +from mmcv.models.bricks.registry import ATTENTION +from torch import einsum + +from einops import rearrange, repeat +from einops.layers.torch import Rearrange + +# Copy-paste from defromable detr in mmcv. +@TRANSFORMER.register_module() +class SegDeformableTransformer(Transformer): + """Implements the DeformableDETR transformer. + + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + def __init__(self, + as_two_stage=False, + num_feature_levels=4, + two_stage_num_proposals=300, + **kwargs): + super(SegDeformableTransformer, self).__init__(**kwargs) + self.fp16_enabled = False + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + self.two_stage_num_proposals = two_stage_num_proposals + self.embed_dims = self.encoder.embed_dims + self.init_layers() + + def init_layers(self): + """Initialize layers of the DeformableDetrTransformer.""" + self.level_embeds = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) + self.enc_output_norm = nn.LayerNorm(self.embed_dims) + self.pos_trans = nn.Linear(self.embed_dims * 2, + self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points = nn.Linear(self.embed_dims, 2) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + try: + m.init_weight() + except: + m.init_weights() + if not self.as_two_stage: + xavier_init(self.reference_points, distribution='uniform', bias=0.) + normal_(self.level_embeds) + + def gen_encoder_output_proposals(self, memory, memory_padding_mask, + spatial_shapes): + """Generate proposals from encoded memory. + + Args: + memory (Tensor) : The output of encoder, + has shape (bs, num_key, embed_dim). num_key is + equal the number of points on feature map from + all level. + memory_padding_mask (Tensor): Padding mask for memory. + has shape (bs, num_key). + spatial_shapes (Tensor): The shape of all feature maps. + has shape (num_level, 2). + + Returns: + tuple: A tuple of feature map and bbox prediction. + + - output_memory (Tensor): The input of decoder, \ + has shape (bs, num_key, embed_dim). num_key is \ + equal the number of points on feature map from \ + all levels. + - output_proposals (Tensor): The normalized proposal \ + after a inverse sigmoid, has shape \ + (bs, num_keys, 4). 
+ """ + + N, S, C = memory.shape + proposals = [] + _cur = 0 + for lvl, (H, W) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view( + N, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, + H - 1, + H, + dtype=torch.float32, + device=memory.device), + torch.linspace(0, + W - 1, + W, + dtype=torch.float32, + device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), + valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(N, -1, 4) + proposals.append(proposal) + _cur += (H * W) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & + (output_proposals < 0.99)).all(-1, + keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float('inf')) + + output_memory = memory + output_memory = output_memory.masked_fill( + memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """Get the reference points used in decoder. + + Args: + spatial_shapes (Tensor): The shape of all + feature maps, has shape (num_level, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + device (obj:`device`): The device where + reference_points should be. + + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
+ """ + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + # TODO check this 0.5 + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, + H - 0.5, + H, + dtype=torch.float32, + device=device), + torch.linspace(0.5, + W - 0.5, + W, + dtype=torch.float32, + device=device)) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * + H) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * + W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def get_valid_ratio(self, mask): + """Get the valid radios of feature maps of all level.""" + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def get_proposal_pos_embed(self, + proposals, + num_pos_feats=128, + temperature=10000): + """Get the position embedding of proposal.""" + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, + dtype=torch.float32, + device=proposals.device) + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + dim=4).flatten(2) + return pos + + @force_fp32(apply_to=('mlvl_feats', 'query_embed', 'mlvl_pos_embeds')) + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + reg_branches=None, + cls_branches=None, + **kwargs): + """Forward function for `Transformer`. + + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, embed_dims, h, w]. + mlvl_masks (list(Tensor)): The key_padding_mask from + different level used for encoder and decoder, + each element has shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + mlvl_pos_embeds (list(Tensor)): The positional encoding + of feats from different level, has the shape + [bs, embed_dims, h, w]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when + `with_box_refine` is True. Default to None. + cls_branches (obj:`nn.ModuleList`): Classification heads + for feature maps from each decoder layer. Only would + be passed when `as_two_stage` + is True. Default to None. + + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). 
Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + assert self.as_two_stage or query_embed is not None + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, + dtype=torch.long, + device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = \ + self.get_reference_points(spatial_shapes, + valid_ratios, + device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( + 1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder(query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs) + + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = \ + reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + topk = self.two_stage_num_proposals + topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, + dim=1)[1] + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + init_reference_out = reference_points + pos_trans_out = self.pos_trans_norm( + self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + #print('query_embd',query_embed.shape, c) + # query_embed N *(2C) + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos).sigmoid() + init_reference_out = reference_points + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + **kwargs) + inter_references_out = inter_references + if self.as_two_stage: + return (memory,lvl_pos_embed_flatten,mask_flatten,query_pos), 
inter_states, init_reference_out,\ + inter_references_out, enc_outputs_class,\ + enc_outputs_coord_unact + return (memory,lvl_pos_embed_flatten,mask_flatten,query_pos), inter_states, init_reference_out, \ + inter_references_out, None, None \ No newline at end of file diff --git a/mmcv/models/dense_heads/seg_head_plugin/seg_detr_head.py b/mmcv/models/dense_heads/seg_head_plugin/seg_detr_head.py new file mode 100644 index 0000000..06d0352 --- /dev/null +++ b/mmcv/models/dense_heads/seg_head_plugin/seg_detr_head.py @@ -0,0 +1,689 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.bricks import Conv2d, Linear, build_activation_layer +from mmcv.models.bricks.transformer import FFN, build_positional_encoding +from mmcv.utils import force_fp32 + +from mmcv.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh +from mmcv.core.bbox.builder import build_assigner, build_sampler +from mmcv.core.utils import multi_apply, reduce_mean +from mmcv.models.utils import build_transformer + +from mmcv.models.dense_heads.anchor_free_head import AnchorFreeHead +from mmcv.models.builder import HEADS, build_loss + + +@HEADS.register_module() +class SegDETRHead( + AnchorFreeHead +): # I modify DETRHead to make it to support panoptic segmentation + """Implements the DETR transformer head. + + See `paper: End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_classes (int): Number of categories excluding the background. + in_channels (int): Number of channels in the input feature map. + num_query (int): Number of query in Transformer. + num_reg_fcs (int, optional): Number of fully-connected layers used in + `FFN`, which is then used for the regression head. Default 2. + transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. + Default: None. + sync_cls_avg_factor (bool): Whether to sync the avg_factor of + all ranks. Default to False. + positional_encoding (obj:`mmcv.ConfigDict`|dict): + Config for position encoding. + loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the + classification loss. Default `CrossEntropyLoss`. + loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the + regression loss. Default `L1Loss`. + loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the + regression iou loss. Default `GIoULoss`. + tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of + transformer head. + test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of + transformer head. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + _version = 2 + + def __init__( + self, + num_classes, + num_things_classes, + num_stuff_classes, + in_channels, + num_query=100, + num_reg_fcs=2, + transformer=None, + sync_cls_avg_factor=False, + positional_encoding=dict(type='SinePositionalEncoding', + num_feats=128, + normalize=True), + loss_cls=dict(type='CrossEntropyLoss', + bg_cls_weight=0.1, + use_sigmoid=False, + loss_weight=1.0, + class_weight=1.0), + loss_bbox=dict(type='L1Loss', loss_weight=5.0), + loss_iou=dict(type='GIoULoss', loss_weight=2.0), + train_cfg=dict(assigner=dict( + type='HungarianAssigner', + cls_cost=dict(type='ClassificationCost', weight=1.), + reg_cost=dict(type='BBoxL1Cost', weight=5.0), + iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))), + test_cfg=dict(max_per_img=100), + init_cfg=None, + **kwargs): + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since it brings inconvenience when the initialization of + # `AnchorFreeHead` is called. 
+ super(AnchorFreeHead, self).__init__(init_cfg) + self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + class_weight = loss_cls.get('class_weight', None) + if class_weight is not None and (self.__class__ is SegDETRHead): + assert isinstance(class_weight, float), 'Expected ' \ + 'class_weight to have type float. Found ' \ + f'{type(class_weight)}.' + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) + assert isinstance(bg_cls_weight, float), 'Expected ' \ + 'bg_cls_weight to have type float. Found ' \ + f'{type(bg_cls_weight)}.' + class_weight = torch.ones(num_things_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_things_classes] = bg_cls_weight + loss_cls.update({'class_weight': class_weight}) + if 'bg_cls_weight' in loss_cls: + loss_cls.pop('bg_cls_weight') + self.bg_cls_weight = bg_cls_weight + + if train_cfg: + assert 'assigner' in train_cfg, 'assigner should be provided '\ + 'when train_cfg is set.' + assigner = train_cfg['assigner'] + # assert loss_cls['loss_weight'] == assigner['cls_cost']['weight'], \ + # 'The classification weight for loss and matcher should be' \ + # 'exactly the same.' + # assert loss_bbox['loss_weight'] == assigner['reg_cost'][ + # 'weight'], 'The regression L1 weight for loss and matcher ' \ + # 'should be exactly the same.' + # assert loss_iou['loss_weight'] == assigner['iou_cost']['weight'], \ + # 'The regression iou weight for loss and matcher should be' \ + # 'exactly the same.' + self.assigner = build_assigner(assigner) + # DETR sampling=False, so use PseudoSampler + sampler_cfg = dict(type='PseudoSampler') + self.sampler = build_sampler(sampler_cfg, context=self) + self.num_query = num_query + self.num_classes = num_classes + self.num_things_classes = num_things_classes + self.num_stuff_classes = num_stuff_classes + self.in_channels = in_channels + self.num_reg_fcs = num_reg_fcs + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.fp16_enabled = False + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_iou = build_loss(loss_iou) + + if self.loss_cls.use_sigmoid: + self.cls_out_channels = num_things_classes + else: + self.cls_out_channels = num_things_classes + 1 + self.act_cfg = transformer.get('act_cfg', + dict(type='ReLU', inplace=True)) + self.activate = build_activation_layer(self.act_cfg) + self.positional_encoding = build_positional_encoding( + positional_encoding) + self.transformer = build_transformer(transformer) + self.embed_dims = self.transformer.embed_dims + assert 'num_feats' in positional_encoding + num_feats = positional_encoding['num_feats'] + assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ + f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ + f' and {num_feats}.' 
+ self._init_layers() + + def _init_layers(self): + """Initialize layers of the transformer head.""" + self.input_proj = Conv2d(self.in_channels, + self.embed_dims, + kernel_size=1) + self.fc_cls = Linear(self.embed_dims, self.cls_out_channels) + self.reg_ffn = FFN(self.embed_dims, + self.embed_dims, + self.num_reg_fcs, + self.act_cfg, + dropout=0.0, + add_residual=False) + self.fc_reg = Linear(self.embed_dims, 4) + self.query_embedding = nn.Embedding(self.num_query, self.embed_dims) + + def init_weights(self): + """Initialize weights of the transformer head.""" + # The initialization for transformer is important + self.transformer.init_weights() + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """load checkpoints.""" + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since `AnchorFreeHead._load_from_state_dict` should not be + # called here. Invoking the default `Module._load_from_state_dict` + # is enough. + + # Names of some parameters in has been changed. + version = local_metadata.get('version', None) + if (version is None or version < 2) and self.__class__ is SegDETRHead: + convert_dict = { + '.self_attn.': '.attentions.0.', + '.ffn.': '.ffns.0.', + '.multihead_attn.': '.attentions.1.', + '.decoder.norm.': '.decoder.post_norm.' + } + for k in state_dict.keys(): + for ori_key, convert_key in convert_dict.items(): + if ori_key in k: + convert_key = k.replace(ori_key, convert_key) + state_dict[convert_key] = state_dict[k] + del state_dict[k] + + super(AnchorFreeHead, + self)._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, + unexpected_keys, error_msgs) + + def forward(self, feats, img_metas): + """Forward function. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): List of image information. + + Returns: + tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels. + + - all_cls_scores_list (list[Tensor]): Classification scores \ + for each scale level. Each is a 4D-tensor with shape \ + [nb_dec, bs, num_query, cls_out_channels]. Note \ + `cls_out_channels` should includes background. + - all_bbox_preds_list (list[Tensor]): Sigmoid regression \ + outputs for each scale level. Each is a 4D-tensor with \ + normalized coordinate format (cx, cy, w, h) and shape \ + [nb_dec, bs, num_query, 4]. + """ + num_levels = len(feats) + img_metas_list = [img_metas for _ in range(num_levels)] + return multi_apply(self.forward_single, feats, img_metas_list) + + def forward_single(self, x, img_metas): + """"Forward function for a single feature level. + + Args: + x (Tensor): Input feature from backbone's single stage, shape + [bs, c, h, w]. + img_metas (list[dict]): List of image information. + + Returns: + all_cls_scores (Tensor): Outputs from the classification head, + shape [nb_dec, bs, num_query, cls_out_channels]. Note + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression + head with normalized coordinate format (cx, cy, w, h). + Shape [nb_dec, bs, num_query, 4]. + """ + # construct binary masks which used for the transformer. + # NOTE following the official DETR repo, non-zero values representing + # ignored positions, while zero values means valid positions. 
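+        # E.g. (illustrative numbers): with ``batch_input_shape`` (800, 1333) and an
+        # image whose ``img_shape`` is (750, 1200, 3), rows >= 750 and columns >= 1200
+        # of its mask stay 1 (padding, ignored) while the rest is set to 0 (valid).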
+ batch_size = x.size(0) + input_img_h, input_img_w = img_metas[0]['batch_input_shape'] + masks = x.new_ones((batch_size, input_img_h, input_img_w)) + for img_id in range(batch_size): + img_h, img_w, _ = img_metas[img_id]['img_shape'] + masks[img_id, :img_h, :img_w] = 0 + + x = self.input_proj(x) + # interpolate masks to have the same spatial shape with x + masks = F.interpolate(masks.unsqueeze(1), + size=x.shape[-2:]).to(torch.bool).squeeze(1) + # position encoding + pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w] + # outs_dec: [nb_dec, bs, num_query, embed_dim] + outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight, + pos_embed) + + all_cls_scores = self.fc_cls(outs_dec) + all_bbox_preds = self.fc_reg(self.activate( + self.reg_ffn(outs_dec))).sigmoid() + return all_cls_scores, all_bbox_preds + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list')) + def loss(self, + all_cls_scores_list, + all_bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore=None): + """"Loss function. + + Only outputs from the last feature level are used for computing + losses by default. + + Args: + all_cls_scores_list (list[Tensor]): Classification outputs + for each feature level. Each is a 4D-tensor with shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds_list (list[Tensor]): Sigmoid regression + outputs for each feature level. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + # NOTE defaultly only the outputs from the last feature scale is used. + all_cls_scores = all_cls_scores_list[-1] + all_bbox_preds = all_bbox_preds_list[-1] + assert gt_bboxes_ignore is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' 
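+        # Deep supervision: the GT lists are replicated once per decoder layer so
+        # that every layer gets its own cls / bbox / IoU loss; the last layer keeps
+        # the plain loss keys while earlier layers are prefixed with 'd{i}.'.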
+ + num_dec_layers = len(all_cls_scores) + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + img_metas_list = [img_metas for _ in range(num_dec_layers)] + + losses_cls, losses_bbox, losses_iou = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, img_metas_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + loss_dict['loss_iou'] = losses_iou[-1] + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1], + losses_bbox[:-1], + losses_iou[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i + num_dec_layer += 1 + return loss_dict + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
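+
+        Note:
+            Following DETR, the classification loss is averaged over
+            ``num_pos + bg_cls_weight * num_neg`` (synchronized across GPUs via
+            ``reduce_mean`` when ``sync_cls_avg_factor`` is set), while the L1
+            and GIoU losses are averaged over the mean number of positive
+            samples across GPUs (clamped to at least 1).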
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + img_metas, gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls(cls_scores, + labels, + label_weights, + avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # construct factors used for rescale bboxes + factors = [] + for img_meta, bbox_pred in zip(img_metas, bbox_preds): + img_h, img_w, _ = img_meta['img_shape'] + factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0).repeat( + bbox_pred.size(0), 1) + factors.append(factor) + factors = torch.cat(factors, 0) + + # DETR regress the relative position of boxes (cxcywh) in the image, + # thus the learning target is normalized by the image size. So here + # we need to re-scale them for calculating IoU loss + bbox_preds = bbox_preds.reshape(-1, 4) + bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors + bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors + + # regression IoU loss, defaultly GIoU loss + loss_iou = self.loss_iou(bboxes, + bboxes_gt, + bbox_weights, + avg_factor=num_total_pos) + + # regression L1 loss + loss_bbox = self.loss_bbox(bbox_preds, + bbox_targets, + bbox_weights, + avg_factor=num_total_pos) + return loss_cls, loss_bbox, loss_iou + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + img_metas, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + img_metas (list[dict]): List of image meta information. + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + + Returns: + tuple: a tuple containing the following targets. + + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. 
+ - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_bboxes, + gt_labels, + img_meta, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + + Outputs from a single decoder layer of a single feature level are used. + + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + img_meta (dict): Meta information for one image. + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + + Returns: + tuple[Tensor]: a tuple containing the following for one image. + + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + num_bboxes = bbox_pred.size(0) + # assigner and sampler + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, img_meta, + gt_bboxes_ignore) + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes, ), + self.num_things_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred) + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + img_h, img_w, _ = img_meta['img_shape'] + + # DETR regress the relative position of boxes (cxcywh) in the image. + # Thus the learning target should be normalized by the image size, also + # the box format should be converted from defaultly x1y1x2y2 to cxcywh. 
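(Editor's illustration, not part of the patch above.) The comment above states the DETR convention that regression targets live in normalized (cx, cy, w, h) space. The snippet below is a minimal, self-contained sketch of that conversion in plain PyTorch; the helper mirrors what `bbox_xyxy_to_cxcywh` and the `factor` tensor in the code that follows do, and the image size and box values are invented for illustration.

import torch

def xyxy_to_cxcywh(boxes):
    # same arithmetic as mmcv's bbox_xyxy_to_cxcywh: corners -> center/size
    x1, y1, x2, y2 = boxes.unbind(-1)
    return torch.stack([(x1 + x2) / 2, (y1 + y2) / 2, x2 - x1, y2 - y1], dim=-1)

# hypothetical image size and one GT box in absolute pixel coordinates
img_h, img_w = 480, 640
pos_gt_xyxy = torch.tensor([[100.0, 120.0, 300.0, 360.0]])

factor = pos_gt_xyxy.new_tensor([img_w, img_h, img_w, img_h])   # per-coordinate scale
bbox_target = xyxy_to_cxcywh(pos_gt_xyxy / factor)               # normalized (cx, cy, w, h)
print(bbox_target)   # tensor([[0.3125, 0.5000, 0.3125, 0.5000]])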
+ factor = bbox_pred.new_tensor([img_w, img_h, img_w, + img_h]).unsqueeze(0) + pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor + pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized) + bbox_targets[pos_inds] = pos_gt_bboxes_targets + return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, + neg_inds) + + # over-write because img_metas are needed as inputs for bbox_head. + def forward_train(self, + x, + img_metas, + gt_bboxes, + gt_labels=None, + gt_bboxes_ignore=None, + proposal_cfg=None, + **kwargs): + """Forward function for training mode. + + Args: + x (list[Tensor]): Features from backbone. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert proposal_cfg is None, '"proposal_cfg" must be None' + outs = self(x, img_metas) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list')) + def get_bboxes(self, + all_cls_scores_list, + all_bbox_preds_list, + img_metas, + rescale=False): + """Transform network outputs for a batch into bbox predictions. + + Args: + all_cls_scores_list (list[Tensor]): Classification outputs + for each feature level. Each is a 4D-tensor with shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds_list (list[Tensor]): Sigmoid regression + outputs for each feature level. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + img_metas (list[dict]): Meta information of each image. + rescale (bool, optional): If True, return boxes in original + image space. Default False. + + Returns: + list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \ + The first item is an (n, 5) tensor, where the first 4 columns \ + are bounding box positions (tl_x, tl_y, br_x, br_y) and the \ + 5-th column is a score between 0 and 1. The second item is a \ + (n,) tensor where each item is the predicted class label of \ + the corresponding box. + """ + # NOTE defaultly only using outputs from the last feature level, + # and only the outputs from the last decoder layer is used. + cls_scores = all_cls_scores_list[-1][-1] + bbox_preds = all_bbox_preds_list[-1][-1] + + result_list = [] + for img_id in range(len(img_metas)): + cls_score = cls_scores[img_id] + bbox_pred = bbox_preds[img_id] + img_shape = img_metas[img_id]['img_shape'] + scale_factor = img_metas[img_id]['scale_factor'] + proposals = self._get_bboxes_single(cls_score, bbox_pred, + img_shape, scale_factor, + rescale) + result_list.append(proposals) + + return result_list + + def _get_bboxes_single(self, + cls_score, + bbox_pred, + img_shape, + scale_factor, + rescale=False): + """Transform outputs from the last decoder layer into bbox predictions + for each image. + + Args: + cls_score (Tensor): Box score logits from the last decoder layer + for each image. Shape [num_query, cls_out_channels]. 
+ bbox_pred (Tensor): Sigmoid outputs from the last decoder layer + for each image, with coordinate format (cx, cy, w, h) and + shape [num_query, 4]. + img_shape (tuple[int]): Shape of input image, (height, width, 3). + scale_factor (ndarray, optional): Scale factor of the image arange + as (w_scale, h_scale, w_scale, h_scale). + rescale (bool, optional): If True, return boxes in original image + space. Default False. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. + + - det_bboxes: Predicted bboxes with shape [num_query, 5], \ + where the first 4 columns are bounding box positions \ + (tl_x, tl_y, br_x, br_y) and the 5-th column are scores \ + between 0 and 1. + - det_labels: Predicted labels of the corresponding box with \ + shape [num_query]. + """ + assert len(cls_score) == len(bbox_pred) + max_per_img = self.test_cfg.get('max_per_img', self.num_query) + # exclude background + if self.loss_cls.use_sigmoid: + cls_score = cls_score.sigmoid() + scores, indexes = cls_score.view(-1).topk(max_per_img) + det_labels = indexes % self.num_things_classes + bbox_index = indexes // self.num_things_classes + bbox_pred = bbox_pred[bbox_index] + else: + scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1) + scores, bbox_index = scores.topk(max_per_img) + bbox_pred = bbox_pred[bbox_index] + det_labels = det_labels[bbox_index] + + det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred) + det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1] + det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0] + det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1]) + det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0]) + if rescale: + det_bboxes /= det_bboxes.new_tensor(scale_factor) + det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1) + + return det_bboxes, det_labels diff --git a/mmcv/models/dense_heads/seg_head_plugin/seg_mask_head.py b/mmcv/models/dense_heads/seg_head_plugin/seg_mask_head.py new file mode 100644 index 0000000..3a3bdbd --- /dev/null +++ b/mmcv/models/dense_heads/seg_head_plugin/seg_mask_head.py @@ -0,0 +1,393 @@ +""" +Copy-paste from torch.nn.Transformer, timm, with modifications: +""" +import copy +from typing import Optional, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor +from functools import partial +from mmcv.models.utils.builder import TRANSFORMER +import math +from mmcv.utils import force_fp32 + +count = 0 + + +class Mlp(nn.Module): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + self.fp16_enabled = False + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + @force_fp32(apply_to=('x', )) + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class SelfAttention(nn.Module): + def __init__(self, + cfg, + dim, + num_heads=2, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.fp16_enabled = False + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + @force_fp32(apply_to=('x', )) + def 
forward(self, x): + B, N, C = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, + 4).contiguous() + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Attention(nn.Module): + def __init__(self, + cfg, + dim, + num_heads=2, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.fp16_enabled = False + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.k = nn.Linear(dim, dim, bias=qkv_bias) + self.v = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.linear_l1 = nn.Sequential( + nn.Linear(self.num_heads, self.num_heads), + nn.ReLU(), + ) + self.linear = nn.Sequential( + nn.Linear(self.num_heads, 1), + nn.ReLU(), + ) + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + @force_fp32(apply_to=('query', 'key', 'value')) + def forward(self, query, key, value, key_padding_mask, hw_lvl): + B, N, C = query.shape + _, L, _ = key.shape + #print('query, key, value', query.shape, value.shape, key.shape) + q = self.q(query).reshape(B, N, + self.num_heads, C // self.num_heads).permute( + 0, 2, 1, + 3).contiguous() #.permute(2, 0, 3, 1, 4) + k = self.k(key).reshape(B, L, + self.num_heads, C // self.num_heads).permute( + 0, 2, 1, + 3).contiguous() #.permute(2, 0, 3, 1, 4) + + v = self.v(value).reshape(B, L, + self.num_heads, C // self.num_heads).permute( + 0, 2, 1, + 3).contiguous() #.permute(2, 0, 3, 1, 4) + + attn = (q @ k.transpose(-2, -1).contiguous()) * self.scale + + attn = attn.permute(0, 2, 3, 1) + + new_feats = self.linear_l1(attn) + mask = self.linear(new_feats) + + attn = attn.permute(0, 3, 1, 2) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).contiguous().reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x, mask + +# AttentionTail is a cheap implementation that can make mask decoder 1 layer deeper. 
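(Editor's sketch, not part of the patch.) The `Attention` module above and the `AttentionTail` defined next share one idea: instead of discarding the per-head attention logits, they mix them with a tiny MLP over the head dimension and read a segmentation-mask logit out of every query/key pair. The toy snippet below reproduces just that head-mixing step with invented shapes; it illustrates the mechanism and is not code from this repository.

import torch
import torch.nn as nn

B, num_heads, N, L = 1, 2, 4, 9                 # batch, heads, queries, flattened key positions
attn_logits = torch.randn(B, num_heads, N, L)   # raw scaled q @ k^T scores, one map per head

head_mixer = nn.Sequential(nn.Linear(num_heads, num_heads), nn.ReLU())   # plays the role of linear_l1
to_mask = nn.Sequential(nn.Linear(num_heads, 1), nn.ReLU())              # plays the role of linear

x = attn_logits.permute(0, 2, 3, 1)             # (B, N, L, num_heads): heads become a feature dim
mask_logits = to_mask(head_mixer(x))            # (B, N, L, 1): one mask value per query/key pair
print(mask_logits.shape)                        # torch.Size([1, 4, 9, 1])

The softmax-normalized logits still drive the ordinary attention output; the mask branch is a cheap read-out on the side, which is why stacking AttentionTail on top deepens only the mask decoder.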
+class AttentionTail(nn.Module): + def __init__(self, + cfg, + dim, + num_heads=2, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.fp16_enabled = False + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.k = nn.Linear(dim, dim, bias=qkv_bias) + + self.linear_l1 = nn.Sequential( + nn.Linear(self.num_heads, self.num_heads), + nn.ReLU(), + ) + + self.linear = nn.Sequential( + nn.Linear(self.num_heads, 1), + nn.ReLU(), + ) + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + @force_fp32(apply_to=('query', 'key')) + def forward(self, query, key, key_padding_mask, hw_lvl=None): + B, N, C = query.shape + _, L, _ = key.shape + #print('query, key, value', query.shape, value.shape, key.shape) + q = self.q(query).reshape(B, N, + self.num_heads, C // self.num_heads).permute( + 0, 2, 1, + 3).contiguous() #.permute(2, 0, 3, 1, 4) + k = self.k(key).reshape(B, L, + self.num_heads, C // self.num_heads).permute( + 0, 2, 1, + 3).contiguous() #.permute(2, 0, 3, 1, 4) + attn = (q @ k.transpose(-2, -1).contiguous()) * self.scale + + attn = attn.permute(0, 2, 3, 1) + + new_feats = self.linear_l1(attn) + mask = self.linear(new_feats) + + return mask + + +class Block(nn.Module): + def __init__(self, + cfg, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + self_attn=False): + super().__init__() + self.fp16_enabled = False + self.head_norm1 = norm_layer(dim) + self.self_attn = self_attn + self.attn = Attention(cfg, + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.head_norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + if self.self_attn: + self.self_attention = SelfAttention(cfg, + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.norm3 = norm_layer(dim) + + @force_fp32(apply_to=('query', 'key', 'value')) + def forward(self, query, key, value, key_padding_mask=None, hw_lvl=None): + if self.self_attn: + query = query + self.drop_path(self.self_attention(query)) + query = self.norm3(query) + x, mask = self.attn(query, key, value, key_padding_mask, hw_lvl=hw_lvl) + query = query + self.drop_path(x) + query = self.head_norm1(query) + + query = query + self.drop_path(self.mlp(query)) + query = self.head_norm2(query) + return query, mask + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-53296self.num_heads956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. 
+ """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + @force_fp32(apply_to=('x', )) + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +@TRANSFORMER.register_module() +class SegMaskHead(nn.Module): + def __init__(self, + cfg=None, + d_model=16, + nhead=2, + num_encoder_layers=6, + num_decoder_layers=1, + dim_feedforward=64, + dropout=0.1, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + self_attn=False): + super().__init__() + + self.fp16_enabled = False + mlp_ratio = 4 + qkv_bias = True + qk_scale = None + drop_rate = 0 + attn_drop_rate = 0 + + norm_layer = None + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = None + act_layer = act_layer or nn.GELU + block = Block(cfg, + dim=d_model, + num_heads=nhead, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=0, + norm_layer=norm_layer, + act_layer=act_layer, + self_attn=self_attn) + self.blocks = _get_clones(block, num_decoder_layers) + self.attnen = AttentionTail(cfg, + d_model, + num_heads=nhead, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop_rate, + proj_drop=0) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + if pos is None: + return tensor + else: + return tensor + pos + #return tensor if pos is None else tensor + pos + @force_fp32(apply_to=('memory', 'mask_memory', 'pos_memory', 'query_embed', + 'mask_query', 'pos_query')) + def forward(self, memory, mask_memory, pos_memory, query_embed, mask_query, + pos_query, hw_lvl): + if mask_memory is not None and isinstance(mask_memory, torch.Tensor): + mask_memory = mask_memory.to(torch.bool) + masks = [] + inter_query = [] + for i, block in enumerate(self.blocks): + query_embed, mask = block(self.with_pos_embed( + query_embed, pos_query), + self.with_pos_embed(memory, pos_memory), + memory, + key_padding_mask=mask_memory, + hw_lvl=hw_lvl) + masks.append(mask) + inter_query.append(query_embed) + #if i == 1: + # return mask, masks, inter_query + attn = self.attnen(self.with_pos_embed(query_embed, pos_query), + self.with_pos_embed(memory, pos_memory), + key_padding_mask=mask_memory, + hw_lvl=hw_lvl) + return attn, masks, inter_query diff --git a/mmcv/models/dense_heads/seg_head_plugin/seg_utils.py b/mmcv/models/dense_heads/seg_head_plugin/seg_utils.py new file mode 100644 index 0000000..dd1e61d --- /dev/null +++ b/mmcv/models/dense_heads/seg_head_plugin/seg_utils.py @@ -0,0 +1,7 @@ + + +def IOU(intputs, targets): + numerator = (intputs * targets).sum(dim=1) + denominator = intputs.sum(dim=1) + targets.sum(dim=1) - numerator + loss = numerator / (denominator + 0.0000000000001) + return loss.cpu(), numerator.cpu(), denominator.cpu() diff 
--git a/mmcv/models/dense_heads/track_head.py b/mmcv/models/dense_heads/track_head.py new file mode 100644 index 0000000..0233b32 --- /dev/null +++ b/mmcv/models/dense_heads/track_head.py @@ -0,0 +1,533 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +# Modified from bevformer (https://github.com/fundamentalvision/BEVFormer) # +#---------------------------------------------------------------------------------# + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.bricks import Linear +from mmcv.models.utils import bias_init_with_prob +from mmcv.utils import TORCH_VERSION, digit_version + +from mmcv.core.utils import (multi_apply, reduce_mean) +from mmcv.models.utils.transformer import inverse_sigmoid +from mmcv.models import HEADS +from mmcv.models.dense_heads import DETRHead +from mmcv.core.bbox.coder import build_bbox_coder +from mmcv.core.bbox.util import normalize_bbox +from mmcv.utils import force_fp32, auto_fp16 + + +@HEADS.register_module() +class BEVFormerTrackHead(DETRHead): + """Head of Detr3D. + Args: + with_box_refine (bool): Whether to refine the reference points + in the decoder. Defaults to False. + as_two_stage (bool) : Whether to generate the proposal from + the outputs of encoder. + transformer (obj:`ConfigDict`): ConfigDict is used for building + the Encoder and Decoder. + bev_h, bev_w (int): spatial shape of BEV queries. + """ + + def __init__(self, + *args, + with_box_refine=False, + as_two_stage=False, + transformer=None, + bbox_coder=None, + num_cls_fcs=2, + code_weights=None, + bev_h=30, + bev_w=30, + past_steps=4, + fut_steps=4, + **kwargs): + + self.bev_h = bev_h + self.bev_w = bev_w + self.fp16_enabled = False + + self.with_box_refine = with_box_refine + + assert as_two_stage is False, 'as_two_stage is not supported yet.' 
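(Editor's note, illustrative only.) The `code_weights` initialised a few lines below are stored as a frozen `nn.Parameter` and later broadcast-multiplied into `bbox_weights` inside `loss_single`, so the last two dimensions of the 10-dimensional box code contribute to the L1 loss with weight 0.2 instead of 1.0. A minimal sketch of that broadcasting, using the default values from this file:

import torch
import torch.nn as nn

code_weights = nn.Parameter(
    torch.tensor([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2]),
    requires_grad=False)                     # frozen: follows the module's device/dtype but never trains

num_query = 4
bbox_weights = torch.ones(num_query, 10)     # 1.0 for positive queries, 0.0 for negatives
weighted = bbox_weights * code_weights       # broadcasts over the last (box-code) dimension
print(weighted[0])                           # the last two entries are scaled down to 0.2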
+ self.as_two_stage = as_two_stage + if self.as_two_stage: + transformer['as_two_stage'] = self.as_two_stage + if 'code_size' in kwargs: + self.code_size = kwargs['code_size'] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self.real_w = self.pc_range[3] - self.pc_range[0] + self.real_h = self.pc_range[4] - self.pc_range[1] + self.num_cls_fcs = num_cls_fcs - 1 + self.past_steps = past_steps + self.fut_steps = fut_steps + super(BEVFormerTrackHead, self).__init__( + *args, transformer=transformer, **kwargs) + self.code_weights = nn.Parameter(torch.tensor( + self.code_weights, requires_grad=False), requires_grad=False) + + def _init_layers(self): + """Initialize classification branch and regression branch of head.""" + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + past_traj_reg_branch = [] + for _ in range(self.num_reg_fcs): + past_traj_reg_branch.append( + Linear(self.embed_dims, self.embed_dims)) + past_traj_reg_branch.append(nn.ReLU()) + past_traj_reg_branch.append( + Linear(self.embed_dims, (self.past_steps + self.fut_steps)*2)) + past_traj_reg_branch = nn.Sequential(*past_traj_reg_branch) + + def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + # last reg_branch is used to generate proposal from + # encode feature map when as_two_stage is True. 
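(Editor's sketch, not part of the patch.) The `num_pred` logic just below hinges on one design choice: with `with_box_refine` every decoder layer gets its own deep copy of the classification/regression branch, otherwise a single module object is shared by all layers. The toy snippet illustrates the difference; the branch architecture and layer count are invented.

import copy
import torch.nn as nn

def _get_clones(module, n):
    # independent parameter copies, one per decoder layer
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])

embed_dims, num_pred = 256, 6
reg_branch = nn.Sequential(nn.Linear(embed_dims, embed_dims), nn.ReLU(),
                           nn.Linear(embed_dims, 10))

refined = _get_clones(reg_branch, num_pred)                      # with_box_refine=True
shared = nn.ModuleList([reg_branch for _ in range(num_pred)])    # with_box_refine=False

print(refined[0][0].weight.data_ptr() == refined[1][0].weight.data_ptr())  # False: separate weights
print(shared[0][0].weight.data_ptr() == shared[1][0].weight.data_ptr())    # True: one weight tensor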
+ num_pred = (self.transformer.decoder.num_layers + 1) if \ + self.as_two_stage else self.transformer.decoder.num_layers + + if self.with_box_refine: + self.cls_branches = _get_clones(fc_cls, num_pred) + self.reg_branches = _get_clones(reg_branch, num_pred) + self.past_traj_reg_branches = _get_clones( + past_traj_reg_branch, num_pred) + else: + self.cls_branches = nn.ModuleList( + [fc_cls for _ in range(num_pred)]) + self.reg_branches = nn.ModuleList( + [reg_branch for _ in range(num_pred)]) + self.past_traj_reg_branches = nn.ModuleList( + [past_traj_reg_branch for _ in range(num_pred)]) + if not self.as_two_stage: + self.bev_embedding = nn.Embedding( + self.bev_h * self.bev_w, self.embed_dims) + + def init_weights(self): + """Initialize weights of the DeformDETR head.""" + self.transformer.init_weights() + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + def get_bev_features(self, mlvl_feats, img_metas, prev_bev=None): + bs, num_cam, _, _, _ = mlvl_feats[0].shape + dtype = mlvl_feats[0].dtype + bev_queries = self.bev_embedding.weight.to(dtype) + + bev_mask = torch.zeros((bs, self.bev_h, self.bev_w), + device=bev_queries.device).to(dtype) + bev_pos = self.positional_encoding(bev_mask).to(dtype) + bev_embed = self.transformer.get_bev_features( + mlvl_feats, + bev_queries, + self.bev_h, + self.bev_w, + grid_length=(self.real_h / self.bev_h, + self.real_w / self.bev_w), + bev_pos=bev_pos, + prev_bev=prev_bev, + img_metas=img_metas, + ) + return bev_embed, bev_pos + + def get_detections( + self, + bev_embed, + object_query_embeds=None, + ref_points=None, + img_metas=None, + ): + assert bev_embed.shape[0] == self.bev_h * self.bev_w + hs, init_reference, inter_references = self.transformer.get_states_and_refs( + bev_embed, + object_query_embeds, + self.bev_h, + self.bev_w, + reference_points=ref_points, + reg_branches=self.reg_branches if self.with_box_refine else None, + cls_branches=self.cls_branches if self.as_two_stage else None, + img_metas=img_metas, + ) + hs = hs.permute(0, 2, 1, 3) + outputs_classes = [] + outputs_coords = [] + outputs_trajs = [] + for lvl in range(hs.shape[0]): + if lvl == 0: + # reference = init_reference + reference = ref_points.sigmoid() + else: + reference = inter_references[lvl - 1] + # ref_size_base = inter_box_sizes[lvl - 1] + reference = inverse_sigmoid(reference) + outputs_class = self.cls_branches[lvl](hs[lvl]) + tmp = self.reg_branches[lvl](hs[lvl]) # xydxdyxdz + outputs_past_traj = self.past_traj_reg_branches[lvl](hs[lvl]).view( + tmp.shape[0], -1, self.past_steps + self.fut_steps, 2) + # TODO: check the shape of reference + assert reference.shape[-1] == 3 + tmp[..., 0:2] += reference[..., 0:2] + tmp[..., 0:2] = tmp[..., 0:2].sigmoid() + tmp[..., 4:5] += reference[..., 2:3] + tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + + last_ref_points = torch.cat( + [tmp[..., 0:2], tmp[..., 4:5]], dim=-1, + ) + + tmp[..., 0:1] = (tmp[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + tmp[..., 1:2] = (tmp[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + tmp[..., 4:5] = (tmp[..., 4:5] * (self.pc_range[5] - + self.pc_range[2]) + self.pc_range[2]) + + # tmp[..., 2:4] = tmp[..., 2:4] + ref_size_basse[..., 0:2] + # tmp[..., 5:6] = tmp[..., 5:6] + ref_size_basse[..., 2:3] + + # TODO: check if using sigmoid + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + 
outputs_trajs.append(outputs_past_traj) + outputs_classes = torch.stack(outputs_classes) + outputs_coords = torch.stack(outputs_coords) + outputs_trajs = torch.stack(outputs_trajs) + last_ref_points = inverse_sigmoid(last_ref_points) + outs = { + 'all_cls_scores': outputs_classes, + 'all_bbox_preds': outputs_coords, + 'all_past_traj_preds': outputs_trajs, + 'enc_cls_scores': None, + 'enc_bbox_preds': None, + 'last_ref_points': last_ref_points, + 'query_feats': hs, + } + return outs + + def _get_target_single(self, + cls_score, + bbox_pred, + gt_labels, + gt_bboxes, + gt_bboxes_ignore=None): + """"Compute regression and classification targets for one image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_score (Tensor): Box score logits from a single decoder layer + for one image. Shape [num_query, cls_out_channels]. + bbox_pred (Tensor): Sigmoid outputs from a single decoder layer + for one image, with normalized coordinate (cx, cy, w, h) and + shape [num_query, 4]. + gt_bboxes (Tensor): Ground truth bboxes for one image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (Tensor): Ground truth class indices for one image + with shape (num_gts, ). + gt_bboxes_ignore (Tensor, optional): Bounding boxes + which can be ignored. Default None. + Returns: + tuple[Tensor]: a tuple containing the following for one image. + - labels (Tensor): Labels of each image. + - label_weights (Tensor]): Label weights of each image. + - bbox_targets (Tensor): BBox targets of each image. + - bbox_weights (Tensor): BBox weights of each image. + - pos_inds (Tensor): Sampled positive indices for each image. + - neg_inds (Tensor): Sampled negative indices for each image. + """ + + num_bboxes = bbox_pred.size(0) + # assigner and sampler + gt_c = gt_bboxes.shape[-1] + + assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, + gt_labels, gt_bboxes_ignore) + + sampling_result = self.sampler.sample(assign_result, bbox_pred, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + + # label targets + labels = gt_bboxes.new_full((num_bboxes,), + self.num_classes, + dtype=torch.long) + labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] + label_weights = gt_bboxes.new_ones(num_bboxes) + + # bbox targets + bbox_targets = torch.zeros_like(bbox_pred)[..., :gt_c] + bbox_weights = torch.zeros_like(bbox_pred) + bbox_weights[pos_inds] = 1.0 + + # DETR + bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes + return (labels, label_weights, bbox_targets, bbox_weights, + pos_inds, neg_inds) + + def get_targets(self, + cls_scores_list, + bbox_preds_list, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Compute regression and classification targets for a batch image. + Outputs from a single decoder layer of a single feature level are used. + Args: + cls_scores_list (list[Tensor]): Box score logits from a single + decoder layer for each image with shape [num_query, + cls_out_channels]. + bbox_preds_list (list[Tensor]): Sigmoid outputs from a single + decoder layer for each image, with normalized coordinate + (cx, cy, w, h) and shape [num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. 
Default None. + Returns: + tuple: a tuple containing the following targets. + - labels_list (list[Tensor]): Labels for all images. + - label_weights_list (list[Tensor]): Label weights for all \ + images. + - bbox_targets_list (list[Tensor]): BBox targets for all \ + images. + - bbox_weights_list (list[Tensor]): BBox weights for all \ + images. + - num_total_pos (int): Number of positive samples in all \ + images. + - num_total_neg (int): Number of negative samples in all \ + images. + """ + assert gt_bboxes_ignore_list is None, \ + 'Only supports for gt_bboxes_ignore setting to None.' + num_imgs = len(cls_scores_list) + gt_bboxes_ignore_list = [ + gt_bboxes_ignore_list for _ in range(num_imgs) + ] + + (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( + self._get_target_single, cls_scores_list, bbox_preds_list, + gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) + num_total_pos = sum((inds.numel() for inds in pos_inds_list)) + num_total_neg = sum((inds.numel() for inds in neg_inds_list)) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, num_total_pos, num_total_neg) + + def loss_single(self, + cls_scores, + bbox_preds, + gt_bboxes_list, + gt_labels_list, + gt_bboxes_ignore_list=None): + """"Loss function for outputs from a single decoder layer of a single + feature level. + Args: + cls_scores (Tensor): Box score logits from a single decoder layer + for all images. Shape [bs, num_query, cls_out_channels]. + bbox_preds (Tensor): Sigmoid outputs from a single decoder layer + for all images, with normalized coordinate (cx, cy, w, h) and + shape [bs, num_query, 4]. + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + gt_bboxes_ignore_list (list[Tensor], optional): Bounding + boxes which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components for outputs from + a single decoder layer. 
+ """ + num_imgs = cls_scores.size(0) + cls_scores_list = [cls_scores[i] for i in range(num_imgs)] + bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] + cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, + gt_bboxes_list, gt_labels_list, + gt_bboxes_ignore_list) + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_pos, num_total_neg) = cls_reg_targets + labels = torch.cat(labels_list, 0) + label_weights = torch.cat(label_weights_list, 0) + bbox_targets = torch.cat(bbox_targets_list, 0) + bbox_weights = torch.cat(bbox_weights_list, 0) + + # classification loss + cls_scores = cls_scores.reshape(-1, self.cls_out_channels) + # construct weighted avg_factor to match with the official DETR repo + cls_avg_factor = num_total_pos * 1.0 + \ + num_total_neg * self.bg_cls_weight + if self.sync_cls_avg_factor: + cls_avg_factor = reduce_mean( + cls_scores.new_tensor([cls_avg_factor])) + + cls_avg_factor = max(cls_avg_factor, 1) + loss_cls = self.loss_cls( + cls_scores, labels, label_weights, avg_factor=cls_avg_factor) + + # Compute the average number of gt boxes accross all gpus, for + # normalization purposes + num_total_pos = loss_cls.new_tensor([num_total_pos]) + num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() + + # regression L1 loss + bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) + normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) + isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) + bbox_weights = bbox_weights * self.code_weights + + loss_bbox = self.loss_bbox( + bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, + :10], bbox_weights[isnotnan, :10], + avg_factor=num_total_pos) + loss_cls = torch.nan_to_num(loss_cls) + loss_bbox = torch.nan_to_num(loss_bbox) + return loss_cls, loss_bbox + + @force_fp32(apply_to=('preds_dicts')) + def loss(self, + gt_bboxes_list, + gt_labels_list, + preds_dicts, + gt_bboxes_ignore=None, + img_metas=None): + """"Loss function. + Args: + + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, \ + f'{self.__class__.__name__} only supports ' \ + f'for gt_bboxes_ignore setting to None.' 
+ + all_cls_scores = preds_dicts['all_cls_scores'] + all_bbox_preds = preds_dicts['all_bbox_preds'] + enc_cls_scores = preds_dicts['enc_cls_scores'] + enc_bbox_preds = preds_dicts['enc_bbox_preds'] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + + gt_bboxes_list = [torch.cat( + (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), + dim=1).to(device) for gt_bboxes in gt_bboxes_list] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [ + gt_bboxes_ignore for _ in range(num_dec_layers) + ] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, all_cls_scores, all_bbox_preds, + all_gt_bboxes_list, all_gt_labels_list, + all_gt_bboxes_ignore_list) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [ + torch.zeros_like(gt_labels_list[i]) + for i in range(len(all_gt_labels_list)) + ] + enc_loss_cls, enc_losses_bbox = \ + self.loss_single(enc_cls_scores, enc_bbox_preds, + gt_bboxes_list, binary_labels_list, gt_bboxes_ignore) + loss_dict['enc_loss_cls'] = enc_loss_cls + loss_dict['enc_loss_bbox'] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict['loss_cls'] = losses_cls[-1] + loss_dict['loss_bbox'] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], + losses_bbox[:-1]): + loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i + loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i + num_dec_layer += 1 + return loss_dict + + @force_fp32(apply_to=('preds_dicts')) + def get_bboxes(self, preds_dicts, img_metas, rescale=False): + """Generate bboxes from bbox head predictions. + Args: + preds_dicts (tuple[list[dict]]): Prediction results. + img_metas (list[dict]): Point cloud and image's meta info. + Returns: + list[dict]: Decoded bbox, scores and labels after nms. 
+ """ + + preds_dicts = self.bbox_coder.decode(preds_dicts) + + num_samples = len(preds_dicts) + ret_list = [] + for i in range(num_samples): + preds = preds_dicts[i] + bboxes = preds['bboxes'] + + bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + + code_size = bboxes.shape[-1] + bboxes = img_metas[i]['box_type_3d'](bboxes, code_size) + scores = preds['scores'] + labels = preds['labels'] + bbox_index = preds['bbox_index'] + mask = preds['mask'] + + ret_list.append([bboxes, scores, labels, bbox_index, mask]) + + return ret_list diff --git a/mmcv/models/dense_heads/track_head_plugin/__init__.py b/mmcv/models/dense_heads/track_head_plugin/__init__.py new file mode 100644 index 0000000..f7933ab --- /dev/null +++ b/mmcv/models/dense_heads/track_head_plugin/__init__.py @@ -0,0 +1,3 @@ +from .modules import MemoryBank, QueryInteractionModule +from .track_instance import Instances +from .tracker import RuntimeTrackerBase \ No newline at end of file diff --git a/mmcv/models/dense_heads/track_head_plugin/modules.py b/mmcv/models/dense_heads/track_head_plugin/modules.py new file mode 100644 index 0000000..db80e6b --- /dev/null +++ b/mmcv/models/dense_heads/track_head_plugin/modules.py @@ -0,0 +1,254 @@ +import torch +import torch.nn.functional as F +from torch import nn +from .track_instance import Instances + +# MemoryBank +class MemoryBank(nn.Module): + + def __init__(self, + args, + dim_in, hidden_dim, dim_out, + ): + super().__init__() + self._build_layers(args, dim_in, hidden_dim, dim_out) + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def _build_layers(self, args, dim_in, hidden_dim, dim_out): + self.save_thresh = args['memory_bank_score_thresh'] + self.save_period = 3 + self.max_his_length = args['memory_bank_len'] + + self.save_proj = nn.Linear(dim_in, dim_in) + + self.temporal_attn = nn.MultiheadAttention(dim_in, 8, dropout=0) + self.temporal_fc1 = nn.Linear(dim_in, hidden_dim) + self.temporal_fc2 = nn.Linear(hidden_dim, dim_in) + self.temporal_norm1 = nn.LayerNorm(dim_in) + self.temporal_norm2 = nn.LayerNorm(dim_in) + + def update(self, track_instances): + embed = track_instances.output_embedding[:, None] #( N, 1, 256) + scores = track_instances.scores + mem_padding_mask = track_instances.mem_padding_mask + device = embed.device + + save_period = track_instances.save_period + if self.training: + saved_idxes = scores > 0 + else: + saved_idxes = (save_period == 0) & (scores > self.save_thresh) + # saved_idxes = (save_period == 0) + save_period[save_period > 0] -= 1 + save_period[saved_idxes] = self.save_period + + saved_embed = embed[saved_idxes] + if len(saved_embed) > 0: + prev_embed = track_instances.mem_bank[saved_idxes] + save_embed = self.save_proj(saved_embed) + mem_padding_mask[saved_idxes] = torch.cat([mem_padding_mask[saved_idxes, 1:], torch.zeros((len(saved_embed), 1), dtype=torch.bool, device=device)], dim=1) + track_instances.mem_bank = track_instances.mem_bank.clone() + track_instances.mem_bank[saved_idxes] = torch.cat([prev_embed[:, 1:], save_embed], dim=1) + + def _forward_temporal_attn(self, track_instances): + if len(track_instances) == 0: + return track_instances + + key_padding_mask = track_instances.mem_padding_mask # [n_, memory_bank_len] + + valid_idxes = key_padding_mask[:, -1] == 0 + embed = track_instances.output_embedding[valid_idxes] # (n, 256) + + if len(embed) > 0: + prev_embed = track_instances.mem_bank[valid_idxes] + key_padding_mask = key_padding_mask[valid_idxes] + embed2 = self.temporal_attn( + embed[None], # (num_track, 
dim) to (1, num_track, dim) + prev_embed.transpose(0, 1), # (num_track, mem_len, dim) to (mem_len, num_track, dim) + prev_embed.transpose(0, 1), + key_padding_mask=key_padding_mask, + )[0][0] + + embed = self.temporal_norm1(embed + embed2) + embed2 = self.temporal_fc2(F.relu(self.temporal_fc1(embed))) + embed = self.temporal_norm2(embed + embed2) + track_instances.output_embedding = track_instances.output_embedding.clone() + track_instances.output_embedding[valid_idxes] = embed + + return track_instances + + def forward_temporal_attn(self, track_instances): + return self._forward_temporal_attn(track_instances) + + def forward(self, track_instances: Instances, update_bank=True) -> Instances: + track_instances = self._forward_temporal_attn(track_instances) + if update_bank: + self.update(track_instances) + return track_instances + + +# QIM +class QueryInteractionBase(nn.Module): + + def __init__(self, args, dim_in, hidden_dim, dim_out): + super().__init__() + self.args = args + self._build_layers(args, dim_in, hidden_dim, dim_out) + self._reset_parameters() + + def _build_layers(self, args, dim_in, hidden_dim, dim_out): + raise NotImplementedError() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def _select_active_tracks(self, data: dict) -> Instances: + raise NotImplementedError() + + def _update_track_embedding(self, track_instances): + raise NotImplementedError() + +class QueryInteractionModule(QueryInteractionBase): + + def __init__(self, args, dim_in, hidden_dim, dim_out): + super().__init__(args, dim_in, hidden_dim, dim_out) + self.random_drop = args["random_drop"] + self.fp_ratio = args["fp_ratio"] + self.update_query_pos = args["update_query_pos"] + + def _build_layers(self, args, dim_in, hidden_dim, dim_out): + dropout = args["merger_dropout"] + + self.self_attn = nn.MultiheadAttention(dim_in, 8, dropout) + self.linear1 = nn.Linear(dim_in, hidden_dim) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(hidden_dim, dim_in) + + if args["update_query_pos"]: + self.linear_pos1 = nn.Linear(dim_in, hidden_dim) + self.linear_pos2 = nn.Linear(hidden_dim, dim_in) + self.dropout_pos1 = nn.Dropout(dropout) + self.dropout_pos2 = nn.Dropout(dropout) + self.norm_pos = nn.LayerNorm(dim_in) + + self.linear_feat1 = nn.Linear(dim_in, hidden_dim) + self.linear_feat2 = nn.Linear(hidden_dim, dim_in) + self.dropout_feat1 = nn.Dropout(dropout) + self.dropout_feat2 = nn.Dropout(dropout) + self.norm_feat = nn.LayerNorm(dim_in) + + self.norm1 = nn.LayerNorm(dim_in) + self.norm2 = nn.LayerNorm(dim_in) + + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.activation = F.relu + + def _update_track_embedding(self, track_instances: Instances) -> Instances: + if len(track_instances) == 0: + return track_instances + dim = track_instances.query.shape[1] + out_embed = track_instances.output_embedding + query_pos = track_instances.query[:, :dim // 2] + query_feat = track_instances.query[:, dim // 2:] + q = k = query_pos + out_embed + + # attention + tgt = out_embed + tgt2 = self.self_attn(q[:, None], k[:, None], value=tgt[:, None])[0][:, + 0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.update_query_pos: + # ffn: linear_pos2 + query_pos2 = self.linear_pos2( + self.dropout_pos1(self.activation(self.linear_pos1(tgt)))) + query_pos = query_pos + 
self.dropout_pos2(query_pos2) + query_pos = self.norm_pos(query_pos) + track_instances.query[:, :dim // 2] = query_pos + + query_feat2 = self.linear_feat2( + self.dropout_feat1(self.activation(self.linear_feat1(tgt)))) + query_feat = query_feat + self.dropout_feat2(query_feat2) + query_feat = self.norm_feat(query_feat) + track_instances.query[:, dim // 2:] = query_feat + # track_instances.ref_pts = inverse_sigmoid(track_instances.pred_boxes[:, :2].detach().clone()) + # update ref_pts using track_instances.pred_boxes + return track_instances + + def _random_drop_tracks(self, track_instances: Instances) -> Instances: + drop_probability = self.random_drop + if drop_probability > 0 and len(track_instances) > 0: + keep_idxes = torch.rand_like(track_instances.scores) > drop_probability + track_instances = track_instances[keep_idxes] + return track_instances + + def _add_fp_tracks(self, track_instances: Instances, + active_track_instances: Instances) -> Instances: + """ + self.fp_ratio is used to control num(add_fp) / num(active) + """ + inactive_instances = track_instances[track_instances.obj_idxes < 0] + + # add fp for each active track in a specific probability. + fp_prob = torch.ones_like( + active_track_instances.scores) * self.fp_ratio + selected_active_track_instances = active_track_instances[ + torch.bernoulli(fp_prob).bool()] + num_fp = len(selected_active_track_instances) + + if len(inactive_instances) > 0 and num_fp > 0: + if num_fp >= len(inactive_instances): + fp_track_instances = inactive_instances + else: + # randomly select num_fp from inactive_instances + # fp_indexes = np.random.permutation(len(inactive_instances)) + # fp_indexes = fp_indexes[:num_fp] + # fp_track_instances = inactive_instances[fp_indexes] + + # v2: select the fps with top scores rather than random selection + fp_indexes = torch.argsort(inactive_instances.scores)[-num_fp:] + fp_track_instances = inactive_instances[fp_indexes] + + merged_track_instances = Instances.cat( + [active_track_instances, fp_track_instances]) + return merged_track_instances + + return active_track_instances + + def _select_active_tracks(self, data: dict) -> Instances: + track_instances: Instances = data["track_instances"] + if self.training: + active_idxes = (track_instances.obj_idxes >= + 0) & (track_instances.iou > 0.5) + active_track_instances = track_instances[active_idxes] + # set -2 instead of -1 to ensure that these tracks will not be selected in matching. 
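(Editor's sketch, illustrative only.) The two training-time augmentations applied next, random track dropping and false-positive injection, both reduce to Bernoulli sampling: every active track is dropped with probability `random_drop`, and each survivor spawns a hard-negative candidate with probability `fp_ratio`. A toy version with made-up scores:

import torch

torch.manual_seed(0)
scores = torch.rand(8)                                # hypothetical per-track confidence scores
random_drop, fp_ratio = 0.1, 0.3

keep = torch.rand_like(scores) > random_drop          # drop each active track with prob. random_drop
active_scores = scores[keep]

fp_prob = torch.full_like(active_scores, fp_ratio)    # one Bernoulli trial per surviving track
inject_fp = torch.bernoulli(fp_prob).bool()           # which tracks get a false-positive companion
print(int(keep.sum()), int(inject_fp.sum()))          # tracks kept / false positives injected

In the module itself the injected false positives are then drawn from the highest-scoring inactive tracks rather than uniformly at random (the "v2" branch above).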
+ active_track_instances = self._random_drop_tracks( + active_track_instances) + if self.fp_ratio > 0: + active_track_instances = self._add_fp_tracks( + track_instances, active_track_instances) + else: + active_track_instances = track_instances[ + track_instances.obj_idxes >= 0] + + return active_track_instances + + def forward(self, data) -> Instances: + active_track_instances = self._select_active_tracks(data) + active_track_instances = self._update_track_embedding( + active_track_instances) + init_track_instances: Instances = data["init_track_instances"] + merged_track_instances = Instances.cat( + [init_track_instances, active_track_instances]) + return merged_track_instances diff --git a/mmcv/models/dense_heads/track_head_plugin/track_instance.py b/mmcv/models/dense_heads/track_head_plugin/track_instance.py new file mode 100644 index 0000000..bfc7864 --- /dev/null +++ b/mmcv/models/dense_heads/track_head_plugin/track_instance.py @@ -0,0 +1,198 @@ +import itertools +from typing import Any, Dict, List, Tuple, Union +import torch + + +class Instances: + """ + This class represents a list of instances in an image. + It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields". + All fields must have the same ``__len__`` which is the number of instances. + All other (non-field) attributes of this class are considered private: + they must start with '_' and are not modifiable by a user. + Some basic usage: + 1. Set/get/check a field: + .. code-block:: python + instances.gt_boxes = Boxes(...) + print(instances.pred_masks) # a tensor of shape (N, H, W) + print('gt_masks' in instances) + 2. ``len(instances)`` returns the number of instances + 3. Indexing: ``instances[indices]`` will apply the indexing on all the fields + and returns a new :class:`Instances`. + Typically, ``indices`` is a integer vector of indices, + or a binary mask of length ``num_instances`` + .. code-block:: python + category_3_detections = instances[instances.pred_classes == 3] + confident_detections = instances[instances.scores > 0.9] + """ + + def __init__(self, image_size: Tuple[int, int], **kwargs: Any): + """ + Args: + image_size (height, width): the spatial size of the image. + kwargs: fields to add to this `Instances`. + """ + self._image_size = image_size + self._fields: Dict[str, Any] = {} + for k, v in kwargs.items(): + self.set(k, v) + + @property + def image_size(self) -> Tuple[int, int]: + """ + Returns: + tuple: height, width + """ + return self._image_size + + def __setattr__(self, name: str, val: Any) -> None: + if name.startswith("_"): + super().__setattr__(name, val) + else: + self.set(name, val) + + def __getattr__(self, name: str) -> Any: + if name == "_fields" or name not in self._fields: + raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) + return self._fields[name] + + def set(self, name: str, value: Any) -> None: + """ + Set the field named `name` to `value`. + The length of `value` must be the number of instances, + and must agree with other existing fields in this object. + """ + data_len = len(value) + if len(self._fields): + assert ( + len(self) == data_len + ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) + self._fields[name] = value + + def has(self, name: str) -> bool: + """ + Returns: + bool: whether the field called `name` exists. + """ + return name in self._fields + + def remove(self, name: str) -> None: + """ + Remove the field called `name`. 
+ """ + del self._fields[name] + + def get(self, name: str) -> Any: + """ + Returns the field called `name`. + """ + return self._fields[name] + + def get_fields(self) -> Dict[str, Any]: + """ + Returns: + dict: a dict which maps names (str) to data of the fields + Modifying the returned dict will modify this instance. + """ + return self._fields + + # Tensor-like methods + def to(self, *args: Any, **kwargs: Any) -> "Instances": + """ + Returns: + Instances: all fields are called with a `to(device)`, if the field has this method. + """ + ret = Instances(self._image_size) + for k, v in self._fields.items(): + if hasattr(v, "to"): + v = v.to(*args, **kwargs) + ret.set(k, v) + return ret + + def numpy(self): + ret = Instances(self._image_size) + for k, v in self._fields.items(): + if hasattr(v, "numpy"): + v = v.numpy() + ret.set(k, v) + return ret + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances": + """ + Args: + item: an index-like object and will be used to index all the fields. + Returns: + If `item` is a string, return the data in the corresponding field. + Otherwise, returns an `Instances` where all fields are indexed by `item`. + """ + if type(item) == int: + if item >= len(self) or item < -len(self): + raise IndexError("Instances index out of range!") + else: + item = slice(item, None, len(self)) + + ret = Instances(self._image_size) + for k, v in self._fields.items(): + # print(k, type(item), 'getitem', item.type(), item.dtype) + # if index by torch.BoolTensor + if k == 'kalman_models' and isinstance(item, torch.Tensor): + # print(item.shape, 'in get item') + ret_list = [] + for i, if_true in enumerate(item): + if if_true: + ret_list.append(self.kalman_models[i]) + ret.set(k, ret_list) + + else: + ret.set(k, v[item]) + return ret + + def __len__(self) -> int: + for v in self._fields.values(): + # use __len__ because len() has to be int and is not friendly to tracing + return v.__len__() + raise NotImplementedError("Empty Instances does not support __len__!") + + def __iter__(self): + raise NotImplementedError("`Instances` object is not iterable!") + + @staticmethod + def cat(instance_lists: List["Instances"]) -> "Instances": + """ + Args: + instance_lists (list[Instances]) + Returns: + Instances + """ + assert all(isinstance(i, Instances) for i in instance_lists) + assert len(instance_lists) > 0 + if len(instance_lists) == 1: + return instance_lists[0] + + image_size = instance_lists[0].image_size + for i in instance_lists[1:]: + assert i.image_size == image_size + ret = Instances(image_size) + for k in instance_lists[0]._fields.keys(): + values = [i.get(k) for i in instance_lists] + v0 = values[0] + if isinstance(v0, torch.Tensor): + values = torch.cat(values, dim=0) + elif isinstance(v0, list): + values = list(itertools.chain(*values)) + elif hasattr(type(v0), "cat"): + values = type(v0).cat(values) + else: + raise ValueError("Unsupported type {} for concatenation".format(type(v0))) + ret.set(k, values) + return ret + + def __str__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self)) + s += "image_height={}, ".format(self._image_size[0]) + s += "image_width={}, ".format(self._image_size[1]) + s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items()))) + return s + + __repr__ = __str__ \ No newline at end of file diff --git a/mmcv/models/dense_heads/track_head_plugin/tracker.py b/mmcv/models/dense_heads/track_head_plugin/tracker.py new file mode 100644 index 0000000..1355436 --- 
/dev/null +++ b/mmcv/models/dense_heads/track_head_plugin/tracker.py @@ -0,0 +1,42 @@ +from .track_instance import Instances +from mmcv.core.bbox.iou_calculators.iou3d_calculator import ( + bbox_overlaps_nearest_3d as iou_3d, ) +from mmcv.core.bbox.util import denormalize_bbox + +class RuntimeTrackerBase(object): + def __init__(self, score_thresh=0.5, filter_score_thresh=0.4, miss_tolerance=5): + self.score_thresh = score_thresh + self.filter_score_thresh = filter_score_thresh + self.miss_tolerance = miss_tolerance + self.max_obj_id = 0 + + def clear(self): + self.max_obj_id = 0 + + def update(self, track_instances: Instances, iou_thre=None): + track_instances.disappear_time[track_instances.scores >= self.score_thresh] = 0 + for i in range(len(track_instances)): + if ( + track_instances.obj_idxes[i] == -1 + and track_instances.scores[i] >= self.score_thresh + ): + if iou_thre is not None and track_instances.pred_boxes[track_instances.obj_idxes>=0].shape[0]!=0: + iou3ds = iou_3d(denormalize_bbox(track_instances.pred_boxes[i].unsqueeze(0), None)[...,:7], denormalize_bbox(track_instances.pred_boxes[track_instances.obj_idxes>=0], None)[...,:7]) + if iou3ds.max()>iou_thre: + continue + # new track + # print("track {} has score {}, assign obj_id {}".format(i, track_instances.scores[i], self.max_obj_id)) + track_instances.obj_idxes[i] = self.max_obj_id + self.max_obj_id += 1 + elif ( + track_instances.obj_idxes[i] >= 0 + and track_instances.scores[i] < self.filter_score_thresh + ): + # sleep time ++ + track_instances.disappear_time[i] += 1 + if track_instances.disappear_time[i] >= self.miss_tolerance: + # mark deaded tracklets: Set the obj_id to -1. + # TODO: remove it by following functions + # Then this track will be removed by TrackEmbeddingLayer. + track_instances.obj_idxes[i] = -1 + \ No newline at end of file diff --git a/mmcv/models/dense_heads/train_mixins.py b/mmcv/models/dense_heads/train_mixins.py new file mode 100644 index 0000000..3d387f1 --- /dev/null +++ b/mmcv/models/dense_heads/train_mixins.py @@ -0,0 +1,347 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmcv.core import limit_period +from mmcv.core import images_to_levels, multi_apply + + +class AnchorTrainMixin(object): + """Mixin class for target assigning of dense heads.""" + + def anchor_target_3d(self, + anchor_list, + gt_bboxes_list, + input_metas, + gt_bboxes_ignore_list=None, + gt_labels_list=None, + label_channels=1, + num_classes=1, + sampling=True): + """Compute regression and classification targets for anchors. + + Args: + anchor_list (list[list]): Multi level anchors of each image. + gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth + bboxes of each image. + input_metas (list[dict]): Meta info of each image. + gt_bboxes_ignore_list (None | list): Ignore list of gt bboxes. + gt_labels_list (list[torch.Tensor]): Gt labels of batches. + label_channels (int): The channel of labels. + num_classes (int): The number of classes. + sampling (bool): Whether to sample anchors. + + Returns: + tuple (list, list, list, list, list, list, int, int): + Anchor targets, including labels, label weights, + bbox targets, bbox weights, direction targets, + direction weights, number of postive anchors and + number of negative anchors. 
+ """ + num_imgs = len(input_metas) + assert len(anchor_list) == num_imgs + + if isinstance(anchor_list[0][0], list): + # sizes of anchors are different + # anchor number of a single level + num_level_anchors = [ + sum([anchor.size(0) for anchor in anchors]) + for anchors in anchor_list[0] + ] + for i in range(num_imgs): + anchor_list[i] = anchor_list[i][0] + else: + # anchor number of multi levels + num_level_anchors = [ + anchors.view(-1, self.box_code_size).size(0) + for anchors in anchor_list[0] + ] + # concat all level anchors and flags to a single tensor + for i in range(num_imgs): + anchor_list[i] = torch.cat(anchor_list[i]) + + # compute targets for each image + if gt_bboxes_ignore_list is None: + gt_bboxes_ignore_list = [None for _ in range(num_imgs)] + if gt_labels_list is None: + gt_labels_list = [None for _ in range(num_imgs)] + + (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, + all_dir_targets, all_dir_weights, pos_inds_list, + neg_inds_list) = multi_apply( + self.anchor_target_3d_single, + anchor_list, + gt_bboxes_list, + gt_bboxes_ignore_list, + gt_labels_list, + input_metas, + label_channels=label_channels, + num_classes=num_classes, + sampling=sampling) + + # no valid anchors + if any([labels is None for labels in all_labels]): + return None + # sampled anchors of all images + num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) + num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) + # split targets to a list w.r.t. multiple levels + labels_list = images_to_levels(all_labels, num_level_anchors) + label_weights_list = images_to_levels(all_label_weights, + num_level_anchors) + bbox_targets_list = images_to_levels(all_bbox_targets, + num_level_anchors) + bbox_weights_list = images_to_levels(all_bbox_weights, + num_level_anchors) + dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors) + dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors) + return (labels_list, label_weights_list, bbox_targets_list, + bbox_weights_list, dir_targets_list, dir_weights_list, + num_total_pos, num_total_neg) + + def anchor_target_3d_single(self, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + label_channels=1, + num_classes=1, + sampling=True): + """Compute targets of anchors in single batch. + + Args: + anchors (torch.Tensor): Concatenated multi-level anchor. + gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. + gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. + gt_labels (torch.Tensor): Gt class labels. + input_meta (dict): Meta info of each image. + label_channels (int): The channel of labels. + num_classes (int): The number of classes. + sampling (bool): Whether to sample anchors. + + Returns: + tuple[torch.Tensor]: Anchor targets. 
+ """ + if isinstance(self.bbox_assigner, + list) and (not isinstance(anchors, list)): + feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2) + rot_angles = anchors.size(-2) + assert len(self.bbox_assigner) == anchors.size(-3) + (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] + current_anchor_num = 0 + for i, assigner in enumerate(self.bbox_assigner): + current_anchors = anchors[..., i, :, :].reshape( + -1, self.box_code_size) + current_anchor_num += current_anchors.size(0) + if self.assign_per_class: + gt_per_cls = (gt_labels == i) + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes[gt_per_cls, :], + gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, + num_classes, sampling) + else: + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, num_classes, sampling) + + (labels, label_weights, bbox_targets, bbox_weights, + dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets + total_labels.append(labels.reshape(feat_size, 1, rot_angles)) + total_label_weights.append( + label_weights.reshape(feat_size, 1, rot_angles)) + total_bbox_targets.append( + bbox_targets.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_bbox_weights.append( + bbox_weights.reshape(feat_size, 1, rot_angles, + anchors.size(-1))) + total_dir_targets.append( + dir_targets.reshape(feat_size, 1, rot_angles)) + total_dir_weights.append( + dir_weights.reshape(feat_size, 1, rot_angles)) + total_pos_inds.append(pos_inds) + total_neg_inds.append(neg_inds) + + total_labels = torch.cat(total_labels, dim=-2).reshape(-1) + total_label_weights = torch.cat( + total_label_weights, dim=-2).reshape(-1) + total_bbox_targets = torch.cat( + total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1)) + total_bbox_weights = torch.cat( + total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1)) + total_dir_targets = torch.cat( + total_dir_targets, dim=-2).reshape(-1) + total_dir_weights = torch.cat( + total_dir_weights, dim=-2).reshape(-1) + total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1) + total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1) + return (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) + elif isinstance(self.bbox_assigner, list) and isinstance( + anchors, list): + # class-aware anchors with different feature map sizes + assert len(self.bbox_assigner) == len(anchors), \ + 'The number of bbox assigners and anchors should be the same.' 
+ (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] + current_anchor_num = 0 + for i, assigner in enumerate(self.bbox_assigner): + current_anchors = anchors[i] + current_anchor_num += current_anchors.size(0) + if self.assign_per_class: + gt_per_cls = (gt_labels == i) + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes[gt_per_cls, :], + gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, + num_classes, sampling) + else: + anchor_targets = self.anchor_target_single_assigner( + assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, + gt_labels, input_meta, num_classes, sampling) + + (labels, label_weights, bbox_targets, bbox_weights, + dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets + total_labels.append(labels) + total_label_weights.append(label_weights) + total_bbox_targets.append( + bbox_targets.reshape(-1, anchors[i].size(-1))) + total_bbox_weights.append( + bbox_weights.reshape(-1, anchors[i].size(-1))) + total_dir_targets.append(dir_targets) + total_dir_weights.append(dir_weights) + total_pos_inds.append(pos_inds) + total_neg_inds.append(neg_inds) + + total_labels = torch.cat(total_labels, dim=0) + total_label_weights = torch.cat(total_label_weights, dim=0) + total_bbox_targets = torch.cat(total_bbox_targets, dim=0) + total_bbox_weights = torch.cat(total_bbox_weights, dim=0) + total_dir_targets = torch.cat(total_dir_targets, dim=0) + total_dir_weights = torch.cat(total_dir_weights, dim=0) + total_pos_inds = torch.cat(total_pos_inds, dim=0) + total_neg_inds = torch.cat(total_neg_inds, dim=0) + return (total_labels, total_label_weights, total_bbox_targets, + total_bbox_weights, total_dir_targets, total_dir_weights, + total_pos_inds, total_neg_inds) + else: + return self.anchor_target_single_assigner(self.bbox_assigner, + anchors, gt_bboxes, + gt_bboxes_ignore, + gt_labels, input_meta, + num_classes, sampling) + + def anchor_target_single_assigner(self, + bbox_assigner, + anchors, + gt_bboxes, + gt_bboxes_ignore, + gt_labels, + input_meta, + num_classes=1, + sampling=True): + """Assign anchors and encode positive anchors. + + Args: + bbox_assigner (BaseAssigner): assign positive and negative boxes. + anchors (torch.Tensor): Concatenated multi-level anchor. + gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. + gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. + gt_labels (torch.Tensor): Gt class labels. + input_meta (dict): Meta info of each image. + num_classes (int): The number of classes. + sampling (bool): Whether to sample anchors. + + Returns: + tuple[torch.Tensor]: Anchor targets. 
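            Example (illustrative; what the "no ground truth" branch below yields
            for 3 anchors):

                >>> import torch
                >>> flags = torch.zeros(3, dtype=torch.bool)
                >>> torch.nonzero(flags > 0, as_tuple=False).squeeze(-1).unique()    # pos_inds
                tensor([], dtype=torch.int64)
                >>> torch.nonzero(flags == 0, as_tuple=False).squeeze(-1).unique()   # neg_inds
                tensor([0, 1, 2])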
+ """ + anchors = anchors.reshape(-1, anchors.size(-1)) + num_valid_anchors = anchors.shape[0] + bbox_targets = torch.zeros_like(anchors) + bbox_weights = torch.zeros_like(anchors) + dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long) + dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float) + labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) + label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) + if len(gt_bboxes) > 0: + if not isinstance(gt_bboxes, torch.Tensor): + gt_bboxes = gt_bboxes.tensor.to(anchors.device) + assign_result = bbox_assigner.assign(anchors, gt_bboxes, + gt_bboxes_ignore, gt_labels) + sampling_result = self.bbox_sampler.sample(assign_result, anchors, + gt_bboxes) + pos_inds = sampling_result.pos_inds + neg_inds = sampling_result.neg_inds + else: + pos_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) > 0, + as_tuple=False).squeeze(-1).unique() + neg_inds = torch.nonzero( + anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) == 0, + as_tuple=False).squeeze(-1).unique() + + if gt_labels is not None: + labels += num_classes + if len(pos_inds) > 0: + pos_bbox_targets = self.bbox_coder.encode( + sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) + pos_dir_targets = get_direction_target( + sampling_result.pos_bboxes, + pos_bbox_targets, + self.dir_offset, + one_hot=False) + bbox_targets[pos_inds, :] = pos_bbox_targets + bbox_weights[pos_inds, :] = 1.0 + dir_targets[pos_inds] = pos_dir_targets + dir_weights[pos_inds] = 1.0 + + if gt_labels is None: + labels[pos_inds] = 1 + else: + labels[pos_inds] = gt_labels[ + sampling_result.pos_assigned_gt_inds] + if self.train_cfg.pos_weight <= 0: + label_weights[pos_inds] = 1.0 + else: + label_weights[pos_inds] = self.train_cfg.pos_weight + + if len(neg_inds) > 0: + label_weights[neg_inds] = 1.0 + return (labels, label_weights, bbox_targets, bbox_weights, dir_targets, + dir_weights, pos_inds, neg_inds) + + +def get_direction_target(anchors, + reg_targets, + dir_offset=0, + num_bins=2, + one_hot=True): + """Encode direction to 0 ~ num_bins-1. + + Args: + anchors (torch.Tensor): Concatenated multi-level anchor. + reg_targets (torch.Tensor): Bbox regression targets. + dir_offset (int): Direction offset. + num_bins (int): Number of bins to divide 2*PI. + one_hot (bool): Whether to encode as one hot. + + Returns: + torch.Tensor: Encoded direction targets. 
+ """ + rot_gt = reg_targets[..., 6] + anchors[..., 6] + offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi) + dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() + dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) + if one_hot: + dir_targets = torch.zeros( + *list(dir_cls_targets.shape), + num_bins, + dtype=anchors.dtype, + device=dir_cls_targets.device) + dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) + dir_cls_targets = dir_targets + return dir_cls_targets \ No newline at end of file diff --git a/mmcv/models/detectors/VAD.py b/mmcv/models/detectors/VAD.py new file mode 100644 index 0000000..34c6f5f --- /dev/null +++ b/mmcv/models/detectors/VAD.py @@ -0,0 +1,684 @@ +import time +import copy + +import torch +from mmcv.models import DETECTORS +from mmcv.core.bbox.transforms import bbox3d2result +from mmcv.utils import force_fp32, auto_fp16 +from scipy.optimize import linear_sum_assignment +from mmcv.models.detectors.mvx_two_stage import MVXTwoStageDetector + +from mmcv.models.utils.grid_mask import GridMask +from mmcv.models.dense_heads.planning_head_plugin.metric_stp3 import PlanningMetric + + +@DETECTORS.register_module() +class VAD(MVXTwoStageDetector): + """VAD model. + """ + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False, + prev_frame_num=0, + fut_ts=6, + fut_mode=6 + ): + + super(VAD, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + self.fut_ts = fut_ts + self.fut_mode = fut_mode + self.valid_fut_ts = pts_bbox_head['valid_fut_ts'] + self.prev_frame_num = prev_frame_num + self.prev_frame_infos = [] + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + self.planning_metric = None + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img'), out_fp32=True) + def 
extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + map_gt_bboxes_3d, + map_gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + prev_bev=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_masks=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head(pts_feats, img_metas, prev_bev, + ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + loss_inputs = [ + gt_bboxes_3d, gt_labels_3d, map_gt_bboxes_3d, map_gt_labels_3d, + outs, ego_fut_trajs, ego_fut_masks, ego_fut_cmd, gt_attr_labels + ] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, inputs,return_loss=True,rescale=False): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + losses = self.forward_train(**inputs) + loss, log_vars = self._parse_losses(losses) + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(inputs['img_metas'])) + return outputs + else: + outputs = self.forward_test(**inputs,rescale=rescale) + return outputs + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. 
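        Example (shape walkthrough only; 2 history frames, 6 cameras, dummy sizes):

            >>> import torch
            >>> imgs_queue = torch.zeros(1, 2, 6, 3, 384, 640)  # (bs, len_queue, num_cams, C, H, W)
            >>> bs, len_queue, num_cams, C, H, W = imgs_queue.shape
            >>> imgs_queue.reshape(bs * len_queue, num_cams, C, H, W).shape
            torch.Size([2, 6, 3, 384, 640])

        All history frames pass through the image backbone as a single batch;
        ``extract_feat(..., len_queue=len_queue)`` restores the queue axis, and the
        loop below slices frame ``i`` and feeds it, together with the running
        ``prev_bev``, into ``pts_bbox_head(..., only_bev=True)``.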
+ """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + # @auto_fp16(apply_to=('img', 'points')) + @force_fp32(apply_to=('img','points','prev_bev')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + map_gt_bboxes_3d=None, + map_gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + map_gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_masks=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] 
+ + prev_img_metas = copy.deepcopy(img_metas) + # prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + # import pdb;pdb.set_trace() + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) if len_queue > 1 else None + + img_metas = [each[len_queue-1] for each in img_metas] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, + map_gt_bboxes_3d, map_gt_labels_3d, img_metas, + gt_bboxes_ignore, map_gt_bboxes_ignore, prev_bev, + ego_his_trajs=ego_his_trajs, ego_fut_trajs=ego_fut_trajs, + ego_fut_masks=ego_fut_masks, ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, gt_attr_labels=gt_attr_labels) + + losses.update(losses_pts) + return losses + + def forward_test( + self, + img_metas, + gt_bboxes_3d=None, + gt_labels_3d=None, + img=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if self.prev_frame_num > 0: + if len(self.prev_frame_infos) < self.prev_frame_num: + self.prev_frame_info = { + "prev_bev": None, + "scene_token": None, + "prev_pos": 0, + "prev_angle": 0, + } + else: + self.prev_frame_info = self.prev_frame_infos.pop(0) + + + + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + + if ego_his_trajs is not None: + ego_his_trajs=ego_his_trajs[0] + if ego_fut_trajs is not None: + ego_fut_trajs=ego_fut_trajs[0] + if ego_fut_cmd is not None: + ego_fut_cmd=ego_fut_cmd[0] + if ego_lcf_feat is not None: + ego_lcf_feat=ego_lcf_feat[0] + + new_prev_bev, bbox_results = self.simple_test( + img_metas=img_metas[0], + img=img[0], + prev_bev=self.prev_frame_info['prev_bev'], + gt_bboxes_3d=gt_bboxes_3d, + gt_labels_3d=gt_labels_3d, + ego_his_trajs=ego_his_trajs, + ego_fut_trajs=ego_fut_trajs, + ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, + gt_attr_labels=gt_attr_labels, + **kwargs + ) + # During inference, we save the BEV features and ego motion of each timestamp. 
+ self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + if self.prev_frame_num > 0: + self.prev_frame_infos.append(self.prev_frame_info) + + return bbox_results + + def simple_test( + self, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + img=None, + prev_bev=None, + points=None, + fut_valid_flag=None, + rescale=False, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + **kwargs + ): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts, metric_dict = self.simple_test_pts( + img_feats, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + prev_bev, + fut_valid_flag=fut_valid_flag, + rescale=rescale, + start=None, + ego_his_trajs=ego_his_trajs, + ego_fut_trajs=ego_fut_trajs, + ego_fut_cmd=ego_fut_cmd, + ego_lcf_feat=ego_lcf_feat, + gt_attr_labels=gt_attr_labels, + ) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + result_dict['metric_results'] = metric_dict + + return new_prev_bev, bbox_list + + def simple_test_pts( + self, + x, + img_metas, + gt_bboxes_3d, + gt_labels_3d, + prev_bev=None, + fut_valid_flag=None, + rescale=False, + start=None, + ego_his_trajs=None, + ego_fut_trajs=None, + ego_fut_cmd=None, + ego_lcf_feat=None, + gt_attr_labels=None, + ): + """Test function""" + mapped_class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', + 'trailer', 'barrier', 'motorcycle', 'bicycle', + 'pedestrian', 'traffic_cone' + ] + + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev, + ego_his_trajs=ego_his_trajs, ego_lcf_feat=ego_lcf_feat) + bbox_list = self.pts_bbox_head.get_bboxes(outs, img_metas, rescale=rescale) + + bbox_results = [] + for i, (bboxes, scores, labels, trajs, map_bboxes, \ + map_scores, map_labels, map_pts) in enumerate(bbox_list): + bbox_result = bbox3d2result(bboxes, scores, labels) + bbox_result['trajs_3d'] = trajs.cpu() + map_bbox_result = self.map_pred2result(map_bboxes, map_scores, map_labels, map_pts) + bbox_result.update(map_bbox_result) + bbox_result['ego_fut_preds'] = outs['ego_fut_preds'][i].cpu() + bbox_result['ego_fut_cmd'] = ego_fut_cmd.cpu() + bbox_results.append(bbox_result) + + metric_dict = None + + if gt_attr_labels is not None: + + + assert len(bbox_results) == 1, 'only support batch_size=1 now' + score_threshold = 0.6 + with torch.no_grad(): + c_bbox_results = copy.deepcopy(bbox_results) + + bbox_result = c_bbox_results[0] + gt_bbox = gt_bboxes_3d[0][0] + gt_label = gt_labels_3d[0][0].to('cpu') + gt_attr_label = gt_attr_labels[0][0].to('cpu') + fut_valid_flag = bool(fut_valid_flag[0][0]) + # filter pred bbox by score_threshold + mask = bbox_result['scores_3d'] > score_threshold + bbox_result['boxes_3d'] = bbox_result['boxes_3d'][mask] + bbox_result['scores_3d'] = bbox_result['scores_3d'][mask] + bbox_result['labels_3d'] = bbox_result['labels_3d'][mask] + bbox_result['trajs_3d'] = bbox_result['trajs_3d'][mask] + + matched_bbox_result = self.assign_pred_to_gt_vip3d( + bbox_result, gt_bbox, gt_label) + + metric_dict = self.compute_motion_metric_vip3d( + gt_bbox, gt_label, gt_attr_label, bbox_result, + matched_bbox_result, mapped_class_names) + + # ego planning metric + assert ego_fut_trajs.shape[0] == 1, 'only support batch_size=1 for testing' + ego_fut_preds = bbox_result['ego_fut_preds'] + ego_fut_trajs = ego_fut_trajs[0, 0] + 
ego_fut_cmd = ego_fut_cmd[0, 0, 0] + ego_fut_cmd_idx = torch.nonzero(ego_fut_cmd)[0, 0] + ego_fut_pred = ego_fut_preds[ego_fut_cmd_idx] + ego_fut_pred = ego_fut_pred.cumsum(dim=-2) + ego_fut_trajs = ego_fut_trajs.cumsum(dim=-2) + + metric_dict_planner_stp3 = self.compute_planner_metric_stp3( + pred_ego_fut_trajs = ego_fut_pred[None], + gt_ego_fut_trajs = ego_fut_trajs[None], + gt_agent_boxes = gt_bbox, + gt_agent_feats = gt_attr_label.unsqueeze(0), + fut_valid_flag = fut_valid_flag + ) + metric_dict.update(metric_dict_planner_stp3) + + return outs['bev_embed'], bbox_results, metric_dict + + def map_pred2result(self, bboxes, scores, labels, pts, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). + labels (torch.Tensor): Labels with shape of (n, ). + scores (torch.Tensor): Scores with shape of (n, ). + attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict( + map_boxes_3d=bboxes.to('cpu'), + map_scores_3d=scores.cpu(), + map_labels_3d=labels.cpu(), + map_pts_3d=pts.to('cpu')) + + if attrs is not None: + result_dict['map_attrs_3d'] = attrs.cpu() + + return result_dict + + def assign_pred_to_gt_vip3d( + self, + bbox_result, + gt_bbox, + gt_label, + match_dis_thresh=2.0 + ): + """Assign pred boxs to gt boxs according to object center preds in lcf. + Args: + bbox_result (dict): Predictions. + 'boxes_3d': (LiDARInstance3DBoxes) + 'scores_3d': (Tensor), [num_pred_bbox] + 'labels_3d': (Tensor), [num_pred_bbox] + 'trajs_3d': (Tensor), [fut_ts*2] + gt_bboxs (LiDARInstance3DBoxes): GT Bboxs. + gt_label (Tensor): GT labels for gt_bbox, [num_gt_bbox]. + match_dis_thresh (float): dis thresh for determine a positive sample for a gt bbox. + + Returns: + matched_bbox_result (np.array): assigned pred index for each gt box [num_gt_bbox]. + """ + dynamic_list = [0,1,3,4,6,7,8] + matched_bbox_result = torch.ones( + (len(gt_bbox)), dtype=torch.long) * -1 # -1: not assigned + gt_centers = gt_bbox.center[:, :2] + pred_centers = bbox_result['boxes_3d'].center[:, :2] + dist = torch.linalg.norm(pred_centers[:, None, :] - gt_centers[None, :, :], dim=-1) + pred_not_dyn = [label not in dynamic_list for label in bbox_result['labels_3d']] + gt_not_dyn = [label not in dynamic_list for label in gt_label] + dist[pred_not_dyn] = 1e6 + dist[:, gt_not_dyn] = 1e6 + dist[dist > match_dis_thresh] = 1e6 + + r_list, c_list = linear_sum_assignment(dist) + + for i in range(len(r_list)): + if dist[r_list[i], c_list[i]] <= match_dis_thresh: + matched_bbox_result[c_list[i]] = r_list[i] + + return matched_bbox_result + + def compute_motion_metric_vip3d( + self, + gt_bbox, + gt_label, + gt_attr_label, + pred_bbox, + matched_bbox_result, + mapped_class_names, + match_dis_thresh=2.0, + ): + """Compute EPA metric for one sample. + Args: + gt_bboxs (LiDARInstance3DBoxes): GT Bboxs. + gt_label (Tensor): GT labels for gt_bbox, [num_gt_bbox]. + pred_bbox (dict): Predictions. + 'boxes_3d': (LiDARInstance3DBoxes) + 'scores_3d': (Tensor), [num_pred_bbox] + 'labels_3d': (Tensor), [num_pred_bbox] + 'trajs_3d': (Tensor), [fut_ts*2] + matched_bbox_result (np.array): assigned pred index for each gt box [num_gt_bbox]. 
+ match_dis_thresh (float): dis thresh for determine a positive sample for a gt bbox. + + Returns: + EPA_dict (dict): EPA metric dict of each cared class. + """ + motion_cls_names = ['car', 'pedestrian'] + motion_metric_names = ['gt', 'cnt_ade', 'cnt_fde', 'hit', + 'fp', 'ADE', 'FDE', 'MR'] + + metric_dict = {} + for met in motion_metric_names: + for cls in motion_cls_names: + metric_dict[met+'_'+cls] = 0.0 + + veh_list = [0,1,3,4] + ignore_list = ['construction_vehicle', 'barrier', + 'traffic_cone', 'motorcycle', 'bicycle'] + + for i in range(pred_bbox['labels_3d'].shape[0]): + pred_bbox['labels_3d'][i] = 0 if pred_bbox['labels_3d'][i] in veh_list else pred_bbox['labels_3d'][i] + box_name = mapped_class_names[pred_bbox['labels_3d'][i]] + if box_name in ignore_list: + continue + if i not in matched_bbox_result: + metric_dict['fp_'+box_name] += 1 + + for i in range(gt_label.shape[0]): + gt_label[i] = 0 if gt_label[i] in veh_list else gt_label[i] + box_name = mapped_class_names[gt_label[i]] + if box_name in ignore_list: + continue + gt_fut_masks = gt_attr_label[i][self.fut_ts*2:self.fut_ts*3] + num_valid_ts = sum(gt_fut_masks==1) + if num_valid_ts == self.fut_ts: + metric_dict['gt_'+box_name] += 1 + if matched_bbox_result[i] >= 0 and num_valid_ts > 0: + metric_dict['cnt_ade_'+box_name] += 1 + m_pred_idx = matched_bbox_result[i] + gt_fut_trajs = gt_attr_label[i][:self.fut_ts*2].reshape(-1, 2) + gt_fut_trajs = gt_fut_trajs[:num_valid_ts] + pred_fut_trajs = pred_bbox['trajs_3d'][m_pred_idx].reshape(self.fut_mode, self.fut_ts, 2) + pred_fut_trajs = pred_fut_trajs[:, :num_valid_ts, :] + gt_fut_trajs = gt_fut_trajs.cumsum(dim=-2) + pred_fut_trajs = pred_fut_trajs.cumsum(dim=-2) + gt_fut_trajs = gt_fut_trajs + gt_bbox[i].center[0, :2] + pred_fut_trajs = pred_fut_trajs + pred_bbox['boxes_3d'][int(m_pred_idx)].center[0, :2] + + dist = torch.linalg.norm(gt_fut_trajs[None, :, :] - pred_fut_trajs, dim=-1) + ade = dist.sum(-1) / num_valid_ts + ade = ade.min() + + metric_dict['ADE_'+box_name] += ade + if num_valid_ts == self.fut_ts: + fde = dist[:, -1].min() + metric_dict['cnt_fde_'+box_name] += 1 + metric_dict['FDE_'+box_name] += fde + if fde <= match_dis_thresh: + metric_dict['hit_'+box_name] += 1 + else: + metric_dict['MR_'+box_name] += 1 + + return metric_dict + + ### same planning metric as stp3 + def compute_planner_metric_stp3( + self, + pred_ego_fut_trajs, + gt_ego_fut_trajs, + gt_agent_boxes, + gt_agent_feats, + fut_valid_flag + ): + """Compute planner metric for one sample same as stp3.""" + metric_dict = { + 'plan_L2_1s':0, + 'plan_L2_2s':0, + 'plan_L2_3s':0, + 'plan_obj_col_1s':0, + 'plan_obj_col_2s':0, + 'plan_obj_col_3s':0, + 'plan_obj_box_col_1s':0, + 'plan_obj_box_col_2s':0, + 'plan_obj_box_col_3s':0, + } + metric_dict['fut_valid_flag'] = fut_valid_flag + future_second = 3 + assert pred_ego_fut_trajs.shape[0] == 1, 'only support bs=1' + if self.planning_metric is None: + self.planning_metric = PlanningMetric() + segmentation, pedestrian = self.planning_metric.get_label( + gt_agent_boxes, gt_agent_feats) + occupancy = torch.logical_or(segmentation, pedestrian) + + for i in range(future_second): + if fut_valid_flag: + cur_time = (i+1)*2 + traj_L2 = self.planning_metric.compute_L2( + pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), + gt_ego_fut_trajs[0, :cur_time] + ) + obj_coll, obj_box_coll = self.planning_metric.evaluate_coll( + pred_ego_fut_trajs[:, :cur_time].detach(), + gt_ego_fut_trajs[:, :cur_time], + occupancy) + metric_dict['plan_L2_{}s'.format(i+1)] = traj_L2 
+ metric_dict['plan_obj_col_{}s'.format(i+1)] = obj_coll.mean().item() + metric_dict['plan_obj_box_col_{}s'.format(i+1)] = obj_box_coll.mean().item() + else: + metric_dict['plan_L2_{}s'.format(i+1)] = 0.0 + metric_dict['plan_obj_col_{}s'.format(i+1)] = 0.0 + metric_dict['plan_obj_box_col_{}s'.format(i+1)] = 0.0 + + return metric_dict + + def set_epoch(self, epoch): + self.pts_bbox_head.epoch = epoch \ No newline at end of file diff --git a/mmcv/models/detectors/__init__.py b/mmcv/models/detectors/__init__.py new file mode 100644 index 0000000..d3fb4b0 --- /dev/null +++ b/mmcv/models/detectors/__init__.py @@ -0,0 +1,5 @@ +from .base import BaseDetector, Base3DDetector +from .single_stage_mono3d import SingleStageMono3DDetector +from .uniad_e2e import UniAD +from .bevformer import BEVFormer +from .VAD import VAD \ No newline at end of file diff --git a/mmcv/models/detectors/base.py b/mmcv/models/detectors/base.py new file mode 100644 index 0000000..9856b10 --- /dev/null +++ b/mmcv/models/detectors/base.py @@ -0,0 +1,407 @@ +from abc import ABCMeta, abstractmethod +from collections import OrderedDict + +from os import path as osp +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import DataContainer as DC +from mmcv.models.backbones import BaseModule +from mmcv.utils import auto_fp16 + +from mmcv.core.bbox.structures.box_3d_mode import Box3DMode +from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode +from mmcv.core.visualizer import show_result +from mmcv.core.visualization import imshow_det_bboxes +from mmcv.utils import concat_list, is_list_of +from mmcv.image import imread + + +class BaseDetector(BaseModule, metaclass=ABCMeta): + """Base class for detectors.""" + + def __init__(self, init_cfg=None): + super(BaseDetector, self).__init__(init_cfg) + self.fp16_enabled = False + + @property + def with_neck(self): + """bool: whether the detector has a neck""" + return hasattr(self, 'neck') and self.neck is not None + + # TODO: these properties need to be carefully handled + # for both single stage & two stage detectors + @property + def with_shared_head(self): + """bool: whether the detector has a shared head in the RoI Head""" + return hasattr(self, 'roi_head') and self.roi_head.with_shared_head + + @property + def with_bbox(self): + """bool: whether the detector has a bbox head""" + return ((hasattr(self, 'roi_head') and self.roi_head.with_bbox) + or (hasattr(self, 'bbox_head') and self.bbox_head is not None)) + + @property + def with_mask(self): + """bool: whether the detector has a mask head""" + return ((hasattr(self, 'roi_head') and self.roi_head.with_mask) + or (hasattr(self, 'mask_head') and self.mask_head is not None)) + + @abstractmethod + def extract_feat(self, imgs): + """Extract features from images.""" + pass + + def extract_feats(self, imgs): + """Extract features from multiple images. + + Args: + imgs (list[torch.Tensor]): A list of images. The images are + augmented from the same image but in different ways. + + Returns: + list[torch.Tensor]: Features of different images + """ + assert isinstance(imgs, list) + return [self.extract_feat(img) for img in imgs] + + def forward_train(self, imgs, img_metas, **kwargs): + """ + Args: + img (list[Tensor]): List of tensors of shape (1, C, H, W). + Typically these should be mean centered and std scaled. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
+ For details on the values of these keys, see + :class:`mmcv.datasets.pipelines.Collect`. + kwargs (keyword arguments): Specific to concrete implementation. + """ + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + batch_input_shape = tuple(imgs[0].size()[-2:]) + for img_meta in img_metas: + img_meta['batch_input_shape'] = batch_input_shape + + async def async_simple_test(self, img, img_metas, **kwargs): + raise NotImplementedError + + @abstractmethod + def simple_test(self, img, img_metas, **kwargs): + pass + + @abstractmethod + def aug_test(self, imgs, img_metas, **kwargs): + """Test function with test time augmentation.""" + pass + + async def aforward_test(self, *, img, img_metas, **kwargs): + for var, name in [(img, 'img'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(img) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(img)}) ' + f'!= num of image metas ({len(img_metas)})') + # TODO: remove the restriction of samples_per_gpu == 1 when prepared + samples_per_gpu = img[0].size(0) + assert samples_per_gpu == 1 + + if num_augs == 1: + return await self.async_simple_test(img[0], img_metas[0], **kwargs) + else: + raise NotImplementedError + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(imgs)}) ' + f'!= num of image meta ({len(img_metas)})') + + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + for img, img_meta in zip(imgs, img_metas): + batch_size = len(img_meta) + for img_id in range(batch_size): + img_meta[img_id]['batch_input_shape'] = tuple(img.size()[-2:]) + + if num_augs == 1: + # proposals (List[List[Tensor]]): the outer list indicates + # test-time augs (multiscale, flip, etc.) and the inner list + # indicates images in a batch. + # The Tensor should have a shape Px4, where P is the number of + # proposals. + if 'proposals' in kwargs: + kwargs['proposals'] = kwargs['proposals'][0] + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + assert imgs[0].size(0) == 1, 'aug test does not support ' \ + 'inference with batch size ' \ + f'{imgs[0].size(0)}' + # TODO: support test augmentation for predefined proposals + assert 'proposals' not in kwargs + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=('img', )) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. 
Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if torch.onnx.is_in_onnx_export(): + assert len(img_metas) == 1 + return self.onnx_export(img[0], img_metas[0]) + + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def _parse_losses(self, losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary infomation. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \ + which may be a weighted sum of all losses, log_vars contains \ + all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + + def show_result(self, + img, + result, + score_thr=0.3, + bbox_color=(72, 101, 241), + text_color=(72, 101, 241), + mask_color=None, + thickness=2, + font_size=13, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (Tensor or tuple): The results to draw over `img` + bbox_result or (bbox_result, segm_result). + score_thr (float, optional): Minimum score of bboxes to be shown. + Default: 0.3. + bbox_color (str or tuple(int) or :obj:`Color`):Color of bbox lines. + The tuple of color should be in BGR order. Default: 'green' + text_color (str or tuple(int) or :obj:`Color`):Color of texts. + The tuple of color should be in BGR order. Default: 'green' + mask_color (None or str or tuple(int) or :obj:`Color`): + Color of masks. The tuple of color should be in BGR order. + Default: None + thickness (int): Thickness of lines. Default: 2 + font_size (int): Font size of texts. Default: 13 + win_name (str): The window name. Default: '' + wait_time (float): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. 
+ + Returns: + img (Tensor): Only if not `show` or `out_file` + """ + img = imread(img) + img = img.copy() + if isinstance(result, tuple): + bbox_result, segm_result = result + if isinstance(segm_result, tuple): + segm_result = segm_result[0] # ms rcnn + else: + bbox_result, segm_result = result, None + bboxes = np.vstack(bbox_result) + labels = [ + np.full(bbox.shape[0], i, dtype=np.int32) + for i, bbox in enumerate(bbox_result) + ] + labels = np.concatenate(labels) + # draw segmentation masks + segms = None + if segm_result is not None and len(labels) > 0: # non empty + segms = concat_list(segm_result) + if isinstance(segms[0], torch.Tensor): + segms = torch.stack(segms, dim=0).detach().cpu().numpy() + else: + segms = np.stack(segms, axis=0) + # if out_file specified, do not show image in window + if out_file is not None: + show = False + # draw bounding boxes + img = imshow_det_bboxes( + img, + bboxes, + labels, + segms, + class_names=self.CLASSES, + score_thr=score_thr, + bbox_color=bbox_color, + text_color=text_color, + mask_color=mask_color, + thickness=thickness, + font_size=font_size, + win_name=win_name, + show=show, + wait_time=wait_time, + out_file=out_file) + + if not (show or out_file): + return img + + def onnx_export(self, img, img_metas): + raise NotImplementedError(f'{self.__class__.__name__} does ' + f'not support ONNX EXPORT') + + +class Base3DDetector(BaseDetector): + """Base class for detectors.""" + + def forward_test(self, points, img_metas, img=None, **kwargs): + """ + Args: + points (list[torch.Tensor]): the outer list indicates test-time + augmentations and inner torch.Tensor should have a shape NxC, + which contains all points in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch + img (list[torch.Tensor], optional): the outer + list indicates test-time augmentations and inner + torch.Tensor should have a shape NxCxHxW, which contains + all images in the batch. Defaults to None. + """ + for var, name in [(points, 'points'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError( + 'num of augmentations ({}) != num of image meta ({})'.format( + len(points), len(img_metas))) + + if num_augs == 1: + img = [img] if img is None else img + return self.simple_test(points[0], img_metas[0], img[0], **kwargs) + else: + return self.aug_test(points, img_metas, img, **kwargs) + + @auto_fp16(apply_to=('img', 'points')) + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def show_results(self, data, result, out_dir): + """Results visualization. + + Args: + data (list[dict]): Input points and the information of the sample. + result (list[dict]): Prediction results. + out_dir (str): Output directory of visualization result. 
+ """ + for batch_id in range(len(result)): + if isinstance(data['points'][0], DC): + points = data['points'][0]._data[0][batch_id].numpy() + elif is_list_of(data['points'][0], torch.Tensor): + points = data['points'][0][batch_id] + else: + ValueError(f"Unsupported data type {type(data['points'][0])} " + f'for visualization!') + if isinstance(data['img_metas'][0], DC): + pts_filename = data['img_metas'][0]._data[0][batch_id][ + 'pts_filename'] + box_mode_3d = data['img_metas'][0]._data[0][batch_id][ + 'box_mode_3d'] + elif is_list_of(data['img_metas'][0], dict): + pts_filename = data['img_metas'][0][batch_id]['pts_filename'] + box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + file_name = osp.split(pts_filename)[-1].split('.')[0] + + assert out_dir is not None, 'Expect out_dir, got none.' + + pred_bboxes = result[batch_id]['boxes_3d'] + + # for now we convert points and bbox into depth mode + if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d + == Box3DMode.LIDAR): + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, + Box3DMode.DEPTH) + elif box_mode_3d != Box3DMode.DEPTH: + ValueError( + f'Unsupported box_mode_3d {box_mode_3d} for convertion!') + pred_bboxes = pred_bboxes.tensor.cpu().numpy() + show_result(points, None, pred_bboxes, out_dir, file_name) + diff --git a/mmcv/models/detectors/bevformer.py b/mmcv/models/detectors/bevformer.py new file mode 100644 index 0000000..a51778c --- /dev/null +++ b/mmcv/models/detectors/bevformer.py @@ -0,0 +1,295 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import torch +from mmcv.utils import force_fp32, auto_fp16 +from mmcv.models import DETECTORS +from mmcv.core import bbox3d2result +from mmcv.models.detectors.mvx_two_stage import MVXTwoStageDetector +from mmcv.models.utils.grid_mask import GridMask +import time +import copy +import numpy as np +from mmcv.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer(MVXTwoStageDetector): + """BEVFormer. + Args: + video_test_mode (bool): Decide whether to use temporal information during inference. 
+ """ + + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False + ): + + super(BEVFormer, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + + # temporal + self.video_test_mode = video_test_mode + self.prev_frame_info = { + 'prev_bev': None, + 'scene_token': None, + 'prev_pos': 0, + 'prev_angle': 0, + } + + + def extract_img_feat(self, img, img_metas, len_queue=None): + """Extract features of images.""" + B = img.size(0) + if img is not None: + + # input_shape = img.shape[-2:] + # # update real input shape of each single img + # for img_meta in img_metas: + # img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + if len_queue is not None: + img_feats_reshaped.append(img_feat.view(int(B/len_queue), len_queue, int(BN / B), C, H, W)) + else: + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=('img')) + def extract_feat(self, img, img_metas=None, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img, img_metas, len_queue=len_queue) + + return img_feats + + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + prev_bev=None): + """Forward function' + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + prev_bev (torch.Tensor, optional): BEV features of previous frame. + Returns: + dict: Losses of each branch. + """ + + outs = self.pts_bbox_head( + pts_feats, img_metas, prev_bev) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, inputs, return_loss=True, rescale=False): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. 
+ torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + losses = self.forward_train(**inputs) + loss, log_vars = self._parse_losses(losses) + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(inputs['img_metas'])) + return outputs + else: + outputs = self.forward_test(**inputs, rescale=rescale) + return outputs + + def obtain_history_bev(self, imgs_queue, img_metas_list): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. + """ + self.eval() + + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs*len_queue, num_cams, C, H, W) + img_feats_list = self.extract_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + if not img_metas[0]['prev_bev_exists']: + prev_bev = None + # img_feats = self.extract_feat(img=img, img_metas=img_metas) + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev = self.pts_bbox_head( + img_feats, img_metas, prev_bev, only_bev=True) + self.train() + return prev_bev + + @auto_fp16(apply_to=('img', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + len_queue = img.size(1) + prev_img = img[:, :-1, ...] + img = img[:, -1, ...] 
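        # `img` is (bs, len_queue, num_cams, C, H, W): the history frames build
        # prev_bev without gradients, and prev_bev is dropped again whenever
        # img_metas reports prev_bev_exists=False (typically the first sample of a
        # sequence).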
+ + prev_img_metas = copy.deepcopy(img_metas) + prev_bev = self.obtain_history_bev(prev_img, prev_img_metas) + + img_metas = [each[len_queue-1] for each in img_metas] + if not img_metas[0]['prev_bev_exists']: + prev_bev = None + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev) + + losses.update(losses_pts) + return losses + + def forward_test(self, img_metas, img=None, rescale=None): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + if self.prev_frame_info['prev_bev'] is not None: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + else: + img_metas[0][0]['can_bus'][-1] = 0 + img_metas[0][0]['can_bus'][:3] = 0 + + new_prev_bev, bbox_results = self.simple_test( + img_metas[0], img[0], prev_bev=self.prev_frame_info['prev_bev'], rescale=rescale) + # During inference, we save the BEV features and ego motion of each timestamp. + self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + self.prev_frame_info['prev_bev'] = new_prev_bev + return bbox_results + + def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False): + """Test function""" + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev) + + bbox_list = self.pts_bbox_head.get_bboxes( + outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return outs['bev_embed'], bbox_results + + def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts = self.simple_test_pts( + img_feats, img_metas, prev_bev, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + return new_prev_bev, bbox_list diff --git a/mmcv/models/detectors/bevformerV2.py b/mmcv/models/detectors/bevformerV2.py new file mode 100644 index 0000000..79efa12 --- /dev/null +++ b/mmcv/models/detectors/bevformerV2.py @@ -0,0 +1,269 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +from collections import OrderedDict +import torch +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from mmdet3d.models.builder import build_head +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask + + +@DETECTORS.register_module() +class BEVFormerV2(MVXTwoStageDetector): + """BEVFormer. + Args: + video_test_mode (bool): Decide whether to use temporal information during inference. + """ + + def __init__(self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + fcos3d_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False, + num_levels=None, + num_mono_levels=None, + mono_loss_weight=1.0, + frames=(0,), + ): + + super(BEVFormerV2, + self).__init__(pts_voxel_layer, pts_voxel_encoder, + pts_middle_encoder, pts_fusion_layer, + img_backbone, pts_backbone, img_neck, pts_neck, + pts_bbox_head, img_roi_head, img_rpn_head, + train_cfg, test_cfg, pretrained) + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + assert not self.fp16_enabled # not support fp16 yet + # temporal + self.video_test_mode = video_test_mode + assert not self.video_test_mode # not support video_test_mode yet + + # fcos3d head + self.fcos3d_bbox_head = build_head(fcos3d_bbox_head) if fcos3d_bbox_head else None + # loss weight + self.mono_loss_weight = mono_loss_weight + + # levels of features + self.num_levels = num_levels + self.num_mono_levels = num_mono_levels + self.frames = frames + def extract_img_feat(self, img): + """Extract features of images.""" + B = img.size(0) + if img is not None: + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + def extract_feat(self, img, img_metas, len_queue=None): + """Extract features from images and points.""" + + img_feats = self.extract_img_feat(img) + if 'aug_param' in img_metas[0] and img_metas[0]['aug_param']['CropResizeFlipImage_param'][-1] is True: + # flip feature + img_feats = [torch.flip(x, dims=[-1, ]) for x in img_feats] + return img_feats + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None, + prev_bev=None): + outs = self.pts_bbox_head( + pts_feats, img_metas, prev_bev) + loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] + losses = self.pts_bbox_head.loss(*loss_inputs, img_metas=img_metas) + return losses + + def forward_mono_train(self, img_feats, mono_input_dict): + """ + img_feats (list[Tensor]): 5-D tensor for each level, (B, N, C, H, W) + gt_bboxes 
(list[list[Tensor]]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[list[Tensor]]): class indices corresponding to each box + gt_bboxes_3d (list[list[[Tensor]]): 3D boxes ground truth with shape of + (num_gts, code_size). + gt_labels_3d (list[list[Tensor]]): same as gt_labels + centers2d (list[list[Tensor]]): 2D centers on the image with shape of + (num_gts, 2). + depths (list[list[Tensor]]): Depth ground truth with shape of + (num_gts, ). + attr_labels (list[list[Tensor]]): Attributes indices of each box. + img_metas (list[list[dict]]): Meta information of each image, e.g., + image size, scaling factor, etc. + ann_idx (list[list[idx]]): indicate which image has mono annotation. + """ + bsz = img_feats[0].shape[0]; + num_lvls = len(img_feats) + + img_feats_select = [[] for lvl in range(num_lvls)] + for lvl, img_feat in enumerate(img_feats): + for i in range(bsz): + img_feats_select[lvl].append(img_feat[i, mono_input_dict['mono_ann_idx'][i]]) + img_feats_select[lvl] = torch.cat(img_feats_select[lvl], dim=0) + bsz_new = img_feats_select[0].shape[0] + assert bsz == len(mono_input_dict['mono_input_dict']) + input_dict = [] + for i in range(bsz): + input_dict.extend(mono_input_dict['mono_input_dict'][i]) + assert bsz_new == len(input_dict) + losses = self.fcos3d_bbox_head.forward_train(img_feats_select, input_dict) + return losses + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, return_loss=True, **kwargs): + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def obtain_history_bev(self, img_dict, img_metas_dict): + """Obtain history BEV features iteratively. To save GPU memory, gradients are not calculated. + """ + # Modify: roll back to previous version for single frame + is_training = self.training + self.eval() + prev_bev = OrderedDict({i: None for i in self.frames}) + with torch.no_grad(): + for t in img_dict.keys(): + img = img_dict[t] + img_metas = [img_metas_dict[t], ] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + if self.num_levels: + img_feats = img_feats[:self.num_levels] + bev = self.pts_bbox_head( + img_feats, img_metas, None, only_bev=True) + prev_bev[t] = bev.detach() + if is_training: + self.train() + return list(prev_bev.values()) + + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + img=None, + gt_bboxes_ignore=None, + **mono_input_dict, + ): + img_metas = OrderedDict(sorted(img_metas[0].items())) + img_dict = {} + for ind, t in enumerate(img_metas.keys()): + img_dict[t] = img[:, ind, ...] 
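+        # `img_metas[0]` maps temporal offsets to per-frame metas; after the
+        # sort above, key 0 is the current frame, and the remaining frames
+        # (if any) are consumed by obtain_history_bev() without gradients.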
+ + img = img_dict[0] + img_dict.pop(0) + + prev_img_metas = copy.deepcopy(img_metas) + prev_img_metas.pop(0) + prev_bev = self.obtain_history_bev(img_dict, prev_img_metas) + + img_metas = [img_metas[0], ] + + img_feats = self.extract_feat(img=img, img_metas=img_metas) + losses = dict() + losses_pts = self.forward_pts_train(img_feats if self.num_levels is None + else img_feats[:self.num_levels], gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev) + losses.update(losses_pts) + + if self.fcos3d_bbox_head: + losses_mono = self.forward_mono_train(img_feats=img_feats if self.num_mono_levels is None + else img_feats[:self.num_mono_levels], + mono_input_dict=mono_input_dict) + for k, v in losses_mono.items(): + losses[f'{k}_mono'] = v * self.mono_loss_weight + + return losses + + def forward_test(self, img_metas, img=None, **kwargs): + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + new_prev_bev, bbox_results = self.simple_test(img_metas[0], img[0], prev_bev=None, **kwargs) + return bbox_results + + def simple_test_pts(self, x, img_metas, prev_bev=None, rescale=False): + """Test function""" + outs = self.pts_bbox_head(x, img_metas, prev_bev=prev_bev) + + bbox_list = self.pts_bbox_head.get_bboxes( + outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return outs['bev_embed'], bbox_results + + def simple_test(self, img_metas, img=None, prev_bev=None, rescale=False, **kwargs): + """Test function without augmentaiton.""" + img_metas = OrderedDict(sorted(img_metas[0].items())) + img_dict = {} + for ind, t in enumerate(img_metas.keys()): + img_dict[t] = img[:, ind, ...] + img = img_dict[0] + img_dict.pop(0) + + prev_img_metas = copy.deepcopy(img_metas) + prev_bev = self.obtain_history_bev(img_dict, prev_img_metas) + + img_metas = [img_metas[0], ] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + if self.num_levels: + img_feats = img_feats[:self.num_levels] + + bbox_list = [dict() for i in range(len(img_metas))] + new_prev_bev, bbox_pts = self.simple_test_pts( + img_feats, img_metas, prev_bev, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + return new_prev_bev, bbox_list diff --git a/mmcv/models/detectors/bevformer_fp16.py b/mmcv/models/detectors/bevformer_fp16.py new file mode 100644 index 0000000..5325e3c --- /dev/null +++ b/mmcv/models/detectors/bevformer_fp16.py @@ -0,0 +1,89 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from tkinter.messagebox import NO +import torch +from mmcv.runner import force_fp32, auto_fp16 +from mmdet.models import DETECTORS +from mmdet3d.core import bbox3d2result +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from projects.mmdet3d_plugin.models.utils.grid_mask import GridMask +from projects.mmdet3d_plugin.bevformer.detectors.bevformer import BEVFormer +import time +import copy +import numpy as np +import mmdet3d +from projects.mmdet3d_plugin.models.utils.bricks import run_time + + +@DETECTORS.register_module() +class BEVFormer_fp16(BEVFormer): + """ + The default version BEVFormer currently can not support FP16. 
+ We provide this version to resolve this issue. + """ + + @auto_fp16(apply_to=('img', 'prev_bev', 'points')) + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + img_depth=None, + img_mask=None, + prev_bev=None, + ): + """Forward training function. + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + Returns: + dict: Losses of different branches. + """ + + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + losses = dict() + losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore, prev_bev=prev_bev) + losses.update(losses_pts) + return losses + + + def val_step(self, data, optimizer): + """ + In BEVFormer_fp16, we use this `val_step` function to inference the `prev_pev`. + This is not the standard function of `val_step`. + """ + + img = data['img'] + img_metas = data['img_metas'] + img_feats = self.extract_feat(img=img, img_metas=img_metas) + prev_bev = data.get('prev_bev', None) + prev_bev = self.pts_bbox_head(img_feats, img_metas, prev_bev=prev_bev, only_bev=True) + return prev_bev \ No newline at end of file diff --git a/mmcv/models/detectors/mvx_two_stage.py b/mmcv/models/detectors/mvx_two_stage.py new file mode 100644 index 0000000..dc99ac3 --- /dev/null +++ b/mmcv/models/detectors/mvx_two_stage.py @@ -0,0 +1,506 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import warnings +from mmcv.parallel import DataContainer as DC +from mmcv.utils import force_fp32 +from os import path as osp +from torch.nn import functional as F + +from mmcv.core.bbox.structures.box_3d_mode import Box3DMode +from mmcv.core.bbox.structures.coord_3d_mode import Coord3DMode +from mmcv.core.bbox.transforms import bbox3d2result +from mmcv.core.post_processing.merge_augs import merge_aug_bboxes_3d +from mmcv.core.visualizer import show_result +from mmcv.ops.voxelize import Voxelization +from mmcv.core.utils import multi_apply +from mmcv.models import DETECTORS +from mmcv.utils import is_list_of +from .. 
import builder +from .base import Base3DDetector + + +@DETECTORS.register_module() +class MVXTwoStageDetector(Base3DDetector): + """Base class of Multi-modality VoxelNet.""" + + def __init__(self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(MVXTwoStageDetector, self).__init__(init_cfg=init_cfg) + + if pts_voxel_layer: + self.pts_voxel_layer = Voxelization(**pts_voxel_layer) + if pts_voxel_encoder: + self.pts_voxel_encoder = builder.build_voxel_encoder( + pts_voxel_encoder) + if pts_middle_encoder: + self.pts_middle_encoder = builder.build_middle_encoder( + pts_middle_encoder) + if pts_backbone: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_fusion_layer: + self.pts_fusion_layer = builder.build_fusion_layer( + pts_fusion_layer) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_head: + pts_train_cfg = train_cfg.pts if train_cfg else None + pts_bbox_head.update(train_cfg=pts_train_cfg) + pts_test_cfg = test_cfg.pts if test_cfg else None + pts_bbox_head.update(test_cfg=pts_test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_rpn_head is not None: + self.img_rpn_head = builder.build_head(img_rpn_head) + if img_roi_head is not None: + self.img_roi_head = builder.build_head(img_roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is None: + img_pretrained = None + pts_pretrained = None + elif isinstance(pretrained, dict): + img_pretrained = pretrained.get('img', None) + pts_pretrained = pretrained.get('pts', None) + else: + raise ValueError( + f'pretrained should be a dict, got {type(pretrained)}') + + if self.with_img_backbone: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg') + self.img_backbone.init_cfg = dict( + type='Pretrained', checkpoint=img_pretrained) + if self.with_img_roi_head: + if img_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg') + self.img_roi_head.init_cfg = dict( + type='Pretrained', checkpoint=img_pretrained) + + if self.with_pts_backbone: + if pts_pretrained is not None: + warnings.warn('DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg') + self.pts_backbone.init_cfg = dict( + type='Pretrained', checkpoint=pts_pretrained) + + @property + def with_img_shared_head(self): + """bool: Whether the detector has a shared head in image branch.""" + return hasattr(self, + 'img_shared_head') and self.img_shared_head is not None + + @property + def with_pts_bbox(self): + """bool: Whether the detector has a 3D box head.""" + return hasattr(self, + 'pts_bbox_head') and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + """bool: Whether the detector has a 2D image box head.""" + return hasattr(self, + 'img_bbox_head') and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + """bool: Whether the detector has a 2D image backbone.""" + return hasattr(self, 'img_backbone') and self.img_backbone is not None 
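+
+    # The hasattr-based `with_*` properties below let a config simply omit a
+    # branch (e.g. a camera-only detector builds no pts_* modules); forward
+    # paths consult these flags before touching the corresponding sub-module.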
+ + @property + def with_pts_backbone(self): + """bool: Whether the detector has a 3D backbone.""" + return hasattr(self, 'pts_backbone') and self.pts_backbone is not None + + @property + def with_fusion(self): + """bool: Whether the detector has a fusion layer.""" + return hasattr(self, + 'pts_fusion_layer') and self.fusion_layer is not None + + @property + def with_img_neck(self): + """bool: Whether the detector has a neck in image branch.""" + return hasattr(self, 'img_neck') and self.img_neck is not None + + @property + def with_pts_neck(self): + """bool: Whether the detector has a neck in 3D detector branch.""" + return hasattr(self, 'pts_neck') and self.pts_neck is not None + + @property + def with_img_rpn(self): + """bool: Whether the detector has a 2D RPN in image detector branch.""" + return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None + + @property + def with_img_roi_head(self): + """bool: Whether the detector has a RoI Head in image branch.""" + return hasattr(self, 'img_roi_head') and self.img_roi_head is not None + + @property + def with_voxel_encoder(self): + """bool: Whether the detector has a voxel encoder.""" + return hasattr(self, + 'voxel_encoder') and self.voxel_encoder is not None + + @property + def with_middle_encoder(self): + """bool: Whether the detector has a middle encoder.""" + return hasattr(self, + 'middle_encoder') and self.middle_encoder is not None + + def extract_img_feat(self, img, img_metas): + """Extract features of images.""" + if self.with_img_backbone and img is not None: + input_shape = img.shape[-2:] + # update real input shape of each single img + for img_meta in img_metas: + img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + img_feats = self.img_backbone(img) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + return img_feats + + def extract_pts_feat(self, pts, img_feats, img_metas): + """Extract features of points.""" + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, + img_feats, img_metas) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def extract_feat(self, points, img, img_metas): + """Extract features from images and points.""" + img_feats = self.extract_img_feat(img, img_metas) + pts_feats = self.extract_pts_feat(points, img_feats, img_metas) + return (img_feats, pts_feats) + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply dynamic voxelization to points. + + Args: + points (list[torch.Tensor]): Points of each sample. + + Returns: + tuple[torch.Tensor]: Concatenated points, number of points + per voxel, and coordinates. 
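+
+        Example (illustrative; ``pts_a``/``pts_b`` are placeholder point
+        tensors, shapes shown for a hard voxelization layer)::
+
+            >>> voxels, num_points, coors = self.voxelize([pts_a, pts_b])
+            >>> # voxels:     (M, max_points, point_dim)
+            >>> # num_points: (M,)
+            >>> # coors:      (M, 4), batch index padded in as column 0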
+ """ + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train(self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None): + """Forward training function. + + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + + Returns: + dict: Losses of different branches. + """ + img_feats, pts_feats = self.extract_feat( + points, img=img, img_metas=img_metas) + losses = dict() + if pts_feats: + losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, + gt_labels_3d, img_metas, + gt_bboxes_ignore) + losses.update(losses_pts) + if img_feats: + losses_img = self.forward_img_train( + img_feats, + img_metas=img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_bboxes_ignore=gt_bboxes_ignore, + proposals=proposals) + losses.update(losses_img) + return losses + + def forward_pts_train(self, + pts_feats, + gt_bboxes_3d, + gt_labels_3d, + img_metas, + gt_bboxes_ignore=None): + """Forward function for point cloud branch. + + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. + """ + outs = self.pts_bbox_head(pts_feats) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) + losses = self.pts_bbox_head.loss( + *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_img_train(self, + x, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None, + proposals=None, + **kwargs): + """Forward function for image branch. + + This function works similar to the forward function of Faster R-CNN. + + Args: + x (list[torch.Tensor]): Image features of shape (B, C, H, W) + of multiple levels. + img_metas (list[dict]): Meta information of images. + gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image + sample. 
+ gt_labels (list[torch.Tensor]): Ground truth labels of boxes. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + proposals (list[torch.Tensor], optional): Proposals of each sample. + Defaults to None. + + Returns: + dict: Losses of each branch. + """ + losses = dict() + # RPN forward and loss + if self.with_img_rpn: + rpn_outs = self.img_rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, + self.train_cfg.img_rpn) + rpn_losses = self.img_rpn_head.loss( + *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get('img_rpn_proposal', + self.test_cfg.img_rpn) + proposal_inputs = rpn_outs + (img_metas, proposal_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # bbox head forward and loss + if self.with_img_bbox: + # bbox head forward and loss + img_roi_losses = self.img_roi_head.forward_train( + x, img_metas, proposal_list, gt_bboxes, gt_labels, + gt_bboxes_ignore, **kwargs) + losses.update(img_roi_losses) + + return losses + + def simple_test_img(self, x, img_metas, proposals=None, rescale=False): + """Test without augmentation.""" + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_metas, + self.test_cfg.img_rpn) + else: + proposal_list = proposals + + return self.img_roi_head.simple_test( + x, proposal_list, img_metas, rescale=rescale) + + def simple_test_rpn(self, x, img_metas, rpn_test_cfg): + """RPN test function.""" + rpn_outs = self.img_rpn_head(x) + proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + bbox_results = [ + bbox3d2result(bboxes, scores, labels) + for bboxes, scores, labels in bbox_list + ] + return bbox_results + + def simple_test(self, points, img_metas, img=None, rescale=False): + """Test function without augmentaiton.""" + img_feats, pts_feats = self.extract_feat( + points, img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + if pts_feats and self.with_pts_bbox: + bbox_pts = self.simple_test_pts( + pts_feats, img_metas, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict['pts_bbox'] = pts_bbox + if img_feats and self.with_img_bbox: + bbox_img = self.simple_test_img( + img_feats, img_metas, rescale=rescale) + for result_dict, img_bbox in zip(bbox_list, bbox_img): + result_dict['img_bbox'] = img_bbox + return bbox_list + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) + + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale) + bbox_list.update(pts_bbox=bbox_pts) + return [bbox_list] + + def extract_feats(self, points, img_metas, imgs=None): + """Extract point and image features of multiple samples.""" + if imgs is None: + imgs = [None] * len(img_metas) + img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs, + img_metas) + return img_feats, pts_feats + + def aug_test_pts(self, feats, img_metas, rescale=False): + """Test function of point cloud branch with augmentaiton.""" + # only 
support aug_test for one sample + aug_bboxes = [] + for x, img_meta in zip(feats, img_metas): + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes( + *outs, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) + for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, + self.pts_bbox_head.test_cfg) + return merged_bboxes + + def show_results(self, data, result, out_dir): + """Results visualization. + + Args: + data (dict): Input points and the information of the sample. + result (dict): Prediction results. + out_dir (str): Output directory of visualization result. + """ + for batch_id in range(len(result)): + if isinstance(data['points'][0], DC): + points = data['points'][0]._data[0][batch_id].numpy() + elif is_list_of(data['points'][0], torch.Tensor): + points = data['points'][0][batch_id] + else: + ValueError(f"Unsupported data type {type(data['points'][0])} " + f'for visualization!') + if isinstance(data['img_metas'][0], DC): + pts_filename = data['img_metas'][0]._data[0][batch_id][ + 'pts_filename'] + box_mode_3d = data['img_metas'][0]._data[0][batch_id][ + 'box_mode_3d'] + elif is_list_of(data['img_metas'][0], dict): + pts_filename = data['img_metas'][0][batch_id]['pts_filename'] + box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + file_name = osp.split(pts_filename)[-1].split('.')[0] + + assert out_dir is not None, 'Expect out_dir, got none.' + inds = result[batch_id]['pts_bbox']['scores_3d'] > 0.1 + pred_bboxes = result[batch_id]['pts_bbox']['boxes_3d'][inds] + + # for now we convert points and bbox into depth mode + if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d + == Box3DMode.LIDAR): + points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, + Coord3DMode.DEPTH) + pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, + Box3DMode.DEPTH) + elif box_mode_3d != Box3DMode.DEPTH: + ValueError( + f'Unsupported box_mode_3d {box_mode_3d} for convertion!') + + pred_bboxes = pred_bboxes.tensor.cpu().numpy() + show_result(points, None, pred_bboxes, out_dir, file_name) diff --git a/mmcv/models/detectors/single_stage.py b/mmcv/models/detectors/single_stage.py new file mode 100644 index 0000000..4e0748b --- /dev/null +++ b/mmcv/models/detectors/single_stage.py @@ -0,0 +1,234 @@ +import warnings + +import torch + +from mmcv.core.bbox.transforms import bbox2result +from mmcv.models import DETECTORS, build_backbone, build_head, build_neck +from ..builder import DETECTORS, build_backbone, build_head, build_neck +from .base import BaseDetector, Base3DDetector + + +@DETECTORS.register_module() +class SingleStageDetector(BaseDetector): + """Base class for single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. 
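+
+    Example (hypothetical config; the ``type`` names are placeholders and
+    must correspond to modules registered in this repo)::
+
+        model = dict(
+            type='SingleStageDetector',
+            backbone=dict(type='ResNet', depth=50, out_indices=(0, 1, 2, 3)),
+            neck=dict(type='FPN', in_channels=[256, 512, 1024, 2048],
+                      out_channels=256, num_outs=5),
+            bbox_head=dict(type='SomeDenseHead', num_classes=80),
+            train_cfg=None,
+            test_cfg=None)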
+ """ + + def __init__(self, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None): + super(SingleStageDetector, self).__init__(init_cfg) + if pretrained: + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + backbone.pretrained = pretrained + self.backbone = build_backbone(backbone) + if neck is not None: + self.neck = build_neck(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = build_head(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def extract_feat(self, img): + """Directly extract features from the backbone+neck.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def forward_dummy(self, img): + """Used for computing network flops. + + See `mmcvection/tools/analysis_tools/get_flops.py` + """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + return outs + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_ignore=None): + """ + Args: + img (Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + img_metas (list[dict]): A List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + :class:`mmcv.datasets.pipelines.Collect`. + gt_bboxes (list[Tensor]): Each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + super(SingleStageDetector, self).forward_train(img, img_metas) + x = self.extract_feat(img) + losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, + gt_labels, gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_metas, rescale=False): + """Test function without test-time augmentation. + + Args: + img (torch.Tensor): Images with shape (N, C, H, W). + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[list[np.ndarray]]: BBox results of each image and classes. + The outer list corresponds to each image. The inner list + corresponds to each class. + """ + feat = self.extract_feat(img) + results_list = self.bbox_head.simple_test( + feat, img_metas, rescale=rescale) + bbox_results = [ + bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) + for det_bboxes, det_labels in results_list + ] + return bbox_results + + def aug_test(self, imgs, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + imgs (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[list[np.ndarray]]: BBox results of each image and classes. + The outer list corresponds to each image. The inner list + corresponds to each class. 
+ """ + assert hasattr(self.bbox_head, 'aug_test'), \ + f'{self.bbox_head.__class__.__name__}' \ + ' does not support test-time augmentation' + + feats = self.extract_feats(imgs) + results_list = self.bbox_head.aug_test( + feats, img_metas, rescale=rescale) + bbox_results = [ + bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes) + for det_bboxes, det_labels in results_list + ] + return bbox_results + + def onnx_export(self, img, img_metas): + """Test function without test time augmentation. + + Args: + img (torch.Tensor): input images. + img_metas (list[dict]): List of image information. + + Returns: + tuple[Tensor, Tensor]: dets of shape [N, num_det, 5] + and class labels of shape [N, num_det]. + """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + # get origin input shape to support onnx dynamic shape + + # get shape as tensor + img_shape = torch._shape_as_tensor(img)[2:] + img_metas[0]['img_shape_for_onnx'] = img_shape + # get pad input shape to support onnx dynamic shape for exporting + # `CornerNet` and `CentripetalNet`, which 'pad_shape' is used + # for inference + img_metas[0]['pad_shape_for_onnx'] = img_shape + # TODO:move all onnx related code in bbox_head to onnx_export function + det_bboxes, det_labels = self.bbox_head.get_bboxes(*outs, img_metas) + + return det_bboxes, det_labels + +@DETECTORS.register_module() +class SingleStage3DDetector(Base3DDetector): + """SingleStage3DDetector. + + This class serves as a base class for single-stage 3D detectors. + + Args: + backbone (dict): Config dict of detector's backbone. + neck (dict, optional): Config dict of neck. Defaults to None. + bbox_head (dict, optional): Config dict of box head. Defaults to None. + train_cfg (dict, optional): Config dict of training hyper-parameters. + Defaults to None. + test_cfg (dict, optional): Config dict of test hyper-parameters. + Defaults to None. + pretrained (str, optional): Path of pretrained models. + Defaults to None. + """ + + def __init__(self, + backbone, + neck=None, + bbox_head=None, + train_cfg=None, + test_cfg=None, + init_cfg=None, + pretrained=None): + super(SingleStage3DDetector, self).__init__(init_cfg) + self.backbone = build_backbone(backbone) + if neck is not None: + self.neck = build_neck(neck) + bbox_head.update(train_cfg=train_cfg) + bbox_head.update(test_cfg=test_cfg) + self.bbox_head = build_head(bbox_head) + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + def forward_dummy(self, points): + """Used for computing network flops. + + See `mmcvection/tools/analysis_tools/get_flops.py` + """ + x = self.extract_feat(points) + try: + sample_mod = self.train_cfg.sample_mod + outs = self.bbox_head(x, sample_mod) + except AttributeError: + outs = self.bbox_head(x) + return outs + + def extract_feat(self, points, img_metas=None): + """Directly extract features from the backbone+neck. + + Args: + points (torch.Tensor): Input points. + """ + x = self.backbone(points) + if self.with_neck: + x = self.neck(x) + return x + + def extract_feats(self, points, img_metas): + """Extract features of multiple samples.""" + return [ + self.extract_feat(pts, img_meta) + for pts, img_meta in zip(points, img_metas) + ] + diff --git a/mmcv/models/detectors/single_stage_mono3d.py b/mmcv/models/detectors/single_stage_mono3d.py new file mode 100644 index 0000000..2b42072 --- /dev/null +++ b/mmcv/models/detectors/single_stage_mono3d.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch +from mmcv.parallel import DataContainer as DC +from os import path as osp + +from mmcv.core.bbox.structures.cam_box3d import CameraInstance3DBoxes +from mmcv.core.bbox.transforms import bbox3d2result +from mmcv.core.visualizer import show_multi_modality_result + + +# from mmcv.core import (CameraInstance3DBoxes, bbox3d2result, +# show_multi_modality_result) +from mmcv.models.builder import DETECTORS +from mmcv.models.detectors.single_stage import SingleStageDetector +from mmcv.utils import is_list_of +from mmcv.image import imread + + +@DETECTORS.register_module() +class SingleStageMono3DDetector(SingleStageDetector): + """Base class for monocular 3D single-stage detectors. + + Single-stage detectors directly and densely predict bounding boxes on the + output features of the backbone+neck. + """ + + def extract_feats(self, imgs): + """Directly extract features from the backbone+neck.""" + assert isinstance(imgs, list) + return [self.extract_feat(img) for img in imgs] + + def forward_train(self, + img, + img_metas, + gt_bboxes, + gt_labels, + gt_bboxes_3d, + gt_labels_3d, + centers2d, + depths, + attr_labels=None, + gt_bboxes_ignore=None): + """ + Args: + img (Tensor): Input images of shape (N, C, H, W). + Typically these should be mean centered and std scaled. + img_metas (list[dict]): A List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + :class:`mmcv.datasets.pipelines.Collect`. + gt_bboxes (list[Tensor]): Each item are the truth boxes for each + image in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): Class indices corresponding to each box + gt_bboxes_3d (list[Tensor]): Each item are the 3D truth boxes for + each image in [x, y, z, w, l, h, theta, vx, vy] format. + gt_labels_3d (list[Tensor]): 3D class indices corresponding to + each box. + centers2d (list[Tensor]): Projected 3D centers onto 2D images. + depths (list[Tensor]): Depth of projected centers on 2D images. + attr_labels (list[Tensor], optional): Attribute indices + corresponding to each box + gt_bboxes_ignore (None | list[Tensor]): Specify which bounding + boxes can be ignored when computing the loss. + + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + x = self.extract_feat(img) + losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, + gt_labels, gt_bboxes_3d, + gt_labels_3d, centers2d, depths, + attr_labels, gt_bboxes_ignore) + return losses + + def simple_test(self, img, img_metas, rescale=False): + """Test function without test time augmentation. + + Args: + imgs (list[torch.Tensor]): List of multiple images + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[list[np.ndarray]]: BBox results of each image and classes. + The outer list corresponds to each image. The inner list + corresponds to each class. 
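+
+        Note:
+            In this implementation the result is a list of dicts (one per
+            image) keyed by ``img_bbox``, plus ``img_bbox2d`` when
+            ``self.bbox_head.pred_bbox2d`` is True.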
+ """ + x = self.extract_feat(img) + outs = self.bbox_head(x) + bbox_outputs = self.bbox_head.get_bboxes( + *outs, img_metas, rescale=rescale) + + if self.bbox_head.pred_bbox2d: + from mmcv.core import bbox2result + bbox2d_img = [ + bbox2result(bboxes2d, labels, self.bbox_head.num_classes) + for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs + ] + bbox_outputs = [bbox_outputs[0][:-1]] + + bbox_img = [ + bbox3d2result(bboxes, scores, labels, attrs) + for bboxes, scores, labels, attrs in bbox_outputs + ] + + bbox_list = [dict() for i in range(len(img_metas))] + for result_dict, img_bbox in zip(bbox_list, bbox_img): + result_dict['img_bbox'] = img_bbox + if self.bbox_head.pred_bbox2d: + for result_dict, img_bbox2d in zip(bbox_list, bbox2d_img): + result_dict['img_bbox2d'] = img_bbox2d + return bbox_list + + def aug_test(self, imgs, img_metas, rescale=False): + """Test function with test time augmentation.""" + feats = self.extract_feats(imgs) + + # only support aug_test for one sample + outs_list = [self.bbox_head(x) for x in feats] + for i, img_meta in enumerate(img_metas): + if img_meta[0]['pcd_horizontal_flip']: + for j in range(len(outs_list[i])): # for each prediction + if outs_list[i][j][0] is None: + continue + for k in range(len(outs_list[i][j])): + # every stride of featmap + outs_list[i][j][k] = torch.flip( + outs_list[i][j][k], dims=[3]) + reg = outs_list[i][1] + for reg_feat in reg: + # offset_x + reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :] + # velo_x + if self.bbox_head.pred_velo: + reg_feat[:, 7, :, :] = -reg_feat[:, 7, :, :] + # rotation + reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi + + merged_outs = [] + for i in range(len(outs_list[0])): # for each prediction + merged_feats = [] + for j in range(len(outs_list[0][i])): + if outs_list[0][i][0] is None: + merged_feats.append(None) + continue + # for each stride of featmap + avg_feats = torch.mean( + torch.cat([x[i][j] for x in outs_list]), + dim=0, + keepdim=True) + if i == 1: # regression predictions + # rot/velo/2d det keeps the original + avg_feats[:, 6:, :, :] = \ + outs_list[0][i][j][:, 6:, :, :] + if i == 2: + # dir_cls keeps the original + avg_feats = outs_list[0][i][j] + merged_feats.append(avg_feats) + merged_outs.append(merged_feats) + merged_outs = tuple(merged_outs) + + bbox_outputs = self.bbox_head.get_bboxes( + *merged_outs, img_metas[0], rescale=rescale) + if self.bbox_head.pred_bbox2d: + from mmcv.core import bbox2result + bbox2d_img = [ + bbox2result(bboxes2d, labels, self.bbox_head.num_classes) + for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs + ] + bbox_outputs = [bbox_outputs[0][:-1]] + + bbox_img = [ + bbox3d2result(bboxes, scores, labels, attrs) + for bboxes, scores, labels, attrs in bbox_outputs + ] + + bbox_list = dict() + bbox_list.update(img_bbox=bbox_img[0]) + if self.bbox_head.pred_bbox2d: + bbox_list.update(img_bbox2d=bbox2d_img[0]) + + return [bbox_list] + + def show_results(self, data, result, out_dir): + """Results visualization. + + Args: + data (list[dict]): Input images and the information of the sample. + result (list[dict]): Prediction results. + out_dir (str): Output directory of visualization result. 
+ """ + for batch_id in range(len(result)): + if isinstance(data['img_metas'][0], DC): + img_filename = data['img_metas'][0]._data[0][batch_id][ + 'filename'] + cam2img = data['img_metas'][0]._data[0][batch_id]['cam2img'] + elif is_list_of(data['img_metas'][0], dict): + img_filename = data['img_metas'][0][batch_id]['filename'] + cam2img = data['img_metas'][0][batch_id]['cam2img'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + img = imread(img_filename) + file_name = osp.split(img_filename)[-1].split('.')[0] + + assert out_dir is not None, 'Expect out_dir, got none.' + + pred_bboxes = result[batch_id]['img_bbox']['boxes_3d'] + assert isinstance(pred_bboxes, CameraInstance3DBoxes), \ + f'unsupported predicted bbox type {type(pred_bboxes)}' + + show_multi_modality_result( + img, + None, + pred_bboxes, + cam2img, + out_dir, + file_name, + 'camera', + show=True) diff --git a/mmcv/models/detectors/uniad_e2e.py b/mmcv/models/detectors/uniad_e2e.py new file mode 100644 index 0000000..1e0bfc8 --- /dev/null +++ b/mmcv/models/detectors/uniad_e2e.py @@ -0,0 +1,385 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +from mmcv.utils import auto_fp16 +from mmcv.models import DETECTORS +import copy +import os +from ..dense_heads.seg_head_plugin import IOU +from .uniad_track import UniADTrack +from mmcv.models.builder import build_head + +@DETECTORS.register_module() +class UniAD(UniADTrack): + """ + UniAD: Unifying Detection, Tracking, Segmentation, Motion Forecasting, Occupancy Prediction and Planning for Autonomous Driving + """ + def __init__( + self, + seg_head=None, + motion_head=None, + occ_head=None, + planning_head=None, + task_loss_weight=dict( + track=1.0, + map=1.0, + motion=1.0, + occ=1.0, + planning=1.0 + ), + **kwargs, + ): + super(UniAD, self).__init__(**kwargs) + if seg_head: + self.seg_head = build_head(seg_head) + if occ_head: + self.occ_head = build_head(occ_head) + if motion_head: + self.motion_head = build_head(motion_head) + if planning_head: + self.planning_head = build_head(planning_head) + + self.task_loss_weight = task_loss_weight + assert set(task_loss_weight.keys()) == \ + {'track', 'occ', 'motion', 'map', 'planning'} + + @property + def with_planning_head(self): + return hasattr(self, 'planning_head') and self.planning_head is not None + + @property + def with_occ_head(self): + return hasattr(self, 'occ_head') and self.occ_head is not None + + @property + def with_motion_head(self): + return hasattr(self, 'motion_head') and self.motion_head is not None + + @property + def with_seg_head(self): + return hasattr(self, 'seg_head') and self.seg_head is not None + + def forward_dummy(self, img): + dummy_metas = None + return self.forward_test(img=img, img_metas=[[dummy_metas]]) + + def forward(self, inputs, return_loss=True, rescale=False): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. 
list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + losses = self.forward_train(**inputs) + loss, log_vars = self._parse_losses(losses) + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(inputs['img_metas'])) + return outputs + else: + outputs = self.forward_test(**inputs, rescale=rescale) + return outputs + + # Add the subtask loss to the whole model loss + @auto_fp16(apply_to=('img', 'points')) + def forward_train(self, + img=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_inds=None, + l2g_t=None, + l2g_r_mat=None, + timestamp=None, + gt_lane_labels=None, + gt_lane_bboxes=None, + gt_lane_masks=None, + gt_fut_traj=None, + gt_fut_traj_mask=None, + gt_past_traj=None, + gt_past_traj_mask=None, + gt_sdc_bbox=None, + gt_sdc_label=None, + gt_sdc_fut_traj=None, + gt_sdc_fut_traj_mask=None, + + # Occ_gt + gt_segmentation=None, + gt_instance=None, + gt_occ_img_is_valid=None, + + #planning + sdc_planning=None, + sdc_planning_mask=None, + command=None, + + # fut gt for planning + gt_future_boxes=None, + **kwargs, # [1, 9] + ): + """Forward training function for the model that includes multiple tasks, such as tracking, segmentation, motion prediction, occupancy prediction, and planning. + + Args: + img (torch.Tensor, optional): Tensor containing images of each sample with shape (N, C, H, W). Defaults to None. + img_metas (list[dict], optional): List of dictionaries containing meta information for each sample. Defaults to None. + gt_bboxes_3d (list[:obj:BaseInstance3DBoxes], optional): List of ground truth 3D bounding boxes for each sample. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): List of tensors containing ground truth labels for 3D bounding boxes. Defaults to None. + gt_inds (list[torch.Tensor], optional): List of tensors containing indices of ground truth objects. Defaults to None. + l2g_t (list[torch.Tensor], optional): List of tensors containing translation vectors from local to global coordinates. Defaults to None. + l2g_r_mat (list[torch.Tensor], optional): List of tensors containing rotation matrices from local to global coordinates. Defaults to None. + timestamp (list[float], optional): List of timestamps for each sample. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): List of tensors containing ground truth 2D bounding boxes in images to be ignored. Defaults to None. + gt_lane_labels (list[torch.Tensor], optional): List of tensors containing ground truth lane labels. Defaults to None. + gt_lane_bboxes (list[torch.Tensor], optional): List of tensors containing ground truth lane bounding boxes. Defaults to None. + gt_lane_masks (list[torch.Tensor], optional): List of tensors containing ground truth lane masks. Defaults to None. + gt_fut_traj (list[torch.Tensor], optional): List of tensors containing ground truth future trajectories. Defaults to None. + gt_fut_traj_mask (list[torch.Tensor], optional): List of tensors containing ground truth future trajectory masks. Defaults to None. + gt_past_traj (list[torch.Tensor], optional): List of tensors containing ground truth past trajectories. Defaults to None. + gt_past_traj_mask (list[torch.Tensor], optional): List of tensors containing ground truth past trajectory masks. Defaults to None. + gt_sdc_bbox (list[torch.Tensor], optional): List of tensors containing ground truth self-driving car bounding boxes. Defaults to None. 
+ gt_sdc_label (list[torch.Tensor], optional): List of tensors containing ground truth self-driving car labels. Defaults to None. + gt_sdc_fut_traj (list[torch.Tensor], optional): List of tensors containing ground truth self-driving car future trajectories. Defaults to None. + gt_sdc_fut_traj_mask (list[torch.Tensor], optional): List of tensors containing ground truth self-driving car future trajectory masks. Defaults to None. + gt_segmentation (list[torch.Tensor], optional): List of tensors containing ground truth segmentation masks. Defaults to + gt_instance (list[torch.Tensor], optional): List of tensors containing ground truth instance segmentation masks. Defaults to None. + gt_occ_img_is_valid (list[torch.Tensor], optional): List of tensors containing binary flags indicating whether an image is valid for occupancy prediction. Defaults to None. + sdc_planning (list[torch.Tensor], optional): List of tensors containing self-driving car planning information. Defaults to None. + sdc_planning_mask (list[torch.Tensor], optional): List of tensors containing self-driving car planning masks. Defaults to None. + command (list[torch.Tensor], optional): List of tensors containing high-level command information for planning. Defaults to None. + gt_future_boxes (list[torch.Tensor], optional): List of tensors containing ground truth future bounding boxes for planning. Defaults to None. + gt_future_labels (list[torch.Tensor], optional): List of tensors containing ground truth future labels for planning. Defaults to None. + + Returns: + dict: Dictionary containing losses of different tasks, such as tracking, segmentation, motion prediction, occupancy prediction, and planning. Each key in the dictionary + is prefixed with the corresponding task name, e.g., 'track', 'map', 'motion', 'occ', and 'planning'. The values are the calculated losses for each task. 
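+
+        Note:
+            Each sub-task's losses are scaled by the matching entry of
+            ``task_loss_weight`` and key-prefixed (e.g. ``track.loss_cls``)
+            in ``loss_weighted_and_prefixed``; non-finite values are replaced
+            via ``torch.nan_to_num`` before returning.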
+ """ + losses = dict() + len_queue = img.size(1) + + + losses_track, outs_track = self.forward_track_train(img, gt_bboxes_3d, gt_labels_3d, gt_past_traj, gt_past_traj_mask, gt_inds, gt_sdc_bbox, gt_sdc_label, + l2g_t, l2g_r_mat, img_metas, timestamp) + losses_track = self.loss_weighted_and_prefixed(losses_track, prefix='track') + losses.update(losses_track) + + # Upsample bev for tiny version + outs_track = self.upsample_bev_if_tiny(outs_track) + + bev_embed = outs_track["bev_embed"] + bev_pos = outs_track["bev_pos"] + + img_metas = [each[len_queue-1] for each in img_metas] + + outs_seg = dict() + if self.with_seg_head: + losses_seg, outs_seg = self.seg_head.forward_train(bev_embed, img_metas, + gt_lane_labels, gt_lane_bboxes, gt_lane_masks) + + losses_seg = self.loss_weighted_and_prefixed(losses_seg, prefix='map') + losses.update(losses_seg) + + outs_motion = dict() + # Forward Motion Head + if self.with_motion_head: + ret_dict_motion = self.motion_head.forward_train(bev_embed, + gt_bboxes_3d, gt_labels_3d, + gt_fut_traj, gt_fut_traj_mask, + gt_sdc_fut_traj, gt_sdc_fut_traj_mask, + outs_track=outs_track, outs_seg=outs_seg + ) + losses_motion = ret_dict_motion["losses"] + outs_motion = ret_dict_motion["outs_motion"] + outs_motion['bev_pos'] = bev_pos + losses_motion = self.loss_weighted_and_prefixed(losses_motion, prefix='motion') + losses.update(losses_motion) + + # Forward Occ Head + if self.with_occ_head: + if outs_motion['track_query'].shape[1] == 0: + # TODO: rm hard code + outs_motion['track_query'] = torch.zeros((1, 1, 256)).to(bev_embed) + outs_motion['track_query_pos'] = torch.zeros((1,1, 256)).to(bev_embed) + outs_motion['traj_query'] = torch.zeros((3, 1, 1, 6, 256)).to(bev_embed) + outs_motion['all_matched_idxes'] = [[-1]] + losses_occ = self.occ_head.forward_train( + bev_embed, + outs_motion, + gt_inds_list=gt_inds, + gt_segmentation=gt_segmentation, + gt_instance=gt_instance, + gt_img_is_valid=gt_occ_img_is_valid, + ) + losses_occ = self.loss_weighted_and_prefixed(losses_occ, prefix='occ') + losses.update(losses_occ) + + + # Forward Plan Head + if self.with_planning_head: + outs_planning = self.planning_head.forward_train(bev_embed, outs_motion, sdc_planning, sdc_planning_mask, command, gt_future_boxes) + losses_planning = outs_planning['losses'] + losses_planning = self.loss_weighted_and_prefixed(losses_planning, prefix='planning') + losses.update(losses_planning) + + for k,v in losses.items(): + losses[k] = torch.nan_to_num(v) + return losses + + def loss_weighted_and_prefixed(self, loss_dict, prefix=''): + loss_factor = self.task_loss_weight[prefix] + loss_dict = {f"{prefix}.{k}" : v*loss_factor for k, v in loss_dict.items()} + return loss_dict + + def forward_test(self, + img=None, + img_metas=None, + l2g_t=None, + l2g_r_mat=None, + timestamp=None, + gt_lane_labels=None, + gt_lane_masks=None, + rescale=False, + # planning gt(for evaluation only) + sdc_planning=None, + sdc_planning_mask=None, + command=None, + + # Occ_gt (for evaluation only) + gt_segmentation=None, + gt_instance=None, + gt_occ_img_is_valid=None, + **kwargs, + ): + """Test function + """ + for var, name in [(img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError('{} must be a list, but got {}'.format( + name, type(var))) + img = [img] if img is None else img + + if self.prev_frame_num > 0: + if len(self.prev_frame_infos) < self.prev_frame_num: + self.prev_frame_info = { + "prev_bev": None, + "scene_token": None, + "prev_pos": 0, + "prev_angle": 0, + } + else: + self.prev_frame_info = 
self.prev_frame_infos.pop(0) + + if img_metas[0][0]['scene_token'] != self.prev_frame_info['scene_token']: + # the first sample of each scene is truncated + self.prev_frame_info['prev_bev'] = None + # update idx + self.prev_frame_info['scene_token'] = img_metas[0][0]['scene_token'] + + # do not use temporal information + if not self.video_test_mode: + self.prev_frame_info['prev_bev'] = None + + # Get the delta of ego position and angle between two timestamps. + tmp_pos = copy.deepcopy(img_metas[0][0]['can_bus'][:3]) + tmp_angle = copy.deepcopy(img_metas[0][0]['can_bus'][-1]) + # first frame + if self.prev_frame_info['scene_token'] is None: + img_metas[0][0]['can_bus'][:3] = 0 + img_metas[0][0]['can_bus'][-1] = 0 + # following frames + else: + img_metas[0][0]['can_bus'][:3] -= self.prev_frame_info['prev_pos'] + img_metas[0][0]['can_bus'][-1] -= self.prev_frame_info['prev_angle'] + self.prev_frame_info['prev_pos'] = tmp_pos + self.prev_frame_info['prev_angle'] = tmp_angle + + + + img = img[0] + img_metas = img_metas[0] + timestamp = timestamp[0] if timestamp is not None else None + + result = [dict() for i in range(len(img_metas))] + result_track = self.simple_test_track(img, l2g_t, l2g_r_mat, img_metas, timestamp) + + # Upsample bev for tiny model + result_track[0] = self.upsample_bev_if_tiny(result_track[0]) + + bev_embed = result_track[0]["bev_embed"] + + if self.prev_frame_num > 0: + self.prev_frame_infos.append(self.prev_frame_info) + + + + if self.with_seg_head: + result_seg = self.seg_head.forward_test(bev_embed, gt_lane_labels, gt_lane_masks, img_metas, rescale) + + if self.with_motion_head: + result_motion, outs_motion = self.motion_head.forward_test(bev_embed, outs_track=result_track[0], outs_seg=result_seg[0]) + outs_motion['bev_pos'] = result_track[0]['bev_pos'] + + outs_occ = dict() + if self.with_occ_head: + occ_no_query = outs_motion['track_query'].shape[1] == 0 + outs_occ = self.occ_head.forward_test( + bev_embed, + outs_motion, + no_query = occ_no_query, + gt_segmentation=gt_segmentation, + gt_instance=gt_instance, + gt_img_is_valid=gt_occ_img_is_valid, + ) + result[0]['occ'] = outs_occ + + if self.with_planning_head: + planning_gt=dict( + segmentation=gt_segmentation, + sdc_planning=sdc_planning, + sdc_planning_mask=sdc_planning_mask, + command=command + ) + result_planning = self.planning_head.forward_test(bev_embed, outs_motion, outs_occ, command) + result[0]['planning'] = dict( + planning_gt=planning_gt, + result_planning=result_planning, + ) + + pop_track_list = ['prev_bev', 'bev_pos', 'bev_embed', 'track_query_embeddings', 'sdc_embedding'] + result_track[0] = pop_elem_in_result(result_track[0], pop_track_list) + + if self.with_seg_head: + result_seg[0] = pop_elem_in_result(result_seg[0], pop_list=['pts_bbox', 'args_tuple']) + if self.with_motion_head: + result_motion[0] = pop_elem_in_result(result_motion[0]) + if self.with_occ_head: + result[0]['occ'] = pop_elem_in_result(result[0]['occ'], \ + pop_list=['seg_out_mask', 'flow_out', 'future_states_occ', 'pred_ins_masks', 'pred_raw_occ', 'pred_ins_logits', 'pred_ins_sigmoid']) + + for i, res in enumerate(result): + #res['token'] = img_metas[i]['sample_idx'] + res.update(result_track[i]) + if self.with_motion_head: + res.update(result_motion[i]) + if self.with_seg_head: + res.update(result_seg[i]) + + return result + + +def pop_elem_in_result(task_result:dict, pop_list:list=None): + all_keys = list(task_result.keys()) + for k in all_keys: + if k.endswith('query') or k.endswith('query_pos') or k.endswith('embedding'): + 
task_result.pop(k) + + if pop_list is not None: + for pop_k in pop_list: + task_result.pop(pop_k, None) + return task_result diff --git a/mmcv/models/detectors/uniad_track.py b/mmcv/models/detectors/uniad_track.py new file mode 100644 index 0000000..1989974 --- /dev/null +++ b/mmcv/models/detectors/uniad_track.py @@ -0,0 +1,869 @@ +#---------------------------------------------------------------------------------# +# UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # +# Source code: https://github.com/OpenDriveLab/UniAD # +# Copyright (c) OpenDriveLab. All rights reserved. # +#---------------------------------------------------------------------------------# + +import torch +import torch.nn as nn +from mmcv.utils import auto_fp16 +from mmcv.models import DETECTORS +from mmcv.core.bbox.coder import build_bbox_coder +from mmcv.models.detectors.mvx_two_stage import MVXTwoStageDetector +from mmcv.models.utils.grid_mask import GridMask +import copy +import math +from mmcv.core.bbox.util import normalize_bbox +from mmcv.models import build_loss +from einops import rearrange +from mmcv.models.utils.transformer import inverse_sigmoid +from ..dense_heads.track_head_plugin import MemoryBank, QueryInteractionModule, Instances, RuntimeTrackerBase + +@DETECTORS.register_module() +class UniADTrack(MVXTwoStageDetector): + """UniAD tracking part + """ + def __init__( + self, + use_grid_mask=False, + img_backbone=None, + img_neck=None, + pts_bbox_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + video_test_mode=False, + loss_cfg=None, + prev_frame_num=0, + qim_args=dict( + qim_type="QIMBase", + merger_dropout=0, + update_query_pos=False, + fp_ratio=0.3, + random_drop=0.1, + ), + mem_args=dict( + memory_bank_type="MemoryBank", + memory_bank_score_thresh=0.0, + memory_bank_len=4, + ), + bbox_coder=dict( + type="DETRTrack3DCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], + max_num=300, + num_classes=10, + score_threshold=0.0, + with_nms=False, + iou_thres=0.3, + ), + pc_range=None, + embed_dims=256, + num_query=900, + num_classes=10, + vehicle_id_list=None, + score_thresh=0.2, + filter_score_thresh=0.1, + miss_tolerance=5, + gt_iou_threshold=0.0, + freeze_img_backbone=False, + freeze_img_neck=False, + freeze_bn=False, + freeze_bev_encoder=False, + queue_length=3, + ): + super(UniADTrack, self).__init__( + img_backbone=img_backbone, + img_neck=img_neck, + pts_bbox_head=pts_bbox_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + ) + + self.grid_mask = GridMask( + True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7 + ) + self.use_grid_mask = use_grid_mask + self.fp16_enabled = False + self.embed_dims = embed_dims + self.num_query = num_query + self.num_classes = num_classes + self.vehicle_id_list = vehicle_id_list + self.pc_range = pc_range + self.queue_length = queue_length + if freeze_img_backbone: + if freeze_bn: + self.img_backbone.eval() + for param in self.img_backbone.parameters(): + param.requires_grad = False + + if freeze_img_neck: + if freeze_bn: + self.img_neck.eval() + for param in self.img_neck.parameters(): + param.requires_grad = False + + # temporal + self.video_test_mode = video_test_mode + assert self.video_test_mode + self.prev_frame_num = prev_frame_num + self.prev_frame_infos = [] + self.prev_frame_info = { + "prev_bev": None, + "scene_token": None, + "prev_pos": 0, + "prev_angle": 0, + } + self.query_embedding = 
nn.Embedding(self.num_query+1, self.embed_dims * 2) # the final one is ego query, which constantly models ego-vehicle + self.reference_points = nn.Linear(self.embed_dims, 3) + + self.mem_bank_len = mem_args["memory_bank_len"] + self.track_base = RuntimeTrackerBase( + score_thresh=score_thresh, + filter_score_thresh=filter_score_thresh, + miss_tolerance=miss_tolerance, + ) # hyper-param for removing inactive queries + + self.query_interact = QueryInteractionModule( + qim_args, + dim_in=embed_dims, + hidden_dim=embed_dims, + dim_out=embed_dims, + ) + + self.bbox_coder = build_bbox_coder(bbox_coder) + + self.memory_bank = MemoryBank( + mem_args, + dim_in=embed_dims, + hidden_dim=embed_dims, + dim_out=embed_dims, + ) + self.mem_bank_len = ( + 0 if self.memory_bank is None else self.memory_bank.max_his_length + ) + self.criterion = build_loss(loss_cfg) + self.test_track_instances = None + self.l2g_r_mat = None + self.l2g_t = None + self.gt_iou_threshold = gt_iou_threshold + self.bev_h, self.bev_w = self.pts_bbox_head.bev_h, self.pts_bbox_head.bev_w + self.freeze_bev_encoder = freeze_bev_encoder + + def extract_img_feat(self, img, len_queue=None): + """Extract features of images.""" + if img is None: + return None + assert img.dim() == 5 + B, N, C, H, W = img.size() + img = img.reshape(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + + img_feats_reshaped = [] + for img_feat in img_feats: + _, c, h, w = img_feat.size() + if len_queue is not None: + img_feat_reshaped = img_feat.view(B//len_queue, len_queue, N, c, h, w) + else: + img_feat_reshaped = img_feat.view(B, N, c, h, w) + img_feats_reshaped.append(img_feat_reshaped) + return img_feats_reshaped + + def _generate_empty_tracks(self): + track_instances = Instances((1, 1)) + num_queries, dim = self.query_embedding.weight.shape # (300, 256 * 2) + device = self.query_embedding.weight.device + query = self.query_embedding.weight + track_instances.ref_pts = self.reference_points(query[..., : dim // 2]) + + # init boxes: xy, wl, z, h, sin, cos, vx, vy, vz + pred_boxes_init = torch.zeros( + (len(track_instances), 10), dtype=torch.float, device=device + ) + track_instances.query = query + + track_instances.output_embedding = torch.zeros( + (num_queries, dim >> 1), device=device + ) + + track_instances.obj_idxes = torch.full( + (len(track_instances),), -1, dtype=torch.long, device=device + ) + track_instances.matched_gt_idxes = torch.full( + (len(track_instances),), -1, dtype=torch.long, device=device + ) + track_instances.disappear_time = torch.zeros( + (len(track_instances),), dtype=torch.long, device=device + ) + + track_instances.iou = torch.zeros( + (len(track_instances),), dtype=torch.float, device=device + ) + track_instances.scores = torch.zeros( + (len(track_instances),), dtype=torch.float, device=device + ) + track_instances.track_scores = torch.zeros( + (len(track_instances),), dtype=torch.float, device=device + ) + # xy, wl, z, h, sin, cos, vx, vy, vz + track_instances.pred_boxes = pred_boxes_init + + track_instances.pred_logits = torch.zeros( + (len(track_instances), self.num_classes), dtype=torch.float, device=device + ) + + mem_bank_len = self.mem_bank_len + track_instances.mem_bank = torch.zeros( + (len(track_instances), mem_bank_len, dim // 2), + dtype=torch.float32, + device=device, + ) + track_instances.mem_padding_mask = torch.ones( + 
(len(track_instances), mem_bank_len), dtype=torch.bool, device=device + ) + track_instances.save_period = torch.zeros( + (len(track_instances),), dtype=torch.float32, device=device + ) + + return track_instances.to(self.query_embedding.weight.device) + + def velo_update( + self, ref_pts, velocity, l2g_r1, l2g_t1, l2g_r2, l2g_t2, time_delta + ): + """ + Args: + ref_pts (Tensor): (num_query, 3). in inevrse sigmoid space + velocity (Tensor): (num_query, 2). m/s + in lidar frame. vx, vy + global2lidar (np.Array) [4,4]. + Outs: + ref_pts (Tensor): (num_query, 3). in inevrse sigmoid space + """ + # print(l2g_r1.type(), l2g_t1.type(), ref_pts.type()) + + if isinstance(l2g_r1,list): + l2g_r1 = l2g_r1[0] + if isinstance(l2g_t1,list): + l2g_t1 = l2g_t1[0] + if isinstance(l2g_r2,list): + l2g_r2 = l2g_r2[0] + if isinstance(l2g_t2,list): + l2g_t2 = l2g_t2[0] + + l2g_r1 = l2g_r1.type(torch.float) + l2g_t1 = l2g_t1.type(torch.float) + l2g_t2 = l2g_t2.type(torch.float) + time_delta = time_delta.type(torch.float) + + num_query = ref_pts.size(0) + velo_pad_ = velocity.new_zeros((num_query, 1)) + velo_pad = torch.cat((velocity, velo_pad_), dim=-1) + + reference_points = ref_pts.sigmoid().clone() + pc_range = self.pc_range + reference_points[..., 0:1] = ( + reference_points[..., 0:1] * (pc_range[3] - pc_range[0]) + pc_range[0] + ) + reference_points[..., 1:2] = ( + reference_points[..., 1:2] * (pc_range[4] - pc_range[1]) + pc_range[1] + ) + reference_points[..., 2:3] = ( + reference_points[..., 2:3] * (pc_range[5] - pc_range[2]) + pc_range[2] + ) + + reference_points = reference_points + velo_pad * time_delta + + ref_pts = reference_points @ l2g_r1 + l2g_t1 - l2g_t2 + + g2l_r = torch.linalg.inv(l2g_r2).type(torch.float) + + ref_pts = ref_pts @ g2l_r + + ref_pts[..., 0:1] = (ref_pts[..., 0:1] - pc_range[0]) / ( + pc_range[3] - pc_range[0] + ) + ref_pts[..., 1:2] = (ref_pts[..., 1:2] - pc_range[1]) / ( + pc_range[4] - pc_range[1] + ) + ref_pts[..., 2:3] = (ref_pts[..., 2:3] - pc_range[2]) / ( + pc_range[5] - pc_range[2] + ) + + ref_pts = inverse_sigmoid(ref_pts) + + return ref_pts + + def _copy_tracks_for_loss(self, tgt_instances): + device = self.query_embedding.weight.device + track_instances = Instances((1, 1)) + + track_instances.obj_idxes = copy.deepcopy(tgt_instances.obj_idxes) + + track_instances.matched_gt_idxes = copy.deepcopy(tgt_instances.matched_gt_idxes) + track_instances.disappear_time = copy.deepcopy(tgt_instances.disappear_time) + + track_instances.scores = torch.zeros( + (len(track_instances),), dtype=torch.float, device=device + ) + track_instances.track_scores = torch.zeros( + (len(track_instances),), dtype=torch.float, device=device + ) + track_instances.pred_boxes = torch.zeros( + (len(track_instances), 10), dtype=torch.float, device=device + ) + track_instances.iou = torch.zeros( + (len(track_instances),), dtype=torch.float, device=device + ) + track_instances.pred_logits = torch.zeros( + (len(track_instances), self.num_classes), dtype=torch.float, device=device + ) + + track_instances.save_period = copy.deepcopy(tgt_instances.save_period) + return track_instances.to(device) + + def get_history_bev(self, imgs_queue, img_metas_list): + """ + Get history BEV features iteratively. To save GPU memory, gradients are not calculated. 
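+
+        Args:
+            imgs_queue (Tensor): History frames, shape
+                (bs, len_queue, num_cams, C, H, W).
+            img_metas_list (list[list[dict]]): Per-sample list of per-frame
+                image metas, indexed as img_metas_list[sample][frame].
+        Returns:
+            Tensor: BEV features of the last history frame, used as
+                ``prev_bev`` when encoding the current frame.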
+ """ + self.eval() + with torch.no_grad(): + prev_bev = None + bs, len_queue, num_cams, C, H, W = imgs_queue.shape + imgs_queue = imgs_queue.reshape(bs * len_queue, num_cams, C, H, W) + img_feats_list = self.extract_img_feat(img=imgs_queue, len_queue=len_queue) + for i in range(len_queue): + img_metas = [each[i] for each in img_metas_list] + img_feats = [each_scale[:, i] for each_scale in img_feats_list] + prev_bev, _ = self.pts_bbox_head.get_bev_features( + mlvl_feats=img_feats, + img_metas=img_metas, + prev_bev=prev_bev) + self.train() + return prev_bev + + # Generate bev using bev_encoder in BEVFormer + def get_bevs(self, imgs, img_metas, prev_img=None, prev_img_metas=None, prev_bev=None): + if prev_img is not None and prev_img_metas is not None: + assert prev_bev is None + prev_bev = self.get_history_bev(prev_img, prev_img_metas) + + img_feats = self.extract_img_feat(img=imgs) + if self.freeze_bev_encoder: + with torch.no_grad(): + bev_embed, bev_pos = self.pts_bbox_head.get_bev_features( + mlvl_feats=img_feats, img_metas=img_metas, prev_bev=prev_bev) + else: + bev_embed, bev_pos = self.pts_bbox_head.get_bev_features( + mlvl_feats=img_feats, img_metas=img_metas, prev_bev=prev_bev) + + if bev_embed.shape[1] == self.bev_h * self.bev_w: + bev_embed = bev_embed.permute(1, 0, 2) + + assert bev_embed.shape[0] == self.bev_h * self.bev_w + return bev_embed, bev_pos + + @auto_fp16(apply_to=("img", "prev_bev")) + def _forward_single_frame_train( + self, + img, + img_metas, + track_instances, + prev_img, + prev_img_metas, + l2g_r1=None, + l2g_t1=None, + l2g_r2=None, + l2g_t2=None, + time_delta=None, + all_query_embeddings=None, + all_matched_indices=None, + all_instances_pred_logits=None, + all_instances_pred_boxes=None, + ): + """ + Perform forward only on one frame. Called in forward_train + Warnning: Only Support BS=1 + Args: + img: shape [B, num_cam, 3, H, W] + if l2g_r2 is None or l2g_t2 is None: + it means this frame is the end of the training clip, + so no need to call velocity update + """ + # NOTE: You can replace BEVFormer with other BEV encoder and provide bev_embed here + bev_embed, bev_pos = self.get_bevs( + img, img_metas, + prev_img=prev_img, prev_img_metas=prev_img_metas, + ) + + det_output = self.pts_bbox_head.get_detections( + bev_embed, + object_query_embeds=track_instances.query, + ref_points=track_instances.ref_pts, + img_metas=img_metas, + ) + + output_classes = det_output["all_cls_scores"] + output_coords = det_output["all_bbox_preds"] + output_past_trajs = det_output["all_past_traj_preds"] + last_ref_pts = det_output["last_ref_points"] + query_feats = det_output["query_feats"] + + out = { + "pred_logits": output_classes[-1], + "pred_boxes": output_coords[-1], + "pred_past_trajs": output_past_trajs[-1], + "ref_pts": last_ref_pts, + "bev_embed": bev_embed, + "bev_pos": bev_pos + } + with torch.no_grad(): + track_scores = output_classes[-1, 0, :].sigmoid().max(dim=-1).values + + # Step-1 Update track instances with current prediction + # [nb_dec, bs, num_query, xxx] + nb_dec = output_classes.size(0) + + # the track id will be assigned by the matcher. 
+ track_instances_list = [ + self._copy_tracks_for_loss(track_instances) for i in range(nb_dec - 1) + ] + track_instances.output_embedding = query_feats[-1][0] # [300, feat_dim] + velo = output_coords[-1, 0, :, -2:] # [num_query, 3] + if l2g_r2 is not None: + # Update ref_pts for next frame considering each agent's velocity + ref_pts = self.velo_update( + last_ref_pts[0], + velo, + l2g_r1, + l2g_t1, + l2g_r2, + l2g_t2, + time_delta=time_delta, + ) + else: + ref_pts = last_ref_pts[0] + + dim = track_instances.query.shape[-1] + track_instances.ref_pts = self.reference_points(track_instances.query[..., :dim//2]) + track_instances.ref_pts[...,:2] = ref_pts[...,:2] + + track_instances_list.append(track_instances) + + for i in range(nb_dec): + track_instances = track_instances_list[i] + + track_instances.scores = track_scores + track_instances.pred_logits = output_classes[i, 0] # [300, num_cls] + track_instances.pred_boxes = output_coords[i, 0] # [300, box_dim] + track_instances.pred_past_trajs = output_past_trajs[i, 0] # [300,past_steps, 2] + + out["track_instances"] = track_instances + track_instances, matched_indices = self.criterion.match_for_single_frame( + out, i, if_step=(i == (nb_dec - 1)) + ) + all_query_embeddings.append(query_feats[i][0]) + all_matched_indices.append(matched_indices) + all_instances_pred_logits.append(output_classes[i, 0]) + all_instances_pred_boxes.append(output_coords[i, 0]) # Not used + + active_index = (track_instances.obj_idxes>=0) & (track_instances.iou >= self.gt_iou_threshold) & (track_instances.matched_gt_idxes >=0) + out.update(self.select_active_track_query(track_instances, active_index, img_metas)) + out.update(self.select_sdc_track_query(track_instances[900], img_metas)) + + # memory bank + if self.memory_bank is not None: + track_instances = self.memory_bank(track_instances) + # Step-2 Update track instances using matcher + + tmp = {} + tmp["init_track_instances"] = self._generate_empty_tracks() + tmp["track_instances"] = track_instances + out_track_instances = self.query_interact(tmp) + out["track_instances"] = out_track_instances + return out + + def select_active_track_query(self, track_instances, active_index, img_metas, with_mask=True): + result_dict = self._track_instances2results(track_instances[active_index], img_metas, with_mask=with_mask) + result_dict["track_query_embeddings"] = track_instances.output_embedding[active_index][result_dict['bbox_index']][result_dict['mask']] + result_dict["track_query_matched_idxes"] = track_instances.matched_gt_idxes[active_index][result_dict['bbox_index']][result_dict['mask']] + return result_dict + + def select_sdc_track_query(self, sdc_instance, img_metas): + out = dict() + result_dict = self._track_instances2results(sdc_instance, img_metas, with_mask=False) + out["sdc_boxes_3d"] = result_dict['boxes_3d'] + out["sdc_scores_3d"] = result_dict['scores_3d'] + out["sdc_track_scores"] = result_dict['track_scores'] + out["sdc_track_bbox_results"] = result_dict['track_bbox_results'] + out["sdc_embedding"] = sdc_instance.output_embedding[0] + return out + + @auto_fp16(apply_to=("img", "points")) + def forward_track_train(self, + img, + gt_bboxes_3d, + gt_labels_3d, + gt_past_traj, + gt_past_traj_mask, + gt_inds, + gt_sdc_bbox, + gt_sdc_label, + l2g_t, + l2g_r_mat, + img_metas, + timestamp): + """Forward funciton + Args: + Returns: + """ + track_instances = self._generate_empty_tracks() + num_frame = img.size(1) + # init gt instances! 
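+        # Build one Instances object per frame of the training clip: GT boxes
+        # and the ego (SDC) box are converted by normalize_bbox into the same
+        # normalized box encoding used by pred_boxes (see
+        # _generate_empty_tracks), and labels, instance ids and past
+        # trajectories are attached.  The whole clip is handed to the
+        # criterion at once via initialize_for_single_clip so that object
+        # identities can be matched across frames.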
+ gt_instances_list = [] + + for i in range(num_frame): + gt_instances = Instances((1, 1)) + boxes = gt_bboxes_3d[0][i].tensor.to(img.device) + # normalize gt bboxes here! + boxes = normalize_bbox(boxes, self.pc_range) + sd_boxes = gt_sdc_bbox[0][i].tensor.to(img.device) + sd_boxes = normalize_bbox(sd_boxes, self.pc_range) + gt_instances.boxes = boxes + gt_instances.labels = gt_labels_3d[0][i] + gt_instances.obj_ids = gt_inds[0][i] + gt_instances.past_traj = gt_past_traj[0][i].float() + gt_instances.past_traj_mask = gt_past_traj_mask[0][i].float() + gt_instances.sdc_boxes = torch.cat([sd_boxes for _ in range(boxes.shape[0])], dim=0) # boxes.shape[0] sometimes 0 + gt_instances.sdc_labels = torch.cat([gt_sdc_label[0][i] for _ in range(gt_labels_3d[0][i].shape[0])], dim=0) + gt_instances_list.append(gt_instances) + + self.criterion.initialize_for_single_clip(gt_instances_list) + + out = dict() + + for i in range(num_frame): + prev_img = img[:, :i, ...] if i != 0 else img[:, :1, ...] + prev_img_metas = copy.deepcopy(img_metas) + # TODO: Generate prev_bev in an RNN way. + + img_single = torch.stack([img_[i] for img_ in img], dim=0) + img_metas_single = [copy.deepcopy(img_metas[0][i])] + if i == num_frame - 1: + l2g_r2 = None + l2g_t2 = None + time_delta = None + else: + l2g_r2 = l2g_r_mat[0][i + 1] + l2g_t2 = l2g_t[0][i + 1] + time_delta = timestamp[0][i + 1] - timestamp[0][i] + all_query_embeddings = [] + all_matched_idxes = [] + all_instances_pred_logits = [] + all_instances_pred_boxes = [] + frame_res = self._forward_single_frame_train( + img_single, + img_metas_single, + track_instances, + prev_img, + prev_img_metas, + l2g_r_mat[0][i], + l2g_t[0][i], + l2g_r2, + l2g_t2, + time_delta, + all_query_embeddings, + all_matched_idxes, + all_instances_pred_logits, + all_instances_pred_boxes, + ) + # all_query_embeddings: len=dec nums, N*256 + # all_matched_idxes: len=dec nums, N*2 + track_instances = frame_res["track_instances"] + + get_keys = ["bev_embed", "bev_pos", + "track_query_embeddings", "track_query_matched_idxes", "track_bbox_results", + "sdc_boxes_3d", "sdc_scores_3d", "sdc_track_scores", "sdc_track_bbox_results", "sdc_embedding"] + out.update({k: frame_res[k] for k in get_keys}) + + losses = self.criterion.losses_dict + return losses, out + + def upsample_bev_if_tiny(self, outs_track): + if outs_track["bev_embed"].size(0) == 100 * 100: + # For tiny model + # bev_emb + bev_embed = outs_track["bev_embed"] # [10000, 1, 256] + dim, _, _ = bev_embed.size() + w = h = int(math.sqrt(dim)) + assert h == w == 100 + + bev_embed = rearrange(bev_embed, '(h w) b c -> b c h w', h=h, w=w) # [1, 256, 100, 100] + bev_embed = nn.Upsample(scale_factor=2)(bev_embed) # [1, 256, 200, 200] + bev_embed = rearrange(bev_embed, 'b c h w -> (h w) b c') + outs_track["bev_embed"] = bev_embed + + # prev_bev + prev_bev = outs_track.get("prev_bev", None) + if prev_bev is not None: + if self.training: + # [1, 10000, 256] + prev_bev = rearrange(prev_bev, 'b (h w) c -> b c h w', h=h, w=w) + prev_bev = nn.Upsample(scale_factor=2)(prev_bev) # [1, 256, 200, 200] + prev_bev = rearrange(prev_bev, 'b c h w -> b (h w) c') + outs_track["prev_bev"] = prev_bev + else: + # [10000, 1, 256] + prev_bev = rearrange(prev_bev, '(h w) b c -> b c h w', h=h, w=w) + prev_bev = nn.Upsample(scale_factor=2)(prev_bev) # [1, 256, 200, 200] + prev_bev = rearrange(prev_bev, 'b c h w -> (h w) b c') + outs_track["prev_bev"] = prev_bev + + # bev_pos + bev_pos = outs_track["bev_pos"] # [1, 256, 100, 100] + bev_pos = nn.Upsample(scale_factor=2)(bev_pos) # 
[1, 256, 200, 200] + outs_track["bev_pos"] = bev_pos + return outs_track + + + def _forward_single_frame_inference( + self, + img, + img_metas, + track_instances, + prev_bev=None, + l2g_r1=None, + l2g_t1=None, + l2g_r2=None, + l2g_t2=None, + time_delta=None, + ): + """ + img: B, num_cam, C, H, W = img.shape + """ + + """ velo update """ + active_inst = track_instances[track_instances.obj_idxes >= 0] + other_inst = track_instances[track_instances.obj_idxes < 0] + + if l2g_r2 is not None and len(active_inst) > 0 and l2g_r1 is not None: + ref_pts = active_inst.ref_pts + velo = active_inst.pred_boxes[:, -2:] + ref_pts = self.velo_update( + ref_pts, velo, l2g_r1, l2g_t1, l2g_r2, l2g_t2, time_delta=time_delta + ) + ref_pts = ref_pts.squeeze(0) + dim = active_inst.query.shape[-1] + active_inst.ref_pts = self.reference_points(active_inst.query[..., :dim//2]) + active_inst.ref_pts[...,:2] = ref_pts[...,:2] + + track_instances = Instances.cat([other_inst, active_inst]) + + # NOTE: You can replace BEVFormer with other BEV encoder and provide bev_embed here + bev_embed, bev_pos = self.get_bevs(img, img_metas, prev_bev=prev_bev) + det_output = self.pts_bbox_head.get_detections( + bev_embed, + object_query_embeds=track_instances.query, + ref_points=track_instances.ref_pts, + img_metas=img_metas, + ) + output_classes = det_output["all_cls_scores"] + output_coords = det_output["all_bbox_preds"] + last_ref_pts = det_output["last_ref_points"] + query_feats = det_output["query_feats"] + + out = { + "pred_logits": output_classes, + "pred_boxes": output_coords, + "ref_pts": last_ref_pts, + "bev_embed": bev_embed, + "query_embeddings": query_feats, + "all_past_traj_preds": det_output["all_past_traj_preds"], + "bev_pos": bev_pos, + } + + """ update track instances with predict results """ + track_scores = output_classes[-1, 0, :].sigmoid().max(dim=-1).values + # each track will be assigned an unique global id by the track base. 
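+        # Roughly: track_base assigns a fresh global id to queries whose
+        # score exceeds score_thresh, keeps existing tracks above
+        # filter_score_thresh, and otherwise increments disappear_time,
+        # dropping a track once it has been missed for miss_tolerance
+        # consecutive frames.  Query index 900 is reserved as the ego (SDC)
+        # query and is tagged with obj_idx -2 below so it is handled
+        # separately from real tracks.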
+ track_instances.scores = track_scores + # track_instances.track_scores = track_scores # [300] + track_instances.pred_logits = output_classes[-1, 0] # [300, num_cls] + track_instances.pred_boxes = output_coords[-1, 0] # [300, box_dim] + track_instances.output_embedding = query_feats[-1][0] # [300, feat_dim] + track_instances.ref_pts = last_ref_pts[0] + # hard_code: assume the 901 query is sdc query + track_instances.obj_idxes[900] = -2 + """ update track base """ + self.track_base.update(track_instances, None) + + active_index = (track_instances.obj_idxes>=0) & (track_instances.scores >= self.track_base.filter_score_thresh) # filter out sleep objects + out.update(self.select_active_track_query(track_instances, active_index, img_metas)) + out.update(self.select_sdc_track_query(track_instances[track_instances.obj_idxes==-2], img_metas)) + + """ update with memory_bank """ + if self.memory_bank is not None: + track_instances = self.memory_bank(track_instances) + + """ Update track instances using matcher """ + tmp = {} + tmp["init_track_instances"] = self._generate_empty_tracks() + tmp["track_instances"] = track_instances + out_track_instances = self.query_interact(tmp) + out["track_instances_fordet"] = track_instances + out["track_instances"] = out_track_instances + out["track_obj_idxes"] = track_instances.obj_idxes + return out + + def simple_test_track( + self, + img=None, + l2g_t=None, + l2g_r_mat=None, + img_metas=None, + timestamp=None, + ): + """only support bs=1 and sequential input""" + + bs = img.size(0) + # img_metas = img_metas[0] + + """ init track instances for first frame """ + if ( + self.test_track_instances is None + or img_metas[0]["scene_token"] != self.scene_token + ): + self.timestamp = timestamp + self.scene_token = img_metas[0]["scene_token"] + self.prev_bev = None + track_instances = self._generate_empty_tracks() + time_delta, l2g_r1, l2g_t1, l2g_r2, l2g_t2 = None, None, None, None, None + + else: + track_instances = self.test_track_instances + time_delta = timestamp - self.timestamp + l2g_r1 = self.l2g_r_mat + l2g_t1 = self.l2g_t + l2g_r2 = l2g_r_mat + l2g_t2 = l2g_t + self.prev_bev = self.prev_frame_info['prev_bev'] + + """ get time_delta and l2g r/t infos """ + """ update frame info for next frame""" + self.timestamp = timestamp + self.l2g_t = l2g_t + self.l2g_r_mat = l2g_r_mat + + """ predict and update """ + + prev_bev = self.prev_bev + frame_res = self._forward_single_frame_inference( + img, + img_metas, + track_instances, + prev_bev, + l2g_r1, + l2g_t1, + l2g_r2, + l2g_t2, + time_delta, + ) + + self.prev_bev = frame_res["bev_embed"] + self.prev_frame_info['prev_bev'] = self.prev_bev + track_instances = frame_res["track_instances"] + track_instances_fordet = frame_res["track_instances_fordet"] + + self.test_track_instances = track_instances + results = [dict()] + get_keys = ["bev_embed", "bev_pos", + "track_query_embeddings", "track_bbox_results", + "boxes_3d", "scores_3d", "labels_3d", "track_scores", "track_ids"] + if self.with_motion_head: + get_keys += ["sdc_boxes_3d", "sdc_scores_3d", "sdc_track_scores", "sdc_track_bbox_results", "sdc_embedding"] + results[0].update({k: frame_res[k] for k in get_keys}) + results = self._det_instances2results(track_instances_fordet, results, img_metas) + return results + + def _track_instances2results(self, track_instances, img_metas, with_mask=True): + bbox_dict = dict( + cls_scores=track_instances.pred_logits, + bbox_preds=track_instances.pred_boxes, + track_scores=track_instances.scores, + 
obj_idxes=track_instances.obj_idxes, + ) + # bboxes_dict = self.bbox_coder.decode(bbox_dict, with_mask=with_mask)[0] + bboxes_dict = self.bbox_coder.decode(bbox_dict, with_mask=with_mask, img_metas=img_metas)[0] + bboxes = bboxes_dict["bboxes"] + # bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 + bboxes = img_metas[0]["box_type_3d"](bboxes, 9) + labels = bboxes_dict["labels"] + scores = bboxes_dict["scores"] + bbox_index = bboxes_dict["bbox_index"] + + track_scores = bboxes_dict["track_scores"] + obj_idxes = bboxes_dict["obj_idxes"] + result_dict = dict( + boxes_3d=bboxes.to("cpu"), + scores_3d=scores.cpu(), + labels_3d=labels.cpu(), + track_scores=track_scores.cpu(), + bbox_index=bbox_index.cpu(), + track_ids=obj_idxes.cpu(), + mask=bboxes_dict["mask"].cpu(), + track_bbox_results=[[bboxes.to("cpu"), scores.cpu(), labels.cpu(), bbox_index.cpu(), bboxes_dict["mask"].cpu()]] + ) + return result_dict + + def _det_instances2results(self, instances, results, img_metas): + """ + Outs: + active_instances. keys: + - 'pred_logits': + - 'pred_boxes': normalized bboxes + - 'scores' + - 'obj_idxes' + out_dict. keys: + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + - track_ids + - tracking_score + """ + # filter out sleep querys + if instances.pred_logits.numel() == 0: + return [None] + bbox_dict = dict( + cls_scores=instances.pred_logits, + bbox_preds=instances.pred_boxes, + track_scores=instances.scores, + obj_idxes=instances.obj_idxes, + ) + bboxes_dict = self.bbox_coder.decode(bbox_dict, img_metas=img_metas)[0] + bboxes = bboxes_dict["bboxes"] + # import pdb;pdb.set_trace() + bboxes = img_metas[0]["box_type_3d"](bboxes, 9) + labels = bboxes_dict["labels"] + scores = bboxes_dict["scores"] + + track_scores = bboxes_dict["track_scores"] + obj_idxes = bboxes_dict["obj_idxes"] + result_dict = results[0] + result_dict_det = dict( + boxes_3d_det=bboxes.to("cpu"), + scores_3d_det=scores.cpu(), + labels_3d_det=labels.cpu(), + ) + if result_dict is not None: + result_dict.update(result_dict_det) + else: + result_dict = None + + return [result_dict] + diff --git a/mmcv/models/losses/__init__.py b/mmcv/models/losses/__init__.py new file mode 100644 index 0000000..cb85d01 --- /dev/null +++ b/mmcv/models/losses/__init__.py @@ -0,0 +1,20 @@ +from .focal_loss import FocalLoss, sigmoid_focal_loss +from .iou_loss import (BoundedIoULoss, CIoULoss, DIoULoss, GIoULoss, IoULoss, + bounded_iou_loss, iou_loss) +from .smooth_l1_loss import L1Loss, SmoothL1Loss, l1_loss, smooth_l1_loss + + +# __all__ = [ +# 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', +# 'mask_cross_entropy', 'CrossEntropyLoss', 'sigmoid_focal_loss', +# 'FocalLoss', 'smooth_l1_loss', 'SmoothL1Loss', 'balanced_l1_loss', +# 'BalancedL1Loss', 'mse_loss', 'MSELoss', 'iou_loss', 'bounded_iou_loss', +# 'IoULoss', 'BoundedIoULoss', 'GIoULoss', 'DIoULoss', 'CIoULoss', 'GHMC', +# 'GHMR', 'reduce_loss', 'weight_reduce_loss', 'weighted_loss', 'L1Loss', +# 'l1_loss', 'isr_p', 'carl_loss', 'AssociativeEmbeddingLoss', +# 'GaussianFocalLoss', 'QualityFocalLoss', 'DistributionFocalLoss', +# 'VarifocalLoss', 'KnowledgeDistillationKLDivLoss', 'SeesawLoss', +# 'ChamferDistance', 'chamfer_distance', 'axis_aligned_iou_loss', +# 'AxisAlignedIoULoss', 'PAConvRegularizationLoss', +# 'LovaszLoss' +# ] diff --git a/mmcv/models/losses/focal_loss.py b/mmcv/models/losses/focal_loss.py new file mode 100644 index 0000000..1212e42 --- 
/dev/null +++ b/mmcv/models/losses/focal_loss.py @@ -0,0 +1,181 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.ops.focal_loss import sigmoid_focal_loss as _sigmoid_focal_loss + +from ..builder import LOSSES +from .utils import weight_reduce_loss + + +# This method is only for debugging +def py_sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + """PyTorch version of `Focal Loss `_. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the + number of classes + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + pred_sigmoid = pred.sigmoid() + target = target.type_as(pred) + pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) + focal_weight = (alpha * target + (1 - alpha) * + (1 - target)) * pt.pow(gamma) + loss = F.binary_cross_entropy_with_logits( + pred, target, reduction='none') * focal_weight + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). + assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +def sigmoid_focal_loss(pred, + target, + weight=None, + gamma=2.0, + alpha=0.25, + reduction='mean', + avg_factor=None): + r"""A warpper of cuda version `Focal Loss + `_. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + """ + # Function.apply does not accept keyword arguments, so the decorator + # "weighted_loss" is not applicable + loss = _sigmoid_focal_loss(pred.contiguous(), target, gamma, alpha, None, + 'none') + if weight is not None: + if weight.shape != loss.shape: + if weight.size(0) == loss.size(0): + # For most cases, weight is of shape (num_priors, ), + # which means it does not have the second axis num_class + weight = weight.view(-1, 1) + else: + # Sometimes, weight per anchor per class is also needed. e.g. + # in FSAF. But it may be flattened of shape + # (num_priors x num_class, ), while loss is still of shape + # (num_priors, num_class). 
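+                # e.g. a weight flattened to (num_priors * num_class, ) is
+                # reshaped to (num_priors, num_class) below so it lines up
+                # with the per-class loss before reduction.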
+ assert weight.numel() == loss.numel() + weight = weight.view(loss.size(0), -1) + assert weight.ndim == loss.ndim + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@LOSSES.register_module() +class FocalLoss(nn.Module): + + def __init__(self, + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0): + """`Focal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + """ + super(FocalLoss, self).__init__() + assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.use_sigmoid: + if torch.cuda.is_available() and pred.is_cuda: + calculate_loss_func = sigmoid_focal_loss + else: + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, + target, + weight, + gamma=self.gamma, + alpha=self.alpha, + reduction=reduction, + avg_factor=avg_factor) + + else: + raise NotImplementedError + return loss_cls diff --git a/mmcv/models/losses/iou_loss.py b/mmcv/models/losses/iou_loss.py new file mode 100644 index 0000000..466e299 --- /dev/null +++ b/mmcv/models/losses/iou_loss.py @@ -0,0 +1,440 @@ +import math +import torch +import torch.nn as nn + +from mmcv.core.bbox.iou_calculators.iou3d_calculator import bbox_overlaps +from ..builder import LOSSES +from .utils import weighted_loss + + + +@weighted_loss +def iou_loss(pred, target, linear=False, eps=1e-6): + """IoU loss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + The loss is calculated as negative log of IoU. + + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + linear (bool, optional): If True, use linear scale of loss instead of + log scale. Default: False. + eps (float): Eps to avoid log(0). + + Return: + torch.Tensor: Loss tensor. 
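+
+    Example (illustrative; aligned boxes in (x1, y1, x2, y2) format):
+        >>> pred = torch.Tensor([[0., 0., 2., 2.]])
+        >>> target = torch.Tensor([[0., 0., 1., 2.]])
+        >>> iou_loss(pred, target)  # IoU = 0.5, loss = -log(0.5)
+        tensor(0.6931)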
+ """ + ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps) + if linear: + loss = 1 - ious + else: + loss = -ious.log() + return loss + + +@weighted_loss +def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3): + """BIoULoss. + + This is an implementation of paper + `Improving Object Localization with Fitness NMS and Bounded IoU Loss. + `_. + + Args: + pred (torch.Tensor): Predicted bboxes. + target (torch.Tensor): Target bboxes. + beta (float): beta parameter in smoothl1. + eps (float): eps to avoid NaN. + """ + pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5 + pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5 + pred_w = pred[:, 2] - pred[:, 0] + pred_h = pred[:, 3] - pred[:, 1] + with torch.no_grad(): + target_ctrx = (target[:, 0] + target[:, 2]) * 0.5 + target_ctry = (target[:, 1] + target[:, 3]) * 0.5 + target_w = target[:, 2] - target[:, 0] + target_h = target[:, 3] - target[:, 1] + + dx = target_ctrx - pred_ctrx + dy = target_ctry - pred_ctry + + loss_dx = 1 - torch.max( + (target_w - 2 * dx.abs()) / + (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx)) + loss_dy = 1 - torch.max( + (target_h - 2 * dy.abs()) / + (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy)) + loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w / + (target_w + eps)) + loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h / + (target_h + eps)) + loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh], + dim=-1).view(loss_dx.size(0), -1) + + loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta, + loss_comb - 0.5 * beta) + return loss + + + +@weighted_loss +def giou_loss(pred, target, eps=1e-7): + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps) + loss = 1 - gious + return loss + + +@weighted_loss +def diou_loss(pred, target, eps=1e-7): + r"""`Implementation of Distance-IoU Loss: Faster and Better + Learning for Bounding Box Regression, https://arxiv.org/abs/1911.08287`_. + + Code is modified from https://github.com/Zzh-tju/DIoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + Return: + Tensor: Loss tensor. 
+ """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + c2 = cw**2 + ch**2 + eps + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 + right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 + rho2 = left + right + + # DIoU + dious = ious - rho2 / c2 + loss = 1 - dious + return loss + + +@weighted_loss +def ciou_loss(pred, target, eps=1e-7): + r"""`Implementation of paper `Enhancing Geometric Factors into + Model Learning and Inference for Object Detection and Instance + Segmentation `_. + + Code is modified from https://github.com/Zzh-tju/CIoU. + + Args: + pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + Return: + Tensor: Loss tensor. + """ + # overlap + lt = torch.max(pred[:, :2], target[:, :2]) + rb = torch.min(pred[:, 2:], target[:, 2:]) + wh = (rb - lt).clamp(min=0) + overlap = wh[:, 0] * wh[:, 1] + + # union + ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) + ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) + union = ap + ag - overlap + eps + + # IoU + ious = overlap / union + + # enclose area + enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) + enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) + enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + + cw = enclose_wh[:, 0] + ch = enclose_wh[:, 1] + + c2 = cw**2 + ch**2 + eps + + b1_x1, b1_y1 = pred[:, 0], pred[:, 1] + b1_x2, b1_y2 = pred[:, 2], pred[:, 3] + b2_x1, b2_y1 = target[:, 0], target[:, 1] + b2_x2, b2_y2 = target[:, 2], target[:, 3] + + w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps + w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + + left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 + right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 + rho2 = left + right + + factor = 4 / math.pi**2 + v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + + # CIoU + cious = ious - (rho2 / c2 + v**2 / (1 - ious + v)) + loss = 1 - cious + return loss + + +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + + Args: + linear (bool): If True, use linear scale of loss instead of log scale. + Default: False. + eps (float): Eps to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + """ + + def __init__(self, + linear=False, + eps=1e-6, + reduction='mean', + loss_weight=1.0): + super(IoULoss, self).__init__() + self.linear = linear + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. 
+ target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if (weight is not None) and (not torch.any(weight > 0)) and ( + reduction != 'none'): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # iou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * iou_loss( + pred, + target, + weight, + linear=self.linear, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@LOSSES.register_module() +class BoundedIoULoss(nn.Module): + + def __init__(self, beta=0.2, eps=1e-3, reduction='mean', loss_weight=1.0): + super(BoundedIoULoss, self).__init__() + self.beta = beta + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * bounded_iou_loss( + pred, + target, + weight, + beta=self.beta, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@LOSSES.register_module() +class GIoULoss(nn.Module): + + def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): + super(GIoULoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * giou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@LOSSES.register_module() +class DIoULoss(nn.Module): + + def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): + super(DIoULoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: 
+ weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * diou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss + + +@LOSSES.register_module() +class CIoULoss(nn.Module): + + def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): + super(CIoULoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * ciou_loss( + pred, + target, + weight, + eps=self.eps, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss diff --git a/mmcv/models/losses/smooth_l1_loss.py b/mmcv/models/losses/smooth_l1_loss.py new file mode 100644 index 0000000..ad5e8a4 --- /dev/null +++ b/mmcv/models/losses/smooth_l1_loss.py @@ -0,0 +1,136 @@ +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@weighted_loss +def smooth_l1_loss(pred, target, beta=1.0): + """Smooth L1 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + + Returns: + torch.Tensor: Calculated loss + """ + assert beta > 0 + assert pred.size() == target.size() and target.numel() > 0 + diff = torch.abs(pred - target) + loss = torch.where(diff < beta, 0.5 * diff * diff / beta, + diff - 0.5 * beta) + return loss + + +@weighted_loss +def l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + + Returns: + torch.Tensor: Calculated loss + """ + assert pred.size() == target.size() and target.numel() > 0 + loss = torch.abs(pred - target) + return loss + + +@LOSSES.register_module() +class SmoothL1Loss(nn.Module): + """Smooth L1 loss. + + Args: + beta (float, optional): The threshold in the piecewise function. + Defaults to 1.0. + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". Defaults to "mean". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, beta=1.0, reduction='mean', loss_weight=1.0): + super(SmoothL1Loss, self).__init__() + self.beta = beta + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function. 
+ + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * smooth_l1_loss( + pred, + target, + weight, + beta=self.beta, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_bbox + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(L1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss_bbox = self.loss_weight * l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox diff --git a/mmcv/models/losses/utils.py b/mmcv/models/losses/utils.py new file mode 100644 index 0000000..0496fb7 --- /dev/null +++ b/mmcv/models/losses/utils.py @@ -0,0 +1,115 @@ +import functools + +from mmcv.fileio.io import load +import numpy as np +import torch.nn.functional as F + +def get_class_weight(class_weight): + """Get class weight for loss function. + + Args: + class_weight (list[float] | str | None): If class_weight is a str, + take it as a file name and read from it. + """ + if isinstance(class_weight, str): + # take it as a file path + if class_weight.endswith('.npy'): + class_weight = np.load(class_weight) + else: + # pkl, json or yaml + class_weight = load(class_weight) + + return class_weight + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + +def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Avarage factor when computing the mean of losses. 
+ + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/mmcv/models/modules/VAD_transformer.py b/mmcv/models/modules/VAD_transformer.py new file mode 100644 index 0000000..6e2ec0b --- /dev/null +++ b/mmcv/models/modules/VAD_transformer.py @@ -0,0 +1,489 @@ +import torch +import numpy as np +import torch.nn as nn +from mmcv.models.utils import xavier_init +from mmcv.utils import ext_loader +from torch.nn.init import normal_ +from mmcv.models.backbones.base_module import BaseModule +from mmcv.models.utils.builder import TRANSFORMER +from torchvision.transforms.functional import rotate +from mmcv.models.bricks.registry import TRANSFORMER_LAYER_SEQUENCE +from mmcv.models.bricks.transformer import TransformerLayerSequence +from mmcv.models.bricks.transformer import build_transformer_layer_sequence + +from mmcv.models.modules.decoder import CustomMSDeformableAttention +from mmcv.models.modules.temporal_self_attention import TemporalSelfAttention +from mmcv.models.modules.spatial_cross_attention import MSDeformableAttention3D + + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class MapDetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. 
+ Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(MapDetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 2 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + # new_reference_points[..., 2:3] = tmp[ + # ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@TRANSFORMER.register_module() +class VADPerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + map_decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + map_num_vec=50, + map_num_pts_per_vec=10, + **kwargs): + super(VADPerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + if decoder is not None: + self.decoder = build_transformer_layer_sequence(decoder) + else: + self.decoder = None + if map_decoder is not None: + self.map_decoder = build_transformer_layer_sequence(map_decoder) + else: + self.map_decoder = None + + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + self.two_stage_num_proposals = two_stage_num_proposals + self.rotate_center = rotate_center + self.map_num_vec = map_num_vec + self.map_num_pts_per_vec = map_num_pts_per_vec + self.init_layers() + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.map_reference_points = nn.Linear(self.embed_dims, 2) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.map_reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
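Reviewer note: the learnable lookups built by init_layers(), shown on their own. The 18-dim input matches the Linear(18, ...) above and corresponds to the dataset's CAN-bus/ego-state vector; the other sizes simply mirror this file's defaults.

import torch
import torch.nn as nn

embed_dims, num_feature_levels, num_cams = 256, 4, 6

level_embeds = nn.Parameter(torch.empty(num_feature_levels, embed_dims))
cams_embeds = nn.Parameter(torch.empty(num_cams, embed_dims))
reference_points = nn.Linear(embed_dims, 3)       # agent queries -> 3D reference points
map_reference_points = nn.Linear(embed_dims, 2)   # map queries   -> BEV reference points

can_bus_mlp = nn.Sequential(
    nn.Linear(18, embed_dims // 2), nn.ReLU(inplace=True),
    nn.Linear(embed_dims // 2, embed_dims), nn.ReLU(inplace=True),
    nn.LayerNorm(embed_dims),                     # appended when can_bus_norm=True
)

can_bus = torch.randn(2, 18)                      # one 18-dim signal per sample
print(can_bus_mlp(can_bus).shape)                 # torch.Size([2, 256])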
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + # TODO apply fp16 to this module cause grad_norm NAN + # @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + map_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + map_reg_branches=None, + map_cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. 
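Reviewer note: the ego-motion shift computed in get_bev_features(), factored into a plain NumPy function. It turns the CAN-bus displacement between frames into a (shift_x, shift_y) offset expressed as a fraction of the BEV grid; the 200x200 grid and 0.512 m cells below are illustrative values, not the released config.

import numpy as np

def bev_shift(delta_x, delta_y, ego_angle_deg, grid_length=(0.512, 0.512),
              bev_h=200, bev_w=200, use_shift=True):
    grid_length_y, grid_length_x = grid_length[0], grid_length[1]
    translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2)
    translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180
    bev_angle = ego_angle_deg - translation_angle
    shift_y = translation_length * np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h
    shift_x = translation_length * np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w
    return shift_x * use_shift, shift_y * use_shift

# Moving 5.12 m straight ahead with the heading aligned to the grid:
print(bev_shift(5.12, 0.0, 0.0))   # approx (0.0, 0.05): ten cells forward on a 200-cell grid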
+ bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + map_query_pos, map_query = torch.split( + map_query_embed, self.embed_dims, dim=1) + map_query_pos = map_query_pos.unsqueeze(0).expand(bs, -1, -1) + map_query = map_query.unsqueeze(0).expand(bs, -1, -1) + map_reference_points = self.map_reference_points(map_query_pos) + map_reference_points = map_reference_points.sigmoid() + map_init_reference_out = map_reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + map_query = map_query.permute(1, 0, 2) + map_query_pos = map_query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + if self.decoder is not None: + # [L, Q, B, D], [L, B, Q, D] + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + inter_references_out = inter_references + else: + inter_states = query.unsqueeze(0) + inter_references_out = reference_points.unsqueeze(0) + + if self.map_decoder is not None: + # [L, Q, B, D], [L, B, Q, D] + map_inter_states, map_inter_references = self.map_decoder( + query=map_query, + key=None, + value=bev_embed, + query_pos=map_query_pos, + reference_points=map_reference_points, + reg_branches=map_reg_branches, + cls_branches=map_cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=map_query.device), + level_start_index=torch.tensor([0], device=map_query.device), + **kwargs) + map_inter_references_out = map_inter_references 
+ else: + map_inter_states = map_query.unsqueeze(0) + map_inter_references_out = map_reference_points.unsqueeze(0) + + return ( + bev_embed, inter_states, init_reference_out, inter_references_out, + map_inter_states, map_init_reference_out, map_inter_references_out) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class CustomTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(CustomTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + key_padding_mask=None, + *args, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + intermediate = [] + for lid, layer in enumerate(self.layers): + query = layer( + query=query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + key_padding_mask=key_padding_mask, + *args, + **kwargs) + + if self.return_intermediate: + intermediate.append(query) + + if self.return_intermediate: + return torch.stack(intermediate) + + return query \ No newline at end of file diff --git a/mmcv/models/modules/__init__.py b/mmcv/models/modules/__init__.py new file mode 100644 index 0000000..da1e029 --- /dev/null +++ b/mmcv/models/modules/__init__.py @@ -0,0 +1,8 @@ +from .transformer import BEVFormerPerceptionTransformer, UniADPerceptionTransformer, GroupFree3DMHA +from .spatial_cross_attention import SpatialCrossAttention, MSDeformableAttention3D +from .temporal_self_attention import TemporalSelfAttention +from .encoder import BEVFormerEncoder, BEVFormerLayer +from .decoder import DetectionTransformerDecoder +from .vote_module import VoteModule +from .VAD_transformer import VADPerceptionTransformer + diff --git a/mmcv/models/modules/custom_base_transformer_layer.py b/mmcv/models/modules/custom_base_transformer_layer.py new file mode 100644 index 0000000..c877db2 --- /dev/null +++ b/mmcv/models/modules/custom_base_transformer_layer.py @@ -0,0 +1,243 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import copy +import warnings + +import torch + +from mmcv import ConfigDict +from mmcv.models.bricks import build_norm_layer +from mmcv.models.backbones.base_module import BaseModule, ModuleList + +from mmcv.models.bricks.registry import TRANSFORMER_LAYER +from mmcv.models.bricks.transformer import build_feedforward_network, build_attention + + +@TRANSFORMER_LAYER.register_module() +class MyCustomBaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. 
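Reviewer note: what return_intermediate changes in CustomTransformerDecoder, shown with identity stand-ins for the layers so the snippet runs without building real attention modules.

import torch

num_layers, num_query, bs, embed_dims = 3, 10, 2, 256
layers = [lambda q: q + 1.0 for _ in range(num_layers)]   # stand-ins for decoder layers
query = torch.zeros(num_query, bs, embed_dims)

return_intermediate = True
intermediate = []
for layer in layers:
    query = layer(query)
    if return_intermediate:
        intermediate.append(query)

out = torch.stack(intermediate) if return_intermediate else query
print(out.shape)   # torch.Size([3, 10, 2, 256]); without intermediates: [10, 2, 256]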
It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=True, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' + + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. 
+ attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index])) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + **kwargs contains some specific arguments of attentions. + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
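Reviewer note: how MyCustomBaseTransformerLayer derives num_attn from operation_order and expands a single attn_cfgs dict into one config per attention. In practice a layer with both self- and cross-attention would pass a list of two different configs; a single dict is used here only to show the deepcopy expansion.

import copy

operation_order = ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
attn_cfgs = dict(type='TemporalSelfAttention', embed_dims=256)   # a single dict

assert set(operation_order) & {'self_attn', 'norm', 'ffn', 'cross_attn'} == set(operation_order)

num_attn = operation_order.count('self_attn') + operation_order.count('cross_attn')
if isinstance(attn_cfgs, dict):
    attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]

pre_norm = operation_order[0] == 'norm'
print(num_attn, len(attn_cfgs), pre_norm)   # 2 2 False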
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/mmcv/models/modules/decoder.py b/mmcv/models/modules/decoder.py new file mode 100644 index 0000000..e320dbf --- /dev/null +++ b/mmcv/models/modules/decoder.py @@ -0,0 +1,344 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import cv2 as cv +import copy +import warnings +from matplotlib import pyplot as plt +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.utils import xavier_init, constant_init +from mmcv.models.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.models.bricks.transformer import TransformerLayerSequence +import math +from mmcv.models.backbones.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetectionTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR3D transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. 
+ """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + super(DetectionTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + self.fp16_enabled = False + + def forward(self, + query, + *args, + reference_points=None, + reg_branches=None, + key_padding_mask=None, + **kwargs): + """Forward function for `Detr3DTransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + + reference_points_input = reference_points[..., :2].unsqueeze( + 2) # BS NUM_QUERY NUM_LEVEL 2 + output = layer( + output, + *args, + reference_points=reference_points_input, + key_padding_mask=key_padding_mask, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + + assert reference_points.shape[-1] == 3 + + new_reference_points = torch.zeros_like(reference_points) + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points[..., :2]) + new_reference_points[..., 2:3] = tmp[ + ..., 4:5] + inverse_sigmoid(reference_points[..., 2:3]) + + new_reference_points = new_reference_points.sigmoid() + + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@ATTENTION.register_module() +class CustomMSDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
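Reviewer note: the sampling_offsets bias initialization from init_weights(), run standalone so the geometry is visible: each head starts by looking along one of num_heads directions, and successive points step further out along that direction. num_levels=1 is used here only to keep the printout small.

import math
import torch

num_heads, num_levels, num_points = 8, 1, 4

thetas = torch.arange(num_heads, dtype=torch.float32) * (2.0 * math.pi / num_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(
    num_heads, 1, 1, 2).repeat(1, num_levels, num_points, 1)
for i in range(num_points):
    grid_init[:, :, i, :] *= i + 1

bias = grid_init.view(-1)        # copied into sampling_offsets.bias
print(bias.shape)                # torch.Size([64]) = heads * levels * points * 2
print(grid_init[0, 0])           # head 0: (1, 0), (2, 0), (3, 0), (4, 0)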
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/mmcv/models/modules/encoder.py b/mmcv/models/modules/encoder.py new file mode 100644 index 0000000..aa3c1b0 --- /dev/null +++ b/mmcv/models/modules/encoder.py @@ -0,0 +1,405 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. 
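Reviewer note: the shape bookkeeping behind the 2-D reference-point branch above, ending in the pure-PyTorch fallback that this forward() uses when CUDA is unavailable. The import is the same one used at the top of this file; all sizes are illustrative.

import torch
from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch

bs, num_query = 2, 10
num_heads, num_levels, num_points, head_dim = 8, 1, 4, 32
spatial_shapes = torch.tensor([[20, 30]], dtype=torch.long)          # one BEV-sized level
num_value = int((spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum())

value = torch.rand(bs, num_value, num_heads, head_dim)
reference_points = torch.rand(bs, num_query, num_levels, 2)          # normalized (x, y)
sampling_offsets = torch.randn(bs, num_query, num_heads, num_levels, num_points, 2)
# the real code softmaxes over levels*points jointly; with one level this is the same
attention_weights = torch.rand(bs, num_query, num_heads, num_levels, num_points).softmax(-1)

# offsets are normalized by (w, h) of each level before being added to the reference points
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
    + sampling_offsets / offset_normalizer[None, None, None, :, None, :]

out = multi_scale_deformable_attn_pytorch(
    value, spatial_shapes, sampling_locations, attention_weights)
print(out.shape)   # torch.Size([2, 10, 256]) = (bs, num_query, num_heads * head_dim)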
+# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .custom_base_transformer_layer import MyCustomBaseTransformerLayer +import copy +import warnings +from mmcv.models.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.models.bricks.transformer import TransformerLayerSequence +from mmcv.utils import force_fp32, auto_fp16 +import numpy as np +import torch +import cv2 as cv +from mmcv.utils import TORCH_VERSION, digit_version +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class BEVFormerEncoder(TransformerLayerSequence): + + """ + Attention with both self and cross + Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, pc_range=None, num_points_in_pillar=4, return_intermediate=False, dataset_type='nuscenes', + **kwargs): + + super(BEVFormerEncoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + self.num_points_in_pillar = num_points_in_pillar + self.pc_range = pc_range + self.fp16_enabled = False + + @staticmethod + def get_reference_points(H, W, Z=8, num_points_in_pillar=4, dim='3d', bs=1, device='cuda', dtype=torch.float): + """Get the reference points used in SCA and TSA. + Args: + H, W: spatial shape of bev. + Z: hight of pillar. + D: sample D points uniformly from each pillar. + device (obj:`device`): The device where + reference_points should be. + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). + """ + + # reference points in 3D space, used in spatial cross-attention (SCA) + if dim == '3d': + zs = torch.linspace(0.5, Z - 0.5, num_points_in_pillar, dtype=dtype, + device=device).view(-1, 1, 1).expand(num_points_in_pillar, H, W) / Z + xs = torch.linspace(0.5, W - 0.5, W, dtype=dtype, + device=device).view(1, 1, W).expand(num_points_in_pillar, H, W) / W + ys = torch.linspace(0.5, H - 0.5, H, dtype=dtype, + device=device).view(1, H, 1).expand(num_points_in_pillar, H, W) / H + ref_3d = torch.stack((xs, ys, zs), -1) + ref_3d = ref_3d.permute(0, 3, 1, 2).flatten(2).permute(0, 2, 1) + ref_3d = ref_3d[None].repeat(bs, 1, 1, 1) + return ref_3d + + # reference points on 2D bev plane, used in temporal self-attention (TSA). + elif dim == '2d': + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=dtype, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=dtype, device=device) + ) + ref_y = ref_y.reshape(-1)[None] / H + ref_x = ref_x.reshape(-1)[None] / W + ref_2d = torch.stack((ref_x, ref_y), -1) + ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) + return ref_2d + + # This function must use fp32!!! + @force_fp32(apply_to=('reference_points', 'img_metas')) + def point_sampling(self, reference_points, pc_range, img_metas): + # NOTE: close tf32 here. 
TODO(yzj): used in bevformer + # allow_tf32 = torch.backends.cuda.matmul.allow_tf32 + # torch.backends.cuda.matmul.allow_tf32 = False + # torch.backends.cudnn.allow_tf32 = False + + lidar2img = [] + for img_meta in img_metas: + lidar2img.append(img_meta['lidar2img']) + lidar2img = np.asarray(lidar2img) + lidar2img = reference_points.new_tensor(lidar2img) # (B, N, 4, 4) + reference_points = reference_points.clone() + + reference_points[..., 0:1] = reference_points[..., 0:1] * \ + (pc_range[3] - pc_range[0]) + pc_range[0] + reference_points[..., 1:2] = reference_points[..., 1:2] * \ + (pc_range[4] - pc_range[1]) + pc_range[1] + reference_points[..., 2:3] = reference_points[..., 2:3] * \ + (pc_range[5] - pc_range[2]) + pc_range[2] + + reference_points = torch.cat( + (reference_points, torch.ones_like(reference_points[..., :1])), -1) + + reference_points = reference_points.permute(1, 0, 2, 3) + D, B, num_query = reference_points.size()[:3] + num_cam = lidar2img.size(1) + + reference_points = reference_points.view( + D, B, 1, num_query, 4).repeat(1, 1, num_cam, 1, 1).unsqueeze(-1) + + lidar2img = lidar2img.view( + 1, B, num_cam, 1, 4, 4).repeat(D, 1, 1, num_query, 1, 1) + + reference_points_cam = torch.matmul(lidar2img.to(torch.float32), + reference_points.to(torch.float32)).squeeze(-1) + eps = 1e-5 + + bev_mask = (reference_points_cam[..., 2:3] > eps) + reference_points_cam = reference_points_cam[..., 0:2] / torch.maximum( + reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3]) * eps) + + reference_points_cam[..., 0] /= img_metas[0]['img_shape'][0][1] + reference_points_cam[..., 1] /= img_metas[0]['img_shape'][0][0] + + bev_mask = (bev_mask & (reference_points_cam[..., 1:2] > 0.0) + & (reference_points_cam[..., 1:2] < 1.0) + & (reference_points_cam[..., 0:1] < 1.0) + & (reference_points_cam[..., 0:1] > 0.0)) + if digit_version(TORCH_VERSION) >= digit_version('1.8'): + bev_mask = torch.nan_to_num(bev_mask) + else: + bev_mask = bev_mask.new_tensor( + np.nan_to_num(bev_mask.cpu().numpy())) + + reference_points_cam = reference_points_cam.permute(2, 1, 3, 0, 4) + bev_mask = bev_mask.permute(2, 1, 3, 0, 4).squeeze(-1) + + return reference_points_cam, bev_mask + + @auto_fp16() + def forward(self, + bev_query, + key, + value, + *args, + bev_h=None, + bev_w=None, + bev_pos=None, + spatial_shapes=None, + level_start_index=None, + valid_ratios=None, + prev_bev=None, + shift=0., + img_metas=None, + **kwargs): + """Forward function for `TransformerDecoder`. + Args: + bev_query (Tensor): Input BEV query with shape + `(num_query, bs, embed_dims)`. + key & value (Tensor): Input multi-cameta features with shape + (num_cam, num_value, bs, embed_dims) + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
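Reviewer note: the essence of point_sampling() for a single camera, on toy data. Normalized BEV reference points are lifted to metric LiDAR coordinates via pc_range, projected with a 4x4 lidar2img matrix, and kept only when they land in front of the camera and inside the image. The matrix below is a made-up pinhole projection, not a real calibration.

import torch

pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
img_h, img_w = 900, 1600
eps = 1e-5

ref = torch.rand(1, 4, 3)                                   # (bs, num_query, xyz) in [0, 1]
ref[..., 0] = ref[..., 0] * (pc_range[3] - pc_range[0]) + pc_range[0]
ref[..., 1] = ref[..., 1] * (pc_range[4] - pc_range[1]) + pc_range[1]
ref[..., 2] = ref[..., 2] * (pc_range[5] - pc_range[2]) + pc_range[2]
ref = torch.cat([ref, torch.ones_like(ref[..., :1])], -1)   # homogeneous (x, y, z, 1)

lidar2img = torch.tensor([[1266.4,    0.0, 816.3, 0.0],     # hypothetical projection
                          [   0.0, 1266.4, 491.5, 0.0],
                          [   0.0,    0.0,   1.0, 0.0],
                          [   0.0,    0.0,   0.0, 1.0]])

cam = (lidar2img @ ref.unsqueeze(-1)).squeeze(-1)           # (bs, num_query, 4)
mask = cam[..., 2:3] > eps                                  # in front of the camera
xy = cam[..., :2] / torch.maximum(cam[..., 2:3], torch.ones_like(cam[..., 2:3]) * eps)
xy[..., 0] /= img_w                                         # normalize to [0, 1]
xy[..., 1] /= img_h
mask = mask & (xy[..., 0:1] > 0) & (xy[..., 0:1] < 1) & (xy[..., 1:2] > 0) & (xy[..., 1:2] < 1)
print(xy.shape, mask.shape)   # torch.Size([1, 4, 2]) torch.Size([1, 4, 1])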
+ """ + + output = bev_query + intermediate = [] + + ref_3d = self.get_reference_points( + bev_h, bev_w, self.pc_range[5]-self.pc_range[2], self.num_points_in_pillar, dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + ref_2d = self.get_reference_points( + bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) + + reference_points_cam, bev_mask = self.point_sampling( + ref_3d, self.pc_range, img_metas) + + # bug: this code should be 'shift_ref_2d = ref_2d.clone()', we keep this bug for reproducing our results in paper. + shift_ref_2d = ref_2d # .clone() + shift_ref_2d += shift[:, None, None, :] + + # (num_query, bs, embed_dims) -> (bs, num_query, embed_dims) + bev_query = bev_query.permute(1, 0, 2) + bev_pos = bev_pos.permute(1, 0, 2) + bs, len_bev, num_bev_level, _ = ref_2d.shape + if prev_bev is not None: + prev_bev = prev_bev.permute(1, 0, 2) + prev_bev = torch.stack( + [prev_bev, bev_query], 1).reshape(bs*2, len_bev, -1) + hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + else: + hybird_ref_2d = torch.stack([ref_2d, ref_2d], 1).reshape( + bs*2, len_bev, num_bev_level, 2) + + for lid, layer in enumerate(self.layers): + output = layer( + bev_query, + key, + value, + *args, + bev_pos=bev_pos, + ref_2d=hybird_ref_2d, + ref_3d=ref_3d, + bev_h=bev_h, + bev_w=bev_w, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + reference_points_cam=reference_points_cam, + bev_mask=bev_mask, + prev_bev=prev_bev, + **kwargs) + + bev_query = output + if self.return_intermediate: + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output + + +@TRANSFORMER_LAYER.register_module() +class BEVFormerLayer(MyCustomBaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default: None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default: 2. 
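Reviewer note: how the encoder folds the previous BEV into the batch so that temporal self-attention sees (prev_bev, current_bev) pairs; the file's hybird_ref_2d spelling is kept. Tensors are assumed to already be in (bs, H*W, C) layout, as after the permutes in forward().

import torch

bs, bev_h, bev_w, dims = 1, 4, 4, 256
len_bev = bev_h * bev_w
num_bev_level = 1

bev_query = torch.rand(bs, len_bev, dims)
prev_bev = torch.rand(bs, len_bev, dims)
ref_2d = torch.rand(bs, len_bev, num_bev_level, 2)
shift_ref_2d = ref_2d + 0.01                     # shifted copy, as in forward()

stacked_bev = torch.stack([prev_bev, bev_query], 1).reshape(bs * 2, len_bev, -1)
hybird_ref_2d = torch.stack([shift_ref_2d, ref_2d], 1).reshape(bs * 2, len_bev, num_bev_level, 2)
print(stacked_bev.shape, hybird_ref_2d.shape)
# torch.Size([2, 16, 256]) torch.Size([2, 16, 1, 2])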
+ """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + **kwargs): + super(BEVFormerLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + self.fp16_enabled = False + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + def forward(self, + query, + key=None, + value=None, + bev_pos=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ref_2d=None, + ref_3d=None, + bev_h=None, + bev_w=None, + reference_points_cam=None, + mask=None, + spatial_shapes=None, + level_start_index=None, + prev_bev=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
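Reviewer note: an illustrative config for one BEVFormerLayer, satisfying the six-step operation_order this class asserts. The attention types are the ones registered in this diff; the dimensions, point counts, and point-cloud range are placeholders rather than the repo's released settings.

point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

bevformer_layer = dict(
    type='BEVFormerLayer',
    attn_cfgs=[
        dict(type='TemporalSelfAttention', embed_dims=256, num_levels=1),
        dict(
            type='SpatialCrossAttention',
            pc_range=point_cloud_range,
            deformable_attention=dict(
                type='MSDeformableAttention3D', embed_dims=256,
                num_points=8, num_levels=4),
            embed_dims=256),
    ],
    feedforward_channels=512,
    ffn_dropout=0.1,
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'))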
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + # temporal self attention + if layer == 'self_attn': + + query = self.attentions[attn_index]( + query, + prev_bev, + prev_bev, + identity if self.pre_norm else None, + query_pos=bev_pos, + key_pos=bev_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + reference_points=ref_2d, + spatial_shapes=torch.tensor( + [[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + # spaital cross attention + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + reference_points=ref_3d, + reference_points_cam=reference_points_cam, + mask=mask, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query diff --git a/mmcv/models/modules/group_attention.py b/mmcv/models/modules/group_attention.py new file mode 100644 index 0000000..907a576 --- /dev/null +++ b/mmcv/models/modules/group_attention.py @@ -0,0 +1,162 @@ +import copy +import math +import warnings +from typing import Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, build_norm_layer) +from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, to_2tuple) +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) + + +@ATTENTION.register_module() +class GroupMultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + group=1, + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super().__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.group = group + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout(dropout_layer) if dropout_layer else nn.Identity() + + @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiheadAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + num_queries = query.shape[0] + bs = query.shape[1] + if self.training: + query = torch.cat(query.split(num_queries // self.group, dim=0), dim=1) + key = torch.cat(key.split(num_queries // self.group, dim=0), dim=1) + value = torch.cat(value.split(num_queries // self.group, dim=0), dim=1) + + out = self.attn(query=query, key=key, value=value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)[0] + + if self.training: + out = torch.cat(out.split(bs, dim=1), dim=0) # shape + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) diff --git a/mmcv/models/modules/multi_scale_deformable_attn_function.py b/mmcv/models/modules/multi_scale_deformable_attn_function.py new file mode 100644 index 0000000..77b0f31 --- /dev/null +++ b/mmcv/models/modules/multi_scale_deformable_attn_function.py @@ -0,0 +1,163 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import torch +from torch.cuda.amp import custom_bwd, custom_fwd +from torch.autograd.function import Function, once_differentiable +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction_fp16(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float16) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. 
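Reviewer note: the group trick from GroupMultiheadAttention.forward() in isolation. Splitting the query dimension into `group` chunks and folding them into the batch dimension makes nn.MultiheadAttention attend only within each group, which keeps query groups independent during training.

import torch
import torch.nn as nn

num_queries, bs, embed_dims, group = 12, 2, 32, 3
attn = nn.MultiheadAttention(embed_dims, num_heads=4)       # (num_query, batch, dims) layout

q = torch.rand(num_queries, bs, embed_dims)
k = v = q

q_g = torch.cat(q.split(num_queries // group, dim=0), dim=1)   # (4, bs*group, 32)
k_g = torch.cat(k.split(num_queries // group, dim=0), dim=1)
v_g = torch.cat(v.split(num_queries // group, dim=0), dim=1)

out = attn(q_g, k_g, v_g)[0]                     # attention restricted within each group
out = torch.cat(out.split(bs, dim=1), dim=0)     # fold the groups back into the query dim
print(out.shape)                                 # torch.Size([12, 2, 32])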
+ + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +class MultiScaleDeformableAttnFunction_fp32(Function): + + @staticmethod + @custom_fwd(cast_inputs=torch.float32) + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + @custom_bwd + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. 
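Reviewer note: the autograd plumbing these two classes rely on, shown on a trivial op (y = x * w) instead of the compiled ms_deform_attn kernels, which need the _ext extension to run. The decorator stack mirrors the one above; only the arithmetic is a toy.

import torch
from torch.autograd.function import Function, once_differentiable
from torch.cuda.amp import custom_bwd, custom_fwd

class ScaleFunction(Function):

    @staticmethod
    @custom_fwd(cast_inputs=torch.float32)   # keep the op in fp32 under autocast
    def forward(ctx, x, w):
        ctx.save_for_backward(x, w)
        return x * w

    @staticmethod
    @once_differentiable                     # the backward pass itself is not differentiable
    @custom_bwd
    def backward(ctx, grad_output):
        x, w = ctx.saved_tensors
        return grad_output * w, grad_output * x

x = torch.rand(3, requires_grad=True)
w = torch.rand(3, requires_grad=True)
ScaleFunction.apply(x, w).sum().backward()
print(x.grad is not None, w.grad is not None)    # True True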
+ """ + value, value_spatial_shapes, value_level_start_index, \ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None diff --git a/mmcv/models/modules/spatial_cross_attention.py b/mmcv/models/modules/spatial_cross_attention.py new file mode 100644 index 0000000..e0b7587 --- /dev/null +++ b/mmcv/models/modules/spatial_cross_attention.py @@ -0,0 +1,398 @@ + +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.utils import xavier_init, constant_init +from mmcv.models.bricks.registry import (ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.models.bricks.transformer import build_attention +import math +from mmcv.utils import force_fp32, auto_fp16 + +from mmcv.models.backbones.base_module import BaseModule, ModuleList, Sequential + +from mmcv.utils import ext_loader +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ + MultiScaleDeformableAttnFunction_fp16 +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class SpatialCrossAttention(BaseModule): + """An attention module used in BEVFormer. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_cams (int): The number of cameras + dropout (float): A Dropout layer on `inp_residual`. + Default: 0.. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + deformable_attention: (dict): The config for the deformable attention used in SCA. + """ + + def __init__(self, + embed_dims=256, + num_cams=6, + pc_range=None, + dropout=0.1, + init_cfg=None, + batch_first=False, + deformable_attention=dict( + type='MSDeformableAttention3D', + embed_dims=256, + num_levels=4), + **kwargs + ): + super(SpatialCrossAttention, self).__init__(init_cfg) + + self.init_cfg = init_cfg + self.dropout = nn.Dropout(dropout) + self.pc_range = pc_range + self.fp16_enabled = False + self.deformable_attention = build_attention(deformable_attention) + self.embed_dims = embed_dims + self.num_cams = num_cams + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.batch_first = batch_first + self.init_weight() + + def init_weight(self): + """Default initialization for Parameters of Module.""" + xavier_init(self.output_proj, distribution='uniform', bias=0.) 
+ + @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) + def forward(self, + query, + key, + value, + residual=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + reference_points_cam=None, + bev_mask=None, + level_start_index=None, + flag='encoder', + **kwargs): + """Forward Function of Detr3DCrossAtten. + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. (B, N, C, H, W) + residual (Tensor): The tensor used for addition, with the + same shape as `x`. Default None. If None, `x` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, 4), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different level. With shape (num_levels, 2), + last dimension represent (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape (num_levels) and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if key is None: + key = query + if value is None: + value = key + + if residual is None: + inp_residual = query + slots = torch.zeros_like(query) + if query_pos is not None: + query = query + query_pos + + bs, num_query, _ = query.size() + + D = reference_points_cam.size(3) + indexes = [] + for i, mask_per_img in enumerate(bev_mask): + index_query_per_img = mask_per_img[0].sum(-1).nonzero().squeeze(-1) + indexes.append(index_query_per_img) + max_len = max([len(each) for each in indexes]) + + # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. 
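+        # `indexes[i]` holds the BEV query indices whose projected reference points hit camera i
+        # (non-zero rows of `bev_mask[i]`). Below, those queries and their reference points are
+        # gathered into padded (bs, num_cams, max_len, ...) tensors so deformable attention runs
+        # once per camera over a short query list instead of over all bev_h * bev_w queries.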
+ queries_rebatch = query.new_zeros( + [bs, self.num_cams, max_len, self.embed_dims]) + reference_points_rebatch = reference_points_cam.new_zeros( + [bs, self.num_cams, max_len, D, 2]) + + for j in range(bs): + for i, reference_points_per_img in enumerate(reference_points_cam): + index_query_per_img = indexes[i] + queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] + reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] + + num_cams, l, bs, embed_dims = key.shape + + key = key.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + value = value.permute(2, 0, 1, 3).reshape( + bs * self.num_cams, l, self.embed_dims) + + queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value, + reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes, + level_start_index=level_start_index).view(bs, self.num_cams, max_len, self.embed_dims) + for j in range(bs): + for i, index_query_per_img in enumerate(indexes): + slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] + + count = bev_mask.sum(-1) > 0 + count = count.permute(1, 2, 0).sum(-1) + count = torch.clamp(count, min=1.0) + slots = slots / count[..., None] + slots = self.output_proj(slots) + + return self.dropout(slots) + inp_residual + + +@ATTENTION.register_module() +class MSDeformableAttention3D(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
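+
+    Note:
+        This variant keeps no output projection of its own (`output_proj` is None);
+        `SpatialCrossAttention` aggregates the per-camera outputs and applies the
+        shared `output_proj` afterwards.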
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=8, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.batch_first = batch_first + self.output_proj = None + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + Args: + query (Tensor): Query of Transformer with shape + ( bs, num_query, embed_dims). + key (Tensor): The key tensor with shape + `(bs, num_key, embed_dims)`. + value (Tensor): The value tensor with shape + `(bs, num_key, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. 
With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + + if reference_points.shape[-1] == 2: + """ + For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. + After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. + For each referent point, we sample `num_points` sampling points. + For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. + """ + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + + bs, num_query, num_Z_anchors, xy = reference_points.shape + reference_points = reference_points[:, :, None, None, None, :, :] + sampling_offsets = sampling_offsets / \ + offset_normalizer[None, None, None, :, None, :] + bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape + sampling_offsets = sampling_offsets.view( + bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) + sampling_locations = reference_points + sampling_offsets + bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape + assert num_all_points == num_points * num_Z_anchors + + sampling_locations = sampling_locations.view( + bs, num_query, num_heads, num_levels, num_all_points, xy) + + elif reference_points.shape[-1] == 4: + assert False + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + + # sampling_locations.shape: bs, num_query, num_heads, num_levels, num_all_points, 2 + # attention_weights.shape: bs, num_query, num_heads, num_levels, num_all_points + # + + if torch.cuda.is_available() and value.is_cuda: + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + if not self.batch_first: + output = output.permute(1, 0, 2) + + return output 
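Note on `MSDeformableAttention3D.forward` above: the only place it departs from standard multi-scale deformable attention is the bookkeeping around `num_Z_anchors`, where the per-pillar reference points are broadcast against the predicted offsets. Below is a minimal, self-contained shape sketch of that step, illustrative only and not part of the patched module: the sizes are dummies, the variable names simply mirror the code above, and it uses plain PyTorch with no mmcv registry or CUDA kernel.

```python
import torch

# Dummy sizes (hypothetical): 8 heads, 4 feature levels, 8 predicted points per head/level,
# and 4 Z-anchors per BEV pillar -> 2 sampling points around each projected reference point.
bs, num_query, num_heads, num_levels = 1, 2500, 8, 4
num_all_points, num_Z_anchors = 8, 4

reference_points = torch.rand(bs, num_query, num_Z_anchors, 2)   # normalized (x, y) per Z-anchor
sampling_offsets = torch.rand(bs, num_query, num_heads, num_levels, num_all_points, 2)
spatial_shapes = torch.tensor([[92, 160], [46, 80], [23, 40], [12, 20]])   # (h, w) per level

# scale the offsets by each level's (w, h) so they live in the same normalized space
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_offsets = sampling_offsets / offset_normalizer[None, None, None, :, None, :]

# split the predicted points across the Z-anchors, then broadcast-add the per-anchor references
sampling_offsets = sampling_offsets.view(
    bs, num_query, num_heads, num_levels,
    num_all_points // num_Z_anchors, num_Z_anchors, 2)
sampling_locations = reference_points[:, :, None, None, None, :, :] + sampling_offsets

# flatten back to the layout expected by the multi-scale deformable attention kernel
sampling_locations = sampling_locations.view(
    bs, num_query, num_heads, num_levels, num_all_points, 2)
print(sampling_locations.shape)   # torch.Size([1, 2500, 8, 4, 8, 2])
```

In other words, the `num_points` offsets predicted per head and level are split evenly across the `num_Z_anchors` pillar heights, so each projected reference point receives `num_points // num_Z_anchors` sampling locations.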
diff --git a/mmcv/models/modules/temporal_self_attention.py b/mmcv/models/modules/temporal_self_attention.py new file mode 100644 index 0000000..6de0020 --- /dev/null +++ b/mmcv/models/modules/temporal_self_attention.py @@ -0,0 +1,269 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32 +from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch +import warnings +import torch +import torch.nn as nn +from mmcv.models.utils import xavier_init, constant_init +from mmcv.models.bricks.registry import ATTENTION +import math +from mmcv.models.backbones.base_module import BaseModule, ModuleList, Sequential +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) + +from mmcv.utils import ext_loader +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +@ATTENTION.register_module() +class TemporalSelfAttention(BaseModule): + """An attention module used in BEVFormer based on Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to True. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + num_bev_queue (int): In this version, we only use one history BEV and one currenct BEV. + the length of BEV queue is 2. 
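+
+    Note:
+        When no history BEV is available, `value` falls back to the current BEV
+        stacked twice, and the sampling offsets / attention weights are predicted
+        from the concatenation of the previous BEV features and the current
+        queries (hence the `embed_dims * num_bev_queue` input width of
+        `sampling_offsets` and `attention_weights`).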
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + num_bev_queue=2, + im2col_step=64, + dropout=0.1, + batch_first=True, + norm_cfg=None, + init_cfg=None): + + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + self.fp16_enabled = False + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.num_bev_queue = num_bev_queue + self.sampling_offsets = nn.Linear( + embed_dims*self.num_bev_queue, num_bev_queue*num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims*self.num_bev_queue, + num_bev_queue*num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels*self.num_bev_queue, self.num_points, 1) + + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + flag='decoder', + + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. 
+ or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + assert self.batch_first + bs, len_bev, c = query.shape + value = torch.stack([query, query], 1).reshape(bs*2, len_bev, c) + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + bs, num_query, embed_dims = query.shape + _, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + assert self.num_bev_queue == 2 + + query = torch.cat([value[:bs], query], -1) + value = self.value_proj(value) + + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + + value = value.reshape(bs*self.num_bev_queue, + num_value, self.num_heads, -1) + + sampling_offsets = self.sampling_offsets(query) + sampling_offsets = sampling_offsets.view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_bev_queue, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_bev_queue, + self.num_levels, + self.num_points) + + attention_weights = attention_weights.permute(0, 3, 1, 2, 4, 5)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points).contiguous() + sampling_offsets = sampling_offsets.permute(0, 3, 1, 2, 4, 5, 6)\ + .reshape(bs*self.num_bev_queue, num_query, self.num_heads, self.num_levels, self.num_points, 2) + + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + + # using fp16 deformable attention is unstable because it performs many sum operations + if value.dtype == torch.float16: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + else: + MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + # output shape (bs*num_bev_queue, num_query, embed_dims) + 
# (bs*num_bev_queue, num_query, embed_dims)-> (num_query, embed_dims, bs*num_bev_queue) + output = output.permute(1, 2, 0) + + # fuse history value and current value + # (num_query, embed_dims, bs*num_bev_queue)-> (num_query, embed_dims, bs, num_bev_queue) + output = output.view(num_query, embed_dims, bs, self.num_bev_queue) + output = output.mean(-1) + + # (num_query, embed_dims, bs)-> (bs, num_query, embed_dims) + output = output.permute(2, 0, 1) + + output = self.output_proj(output) + + if not self.batch_first: + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/mmcv/models/modules/transformer.py b/mmcv/models/modules/transformer.py new file mode 100644 index 0000000..8013048 --- /dev/null +++ b/mmcv/models/modules/transformer.py @@ -0,0 +1,632 @@ +# --------------------------------------------- +# Copyright (c) OpenMMLab. All rights reserved. +# --------------------------------------------- +# Modified by Zhiqi Li +# --------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +from mmcv.models.utils import xavier_init +from mmcv.models.bricks.transformer import build_transformer_layer_sequence +from mmcv.models.backbones.base_module import BaseModule + +from mmcv.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from torchvision.transforms.functional import rotate +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from mmcv.utils import force_fp32, auto_fp16 +from mmcv.models.bricks.registry import ATTENTION +from mmcv.models.bricks.transformer import POSITIONAL_ENCODING, MultiheadAttention + +@TRANSFORMER.register_module() +class UniADPerceptionTransformer(BaseModule): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(UniADPerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + img_metas=None): + """ + obtain bev features. 
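+
+        Returns:
+            Tensor: BEV features of shape (bs, bev_h*bev_w, embed_dims).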
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in img_metas]) + delta_y = np.array([each['can_bus'][1] + for each in img_metas]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in img_metas]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + rotation_angle = img_metas[i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in img_metas]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + img_metas=img_metas, + ) + + return bev_embed + + def get_states_and_refs( + self, + bev_embed, + object_query_embed, + bev_h, + bev_w, + reference_points, + reg_branches=None, + cls_branches=None, + img_metas=None + ): + bs = bev_embed.shape[1] + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + + reference_points = reference_points.unsqueeze(0).expand(bs, -1, -1) + reference_points = reference_points.sigmoid() + + init_reference_out = reference_points + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + 
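+            # reg_branches / cls_branches allow the decoder to refine the boxes layer by layer;
+            # they are only passed when `with_box_refine` is enabled.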
reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + img_metas=img_metas + ) + inter_references_out = inter_references + + return inter_states, init_reference_out, inter_references_out + + +@ATTENTION.register_module() +class GroupFree3DMHA(MultiheadAttention): + """A warpper for torch.nn.MultiheadAttention for GroupFree3D. + + This module implements MultiheadAttention with identity connection, + and positional encoding used in DETR is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. Same as + `nn.MultiheadAttention`. + attn_drop (float): A Dropout layer on attn_output_weights. Default 0.0. + proj_drop (float): A Dropout layer. Default 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='DropOut', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super().__init__(embed_dims, num_heads, attn_drop, proj_drop, + dropout_layer, init_cfg, batch_first, **kwargs) + + def forward(self, + query, + key, + value, + identity, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `GroupFree3DMHA`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims]. Same in `nn.MultiheadAttention.forward`. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims]. Same in `nn.MultiheadAttention.forward`. + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
+        """
+
+        if hasattr(self, 'operation_name'):
+            if self.operation_name == 'self_attn':
+                value = value + query_pos
+            elif self.operation_name == 'cross_attn':
+                value = value + key_pos
+            else:
+                raise NotImplementedError(
+                    f'{self.__class__.__name__} '
+                    f"can't be used as {self.operation_name}")
+        else:
+            value = value + query_pos
+
+        return super(GroupFree3DMHA, self).forward(
+            query=query,
+            key=key,
+            value=value,
+            identity=identity,
+            query_pos=query_pos,
+            key_pos=key_pos,
+            attn_mask=attn_mask,
+            key_padding_mask=key_padding_mask,
+            **kwargs)
+
+
+@POSITIONAL_ENCODING.register_module()
+class ConvBNPositionalEncoding(nn.Module):
+    """Absolute position embedding with Conv learning.
+
+    Args:
+        input_channel (int): input features dim.
+        num_pos_feats (int): output position features dim.
+            Defaults to 288 to be consistent with seed features dim.
+    """
+
+    def __init__(self, input_channel, num_pos_feats=288):
+        super().__init__()
+        self.position_embedding_head = nn.Sequential(
+            nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
+            nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True),
+            nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
+
+    def forward(self, xyz):
+        """Forward pass.
+
+        Args:
+            xyz (Tensor): (B, N, 3) the coordinates to embed.
+
+        Returns:
+            Tensor: (B, num_pos_feats, N) the embedded position features.
+        """
+        xyz = xyz.permute(0, 2, 1)
+        position_embedding = self.position_embedding_head(xyz)
+        return position_embedding
+
+
+@TRANSFORMER.register_module()
+class BEVFormerPerceptionTransformer(BaseModule):
+    """Implements the Detr3D transformer.
+    Args:
+        as_two_stage (bool): Generate query from encoder features.
+            Default: False.
+        num_feature_levels (int): Number of feature maps from FPN.
+            Default: 4.
+        two_stage_num_proposals (int): Number of proposals when set
+            `as_two_stage` as True. Default: 300.
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + decoder=None, + embed_dims=256, + rotate_prev_bev=True, + use_shift=True, + use_can_bus=True, + can_bus_norm=True, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(BEVFormerPerceptionTransformer, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.rotate_prev_bev = rotate_prev_bev + self.use_shift = use_shift + self.use_can_bus = use_can_bus + self.can_bus_norm = can_bus_norm + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.init_layers() + self.rotate_center = rotate_center + + def init_layers(self): + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor( + self.num_feature_levels, self.embed_dims)) + self.cams_embeds = nn.Parameter( + torch.Tensor(self.num_cams, self.embed_dims)) + self.reference_points = nn.Linear(self.embed_dims, 3) + self.can_bus_mlp = nn.Sequential( + nn.Linear(18, self.embed_dims // 2), + nn.ReLU(inplace=True), + nn.Linear(self.embed_dims // 2, self.embed_dims), + nn.ReLU(inplace=True), + ) + if self.can_bus_norm: + self.can_bus_mlp.add_module('norm', nn.LayerNorm(self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + normal_(self.cams_embeds) + xavier_init(self.reference_points, distribution='uniform', bias=0.) + xavier_init(self.can_bus_mlp, distribution='uniform', bias=0.) + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
+ """ + + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + # obtain rotation angle and shift with ego motion + delta_x = np.array([each['can_bus'][0] + for each in kwargs['img_metas']]) + delta_y = np.array([each['can_bus'][1] + for each in kwargs['img_metas']]) + ego_angle = np.array( + [each['can_bus'][-2] / np.pi * 180 for each in kwargs['img_metas']]) + grid_length_y = grid_length[0] + grid_length_x = grid_length[1] + translation_length = np.sqrt(delta_x ** 2 + delta_y ** 2) + translation_angle = np.arctan2(delta_y, delta_x) / np.pi * 180 + bev_angle = ego_angle - translation_angle + shift_y = translation_length * \ + np.cos(bev_angle / 180 * np.pi) / grid_length_y / bev_h + shift_x = translation_length * \ + np.sin(bev_angle / 180 * np.pi) / grid_length_x / bev_w + shift_y = shift_y * self.use_shift + shift_x = shift_x * self.use_shift + shift = bev_queries.new_tensor( + [shift_x, shift_y]).permute(1, 0) # xy, bs -> bs, xy + + if prev_bev is not None: + if prev_bev.shape[1] == bev_h * bev_w: + prev_bev = prev_bev.permute(1, 0, 2) + if self.rotate_prev_bev: + for i in range(bs): + # num_prev_bev = prev_bev.size(1) + rotation_angle = kwargs['img_metas'][i]['can_bus'][-1] + tmp_prev_bev = prev_bev[:, i].reshape( + bev_h, bev_w, -1).permute(2, 0, 1) + tmp_prev_bev = rotate(tmp_prev_bev, rotation_angle, + center=self.rotate_center) + tmp_prev_bev = tmp_prev_bev.permute(1, 2, 0).reshape( + bev_h * bev_w, 1, -1) + prev_bev[:, i] = tmp_prev_bev[:, 0] + + # add can bus signals + can_bus = bev_queries.new_tensor( + [each['can_bus'] for each in kwargs['img_metas']]) # [:, :] + can_bus = self.can_bus_mlp(can_bus)[None, :, :] + bev_queries = bev_queries + can_bus * self.use_can_bus + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, + None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute( + 0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder( + bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=prev_bev, + shift=shift, + **kwargs + ) + + return bev_embed + + @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'object_query_embed', 'prev_bev', 'bev_pos')) + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. 
+ reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. + """ + + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=prev_bev, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/mmcv/models/modules/transformerV2.py b/mmcv/models/modules/transformerV2.py new file mode 100644 index 0000000..41587de --- /dev/null +++ b/mmcv/models/modules/transformerV2.py @@ -0,0 +1,353 @@ +import torch +import torch.nn as nn +from mmcv.cnn import xavier_init +from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence +from mmdet.models.utils.builder import TRANSFORMER +from torch.nn.init import normal_ +from mmcv.runner.base_module import BaseModule +from .temporal_self_attention import TemporalSelfAttention +from .spatial_cross_attention import MSDeformableAttention3D +from .decoder import CustomMSDeformableAttention +from mmcv.cnn import build_norm_layer, build_conv_layer +import torch.utils.checkpoint as checkpoint +from mmdet.models.backbones.resnet import Bottleneck, BasicBlock + + +class ResNetFusion(BaseModule): + def __init__(self, in_channels, out_channels, inter_channels, num_layer, norm_cfg=dict(type='SyncBN'), + with_cp=False): + super(ResNetFusion, self).__init__() + layers = [] + self.inter_channels = inter_channels + for i in range(num_layer): + if i == 0: + if inter_channels == in_channels: + layers.append(BasicBlock(in_channels, inter_channels, stride=1, norm_cfg=norm_cfg)) + 
else: + downsample = nn.Sequential( + build_conv_layer(None, in_channels, inter_channels, 3, stride=1, padding=1, dilation=1, + bias=False), + build_norm_layer(norm_cfg, inter_channels)[1]) + layers.append( + BasicBlock(in_channels, inter_channels, stride=1, norm_cfg=norm_cfg, downsample=downsample)) + else: + layers.append(BasicBlock(inter_channels, inter_channels, stride=1, norm_cfg=norm_cfg)) + self.layers = nn.Sequential(*layers) + self.layer_norm = nn.Sequential( + nn.Linear(inter_channels, out_channels), + nn.LayerNorm(out_channels)) + self.with_cp = with_cp + + def forward(self, x): + x = torch.cat(x, 1).contiguous() + # x should be [1, in_channels, bev_h, bev_w] + for lid, layer in enumerate(self.layers): + if self.with_cp and x.requires_grad: + x = checkpoint.checkpoint(layer, x) + else: + x = layer(x) + x = x.reshape(x.shape[0], x.shape[1], -1).permute(0, 2, 1) # nchw -> n(hw)c + x = self.layer_norm(x) + return x + + +@TRANSFORMER.register_module() +class PerceptionTransformerBEVEncoder(BaseModule): + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + embed_dims=256, + use_cams_embeds=True, + rotate_center=[100, 100], + **kwargs): + super(PerceptionTransformerBEVEncoder, self).__init__(**kwargs) + self.encoder = build_transformer_layer_sequence(encoder) + self.embed_dims = embed_dims + self.num_feature_levels = num_feature_levels + self.num_cams = num_cams + self.fp16_enabled = False + + self.use_cams_embeds = use_cams_embeds + + self.two_stage_num_proposals = two_stage_num_proposals + self.rotate_center = rotate_center + """Initialize layers of the Detr3DTransformer.""" + self.level_embeds = nn.Parameter(torch.Tensor(self.num_feature_levels, self.embed_dims)) + if self.use_cams_embeds: + self.cams_embeds = nn.Parameter(torch.Tensor(self.num_cams, self.embed_dims)) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + normal_(self.level_embeds) + if self.use_cams_embeds: + normal_(self.cams_embeds) + + def forward(self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + """ + obtain bev features. 
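+
+        Returns:
+            Tensor: BEV features of shape (bs, bev_h*bev_w, embed_dims); when
+                `GlobalRotScaleTransImage_param` metadata is present in
+                `img_metas`, the BEV is re-sampled to compensate for that
+                augmentation before being returned.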
+ """ + bs = mlvl_feats[0].size(0) + bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) + bev_pos = bev_pos.flatten(2).permute(2, 0, 1) + + feat_flatten = [] + spatial_shapes = [] + for lvl, feat in enumerate(mlvl_feats): + bs, num_cam, c, h, w = feat.shape + spatial_shape = (h, w) + feat = feat.flatten(3).permute(1, 0, 3, 2) + if self.use_cams_embeds: + feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) + feat = feat + self.level_embeds[None, None, lvl:lvl + 1, :].to(feat.dtype) + spatial_shapes.append(spatial_shape) + feat_flatten.append(feat) + + feat_flatten = torch.cat(feat_flatten, 2) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=bev_pos.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + + feat_flatten = feat_flatten.permute(0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) + + bev_embed = self.encoder(bev_queries, + feat_flatten, + feat_flatten, + bev_h=bev_h, + bev_w=bev_w, + bev_pos=bev_pos, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + prev_bev=None, + shift=bev_queries.new_tensor([0, 0]).unsqueeze(0), + **kwargs) + # rotate current bev to final aligned + prev_bev = bev_embed + if 'aug_param' in kwargs['img_metas'][0] and 'GlobalRotScaleTransImage_param' in kwargs['img_metas'][0][ + 'aug_param']: + rot_angle, scale_ratio, flip_dx, flip_dy, bda_mat, only_gt = kwargs['img_metas'][0]['aug_param'][ + 'GlobalRotScaleTransImage_param'] + prev_bev = prev_bev.reshape(bs, bev_h, bev_w, -1).permute(0, 3, 1, 2) # bchw + if only_gt: + # rot angle + # prev_bev = torchvision.transforms.functional.rotate(prev_bev, -30, InterpolationMode.BILINEAR) + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, bev_h - 0.5, bev_h, dtype=bev_queries.dtype, device=bev_queries.device), + torch.linspace(0.5, bev_w - 0.5, bev_w, dtype=bev_queries.dtype, device=bev_queries.device)) + ref_y = (ref_y / bev_h) + ref_x = (ref_x / bev_w) + grid = torch.stack((ref_x, ref_y), -1) + grid_shift = grid * 2.0 - 1.0 + grid_shift = grid_shift.unsqueeze(0).unsqueeze(-1) + # bda_mat = ( bda_mat[:2, :2] / scale_ratio).to(grid_shift).view(1, 1, 1, 2,2).repeat(grid_shift.shape[0], grid_shift.shape[1], grid_shift.shape[2], 1, 1) + bda_mat = bda_mat[:2, :2].to(grid_shift).view(1, 1, 1, 2, 2).repeat(grid_shift.shape[0], + grid_shift.shape[1], + grid_shift.shape[2], 1, 1) + grid_shift = torch.matmul(bda_mat, grid_shift).squeeze(-1) + # grid_shift = grid_shift / scale_ratio + prev_bev = torch.nn.functional.grid_sample(prev_bev, grid_shift, align_corners=False) + # if flip_dx: + # prev_bev = torch.flip(prev_bev, dims=[-1]) + # if flip_dy: + # prev_bev = torch.flip(prev_bev, dims=[-2]) + prev_bev = prev_bev.reshape(bs, -1, bev_h * bev_w) + prev_bev = prev_bev.permute(0, 2, 1) + return prev_bev + + +@TRANSFORMER.register_module() +class PerceptionTransformerV2(PerceptionTransformerBEVEncoder): + """Implements the Detr3D transformer. + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. 
+ """ + + def __init__(self, + num_feature_levels=4, + num_cams=6, + two_stage_num_proposals=300, + encoder=None, + embed_dims=256, + use_cams_embeds=True, + rotate_center=[100, 100], + frames=(0,), + decoder=None, + num_fusion=3, + inter_channels=None, + **kwargs): + super(PerceptionTransformerV2, self).__init__(num_feature_levels, num_cams, two_stage_num_proposals, encoder, + embed_dims, use_cams_embeds, rotate_center, + **kwargs) + self.decoder = build_transformer_layer_sequence(decoder) + """Initialize layers of the Detr3DTransformer.""" + self.reference_points = nn.Linear(self.embed_dims, 3) + self.frames = frames + if len(self.frames) > 1: + self.fusion = ResNetFusion(len(self.frames) * self.embed_dims, self.embed_dims, + inter_channels if inter_channels is not None else len( + self.frames) * self.embed_dims, + num_fusion) + + def init_weights(self): + """Initialize the transformer weights.""" + super().init_weights() + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformableAttention3D) or isinstance(m, TemporalSelfAttention) \ + or isinstance(m, CustomMSDeformableAttention): + try: + m.init_weight() + except AttributeError: + m.init_weights() + xavier_init(self.reference_points, distribution='uniform', bias=0.) + + def get_bev_features( + self, + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + prev_bev=None, + **kwargs): + return super().forward( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length, + bev_pos, + prev_bev, + **kwargs + ) + + def forward(self, + mlvl_feats, + bev_queries, + object_query_embed, + bev_h, + bev_w, + grid_length=[0.512, 0.512], + bev_pos=None, + reg_branches=None, + cls_branches=None, + prev_bev=None, + **kwargs): + """Forward function for `Detr3DTransformer`. + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, num_cams, embed_dims, h, w]. + bev_queries (Tensor): (bev_h*bev_w, c) + bev_pos (Tensor): (bs, embed_dims, bev_h, bev_w) + object_query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when `with_box_refine` is True. Default to None. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - bev_embed: BEV features + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. 
+ """ + bev_embed = self.get_bev_features( + mlvl_feats, + bev_queries, + bev_h, + bev_w, + grid_length=grid_length, + bev_pos=bev_pos, + prev_bev=None, + **kwargs) # bev_embed shape: bs, bev_h*bev_w, embed_dims + + if len(self.frames) > 1: + cur_ind = list(self.frames).index(0) + assert prev_bev[cur_ind] is None and len(prev_bev) == len(self.frames) + prev_bev[cur_ind] = bev_embed + + # fill prev frame feature + for i in range(1, cur_ind + 1): + if prev_bev[cur_ind - i] is None: + prev_bev[cur_ind - i] = prev_bev[cur_ind - i + 1].detach() + + # fill next frame feature + for i in range(cur_ind + 1, len(self.frames)): + if prev_bev[i] is None: + prev_bev[i] = prev_bev[i - 1].detach() + bev_embed = [x.reshape(x.shape[0], bev_h, bev_w, x.shape[-1]).permute(0, 3, 1, 2).contiguous() for x in + prev_bev] + bev_embed = self.fusion(bev_embed) + + bs = mlvl_feats[0].size(0) + query_pos, query = torch.split( + object_query_embed, self.embed_dims, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos) + reference_points = reference_points.sigmoid() + init_reference_out = reference_points + + query = query.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + bev_embed = bev_embed.permute(1, 0, 2) + + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=bev_embed, + query_pos=query_pos, + reference_points=reference_points, + reg_branches=reg_branches, + cls_branches=cls_branches, + spatial_shapes=torch.tensor([[bev_h, bev_w]], device=query.device), + level_start_index=torch.tensor([0], device=query.device), + **kwargs) + + inter_references_out = inter_references + + return bev_embed, inter_states, init_reference_out, inter_references_out diff --git a/mmcv/models/modules/vote_module.py b/mmcv/models/modules/vote_module.py new file mode 100644 index 0000000..0f7b5d1 --- /dev/null +++ b/mmcv/models/modules/vote_module.py @@ -0,0 +1,181 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv import is_tuple_of +from mmcv.models.bricks import ConvModule +from torch import nn as nn + +from mmcv.models.builder import build_loss + + +class VoteModule(nn.Module): + """Vote module. + + Generate votes from seed point features. + + Args: + in_channels (int): Number of channels of seed point features. + vote_per_seed (int): Number of votes generated from each seed point. + gt_per_seed (int): Number of ground truth votes generated + from each seed point. + num_points (int): Number of points to be used for voting. + conv_channels (tuple[int]): Out channels of vote + generating convolution. + conv_cfg (dict): Config of convolution. + Default: dict(type='Conv1d'). + norm_cfg (dict): Config of normalization. + Default: dict(type='BN1d'). + norm_feats (bool): Whether to normalize features. + Default: True. + with_res_feat (bool): Whether to predict residual features. + Default: True. + vote_xyz_range (list[float], None): The range of points translation. + vote_loss (dict): Config of vote loss. 
+ """ + + def __init__(self, + in_channels, + vote_per_seed=1, + gt_per_seed=3, + num_points=-1, + conv_channels=(16, 16), + conv_cfg=dict(type='Conv1d'), + norm_cfg=dict(type='BN1d'), + act_cfg=dict(type='ReLU'), + norm_feats=True, + with_res_feat=True, + vote_xyz_range=None, + vote_loss=None): + super().__init__() + self.in_channels = in_channels + self.vote_per_seed = vote_per_seed + self.gt_per_seed = gt_per_seed + self.num_points = num_points + self.norm_feats = norm_feats + self.with_res_feat = with_res_feat + + assert vote_xyz_range is None or is_tuple_of(vote_xyz_range, float) + self.vote_xyz_range = vote_xyz_range + + if vote_loss is not None: + self.vote_loss = build_loss(vote_loss) + + prev_channels = in_channels + vote_conv_list = list() + for k in range(len(conv_channels)): + vote_conv_list.append( + ConvModule( + prev_channels, + conv_channels[k], + 1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + bias=True, + inplace=True)) + prev_channels = conv_channels[k] + self.vote_conv = nn.Sequential(*vote_conv_list) + + # conv_out predicts coordinate and residual features + if with_res_feat: + out_channel = (3 + in_channels) * self.vote_per_seed + else: + out_channel = 3 * self.vote_per_seed + self.conv_out = nn.Conv1d(prev_channels, out_channel, 1) + + def forward(self, seed_points, seed_feats): + """forward. + + Args: + seed_points (torch.Tensor): Coordinate of the seed + points in shape (B, N, 3). + seed_feats (torch.Tensor): Features of the seed points in shape + (B, C, N). + + Returns: + tuple[torch.Tensor]: + + - vote_points: Voted xyz based on the seed points \ + with shape (B, M, 3), ``M=num_seed*vote_per_seed``. + - vote_features: Voted features based on the seed points with \ + shape (B, C, M) where ``M=num_seed*vote_per_seed``, \ + ``C=vote_feature_dim``. 
+ """ + if self.num_points != -1: + assert self.num_points < seed_points.shape[1], \ + f'Number of vote points ({self.num_points}) should be '\ + f'smaller than seed points size ({seed_points.shape[1]})' + seed_points = seed_points[:, :self.num_points] + seed_feats = seed_feats[..., :self.num_points] + + batch_size, feat_channels, num_seed = seed_feats.shape + num_vote = num_seed * self.vote_per_seed + x = self.vote_conv(seed_feats) + # (batch_size, (3+out_dim)*vote_per_seed, num_seed) + votes = self.conv_out(x) + + votes = votes.transpose(2, 1).view(batch_size, num_seed, + self.vote_per_seed, -1) + + offset = votes[:, :, :, 0:3] + if self.vote_xyz_range is not None: + limited_offset_list = [] + for axis in range(len(self.vote_xyz_range)): + limited_offset_list.append(offset[..., axis].clamp( + min=-self.vote_xyz_range[axis], + max=self.vote_xyz_range[axis])) + limited_offset = torch.stack(limited_offset_list, -1) + vote_points = (seed_points.unsqueeze(2) + + limited_offset).contiguous() + else: + vote_points = (seed_points.unsqueeze(2) + offset).contiguous() + vote_points = vote_points.view(batch_size, num_vote, 3) + offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1) + + if self.with_res_feat: + res_feats = votes[:, :, :, 3:] + vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) + + res_feats).contiguous() + vote_feats = vote_feats.view(batch_size, + num_vote, feat_channels).transpose( + 2, 1).contiguous() + + if self.norm_feats: + features_norm = torch.norm(vote_feats, p=2, dim=1) + vote_feats = vote_feats.div(features_norm.unsqueeze(1)) + else: + vote_feats = seed_feats + return vote_points, vote_feats, offset + + def get_loss(self, seed_points, vote_points, seed_indices, + vote_targets_mask, vote_targets): + """Calculate loss of voting module. + + Args: + seed_points (torch.Tensor): Coordinate of the seed points. + vote_points (torch.Tensor): Coordinate of the vote points. + seed_indices (torch.Tensor): Indices of seed points in raw points. + vote_targets_mask (torch.Tensor): Mask of valid vote targets. + vote_targets (torch.Tensor): Targets of votes. + + Returns: + torch.Tensor: Weighted vote loss. 
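[Editor's aside: before the loss implementation that follows, a tiny standalone illustration of the `torch.gather` pattern it relies on to pick per-seed ground-truth votes. All sizes are made up.]

```python
# seed_indices selects, for each seed, its row of vote targets; the index is
# expanded to cover all 3 * gt_per_seed target coordinates.
import torch

B, num_points, num_seed, gt_per_seed = 1, 6, 3, 2
vote_targets = torch.arange(B * num_points * 3 * gt_per_seed, dtype=torch.float32)
vote_targets = vote_targets.reshape(B, num_points, 3 * gt_per_seed)
seed_indices = torch.tensor([[0, 2, 5]])                   # (B, num_seed)

idx = seed_indices.unsqueeze(-1).repeat(1, 1, 3 * gt_per_seed)
seed_gt_votes = torch.gather(vote_targets, 1, idx)         # (B, num_seed, 3 * gt_per_seed)
print(seed_gt_votes.shape)                                 # torch.Size([1, 3, 6])
```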
+ """ + batch_size, num_seed = seed_points.shape[:2] + + seed_gt_votes_mask = torch.gather(vote_targets_mask, 1, + seed_indices).float() + + seed_indices_expand = seed_indices.unsqueeze(-1).repeat( + 1, 1, 3 * self.gt_per_seed) + seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand) + seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed) + + weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6) + distance = self.vote_loss( + vote_points.view(batch_size * num_seed, -1, 3), + seed_gt_votes.view(batch_size * num_seed, -1, 3), + dst_weight=weight.view(batch_size * num_seed, 1))[1] + vote_loss = torch.sum(torch.min(distance, dim=1)[0]) + + return vote_loss diff --git a/mmcv/models/necks/__init__.py b/mmcv/models/necks/__init__.py new file mode 100644 index 0000000..614ac35 --- /dev/null +++ b/mmcv/models/necks/__init__.py @@ -0,0 +1,24 @@ +# from .bfp import BFP +# from .channel_mapper import ChannelMapper +# from .ct_resnet_neck import CTResNetNeck +# from .dilated_encoder import DilatedEncoder +# from .fpg import FPG +from .fpn import FPN +# from .fpn_carafe import FPN_CARAFE +# from .multilevel_neck import MultiLevelNeck +# from .hrfpn import HRFPN +# from .nas_fpn import NASFPN +# from .nasfcos_fpn import NASFCOS_FPN +# from .pafpn import PAFPN +# from .rfp import RFP +# from .ssd_neck import SSDNeck +# from .yolo_neck import YOLOV3Neck +# from .imvoxel_neck import OutdoorImVoxelNeck +# from .second_fpn import SECONDFPN + +# __all__ = [ +# 'FPN', 'BFP', 'ChannelMapper', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN', +# 'NASFCOS_FPN', 'RFP', 'YOLOV3Neck', 'FPG', 'DilatedEncoder', +# 'CTResNetNeck', 'SSDNeck', 'SECONDFPN', 'OutdoorImVoxelNeck', +# 'MultiLevelNeck' +# ] diff --git a/mmcv/models/necks/fpn.py b/mmcv/models/necks/fpn.py new file mode 100644 index 0000000..5ef8b03 --- /dev/null +++ b/mmcv/models/necks/fpn.py @@ -0,0 +1,203 @@ +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models.bricks import ConvModule +from mmcv.models.backbones import BaseModule +from mmcv.utils import auto_fp16 + +from ..builder import NECKS + + +@NECKS.register_module() +class FPN(BaseModule): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. 
+ act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(mode='nearest')` + init_cfg (dict or list[dict], optional): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest'), + init_cfg=dict( + type='Xavier', layer='Conv2d', distribution='uniform')): + super(FPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + self.add_extra_convs = 'on_input' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + @auto_fp16() + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, 
lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] += F.interpolate(laterals[i], + **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmcv/models/opt/__init__.py b/mmcv/models/opt/__init__.py new file mode 100644 index 0000000..c7dd426 --- /dev/null +++ b/mmcv/models/opt/__init__.py @@ -0,0 +1 @@ +from .adamw import AdamW2 \ No newline at end of file diff --git a/mmcv/models/opt/adamw.py b/mmcv/models/opt/adamw.py new file mode 100644 index 0000000..6b6f358 --- /dev/null +++ b/mmcv/models/opt/adamw.py @@ -0,0 +1,131 @@ +try: + from torch.optim import _functional as F +except: + print('WARNING!!!, I recommend using torch>=1.8') + +import torch +from torch.optim.optimizer import Optimizer +from mmcv.optims import OPTIMIZERS + +@OPTIMIZERS.register_module() +class AdamW2(Optimizer): + r"""Implements AdamW algorithm. Solve the bug of torch 1.8 + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW2, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW2, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + + # put this line here for solving bug + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + F.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad, + beta1, + beta2, + group['lr'], + group['weight_decay'], + group['eps']) + + return loss \ No newline at end of file diff --git a/mmcv/models/roi_heads/mask_heads/__init__.py b/mmcv/models/roi_heads/mask_heads/__init__.py new file mode 100644 index 0000000..89ed5bc --- /dev/null +++ b/mmcv/models/roi_heads/mask_heads/__init__.py @@ -0,0 +1 @@ +from .fused_semantic_head import FusedSemanticHead \ No newline at end of file diff --git a/mmcv/models/roi_heads/mask_heads/fused_semantic_head.py b/mmcv/models/roi_heads/mask_heads/fused_semantic_head.py new file mode 100644 index 0000000..deb6810 --- /dev/null +++ b/mmcv/models/roi_heads/mask_heads/fused_semantic_head.py @@ -0,0 +1,107 @@ +import torch.nn as nn +import torch.nn.functional as F +from mmcv.models import ConvModule +from mmcv.models.backbones import BaseModule +from mmcv.utils import auto_fp16, force_fp32 + +from mmcv.models.builder import HEADS + + +@HEADS.register_module() +class FusedSemanticHead(BaseModule): + r"""Multi-level fused semantic segmentation head. + + .. code-block:: none + + in_1 -> 1x1 conv --- + | + in_2 -> 1x1 conv -- | + || + in_3 -> 1x1 conv - || + ||| /-> 1x1 conv (mask prediction) + in_4 -> 1x1 conv -----> 3x3 convs (*4) + | \-> 1x1 conv (feature) + in_5 -> 1x1 conv --- + """ # noqa: W605 + + def __init__(self, + num_ins, + fusion_level, + num_convs=4, + in_channels=256, + conv_out_channels=256, + num_classes=183, + ignore_label=255, + loss_weight=0.2, + conv_cfg=None, + norm_cfg=None, + init_cfg=dict( + type='Kaiming', override=dict(name='conv_logits'))): + super(FusedSemanticHead, self).__init__(init_cfg) + self.num_ins = num_ins + self.fusion_level = fusion_level + self.num_convs = num_convs + self.in_channels = in_channels + self.conv_out_channels = conv_out_channels + self.num_classes = num_classes + self.ignore_label = ignore_label + self.loss_weight = loss_weight + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self.lateral_convs = nn.ModuleList() + for i in range(self.num_ins): + self.lateral_convs.append( + ConvModule( + self.in_channels, + self.in_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=False)) + + self.convs = nn.ModuleList() + for i in range(self.num_convs): + in_channels = self.in_channels if i == 0 else conv_out_channels + self.convs.append( + ConvModule( + in_channels, + conv_out_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg)) + self.conv_embedding = ConvModule( + conv_out_channels, + conv_out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg) + self.conv_logits = nn.Conv2d(conv_out_channels, self.num_classes, 1) + + self.criterion = nn.CrossEntropyLoss(ignore_index=ignore_label) + + @auto_fp16() + def forward(self, feats): + x = self.lateral_convs[self.fusion_level](feats[self.fusion_level]) + fused_size = tuple(x.shape[-2:]) + for i, feat in enumerate(feats): + if i != self.fusion_level: + feat = F.interpolate( + feat, size=fused_size, mode='bilinear', align_corners=True) + x += 
self.lateral_convs[i](feat) + + for i in range(self.num_convs): + x = self.convs[i](x) + + mask_pred = self.conv_logits(x) + x = self.conv_embedding(x) + return mask_pred, x + + @force_fp32(apply_to=('mask_pred', )) + def loss(self, mask_pred, labels): + labels = labels.squeeze(1).long() + loss_semantic_seg = self.criterion(mask_pred, labels) + loss_semantic_seg *= self.loss_weight + return loss_semantic_seg diff --git a/mmcv/models/segmentors/__init__.py b/mmcv/models/segmentors/__init__.py new file mode 100644 index 0000000..9358b86 --- /dev/null +++ b/mmcv/models/segmentors/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import Base3DSegmentor, BaseSegmentor +# from .cascade_encoder_decoder import CascadeEncoderDecoder +# from .encoder_decoder import EncoderDecoder3D, EncoderDecoder + +# __all__ = ['Base3DSegmentor', 'EncoderDecoder3D', +# 'BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] diff --git a/mmcv/models/segmentors/base.py b/mmcv/models/segmentors/base.py new file mode 100644 index 0000000..742e16b --- /dev/null +++ b/mmcv/models/segmentors/base.py @@ -0,0 +1,379 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +import numpy as np +import torch +import torch.distributed as dist +from mmcv.parallel import DataContainer as DC +from mmcv.models.backbones.base_module import BaseModule +from mmcv.utils import auto_fp16 +from os import path as osp +from abc import ABCMeta, abstractmethod +from collections import OrderedDict +from mmcv.image import imread, imwrite +from mmcv.utils import is_list_of +from mmcv.core.visualizer import show_seg_result +from mmcv.core.visualization import imshow + + +class BaseSegmentor(BaseModule, metaclass=ABCMeta): + """Base class for segmentors.""" + + def __init__(self, init_cfg=None): + super(BaseSegmentor, self).__init__(init_cfg) + self.fp16_enabled = False + + @property + def with_neck(self): + """bool: whether the segmentor has neck""" + return hasattr(self, 'neck') and self.neck is not None + + @property + def with_auxiliary_head(self): + """bool: whether the segmentor has auxiliary head""" + return hasattr(self, + 'auxiliary_head') and self.auxiliary_head is not None + + @property + def with_decode_head(self): + """bool: whether the segmentor has decode head""" + return hasattr(self, 'decode_head') and self.decode_head is not None + + @abstractmethod + def extract_feat(self, imgs): + """Placeholder for extract features from images.""" + pass + + @abstractmethod + def encode_decode(self, img, img_metas): + """Placeholder for encode images with backbone and decode into a + semantic segmentation map of the same size as input.""" + pass + + @abstractmethod + def forward_train(self, imgs, img_metas, **kwargs): + """Placeholder for Forward function for training.""" + pass + + @abstractmethod + def simple_test(self, img, img_meta, **kwargs): + """Placeholder for single image test.""" + pass + + @abstractmethod + def aug_test(self, imgs, img_metas, **kwargs): + """Placeholder for augmentation test.""" + pass + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. 
+ """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got ' + f'{type(var)}') + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(imgs)}) != ' + f'num of image meta ({len(img_metas)})') + # all images in the same aug batch all of the same ori_shape and pad + # shape + for img_meta in img_metas: + ori_shapes = [_['ori_shape'] for _ in img_meta] + assert all(shape == ori_shapes[0] for shape in ori_shapes) + img_shapes = [_['img_shape'] for _ in img_meta] + assert all(shape == img_shapes[0] for shape in img_shapes) + pad_shapes = [_['pad_shape'] for _ in img_meta] + assert all(shape == pad_shapes[0] for shape in pad_shapes) + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=('img', )) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. + + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self(**data_batch) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(data_batch['img_metas'])) + + return outputs + + def val_step(self, data_batch, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. + """ + output = self(**data_batch, **kwargs) + return output + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. 
+ + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor + which may be a weighted sum of all losses, log_vars contains + all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + def show_result(self, + img, + result, + palette=None, + win_name='', + show=False, + wait_time=0, + out_file=None, + opacity=0.5): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (Tensor): The semantic segmentation results to draw over + `img`. + palette (list[list[int]]] | np.ndarray | None): The palette of + segmentation map. If None is given, random palette will be + generated. Default: None + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. + Returns: + img (Tensor): Only if not `show` or `out_file` + """ + img = imread(img) + img = img.copy() + seg = result[0] + if palette is None: + if self.PALETTE is None: + palette = np.random.randint( + 0, 255, size=(len(self.CLASSES), 3)) + else: + palette = self.PALETTE + palette = np.array(palette) + assert palette.shape[0] == len(self.CLASSES) + assert palette.shape[1] == 3 + assert len(palette.shape) == 2 + assert 0 < opacity <= 1.0 + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + # convert to BGR + color_seg = color_seg[..., ::-1] + + img = img * (1 - opacity) + color_seg * opacity + img = img.astype(np.uint8) + # if out_file specified, do not show image in window + if out_file is not None: + show = False + + if show: + imshow(img, win_name, wait_time) + if out_file is not None: + imwrite(img, out_file) + + if not (show or out_file): + warnings.warn('show==False and out_file is not specified, only ' + 'result image will be returned') + return img + +class Base3DSegmentor(BaseSegmentor): + """Base class for 3D segmentors. + + The main difference with `BaseSegmentor` is that we modify the keys in + data_dict and use a 3D seg specific visualization function. + """ + + @property + def with_regularization_loss(self): + """bool: whether the segmentor has regularization loss for weight""" + return hasattr(self, 'loss_regularization') and \ + self.loss_regularization is not None + + def forward_test(self, points, img_metas, **kwargs): + """Calls either simple_test or aug_test depending on the length of + outer list of points. If len(points) == 1, call simple_test. Otherwise + call aug_test to aggregate the test results by e.g. voting. 
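[Editor's aside: a single-process sketch of the loss-parsing convention implemented in `_parse_losses` above. Keys containing 'loss' are summed into the total for backprop; everything else is only logged. Values are made up and the distributed all-reduce is omitted.]

```python
# Every log_vars entry whose key contains 'loss' contributes to the total loss;
# list entries are reduced with .mean() and summed first.
import torch
from collections import OrderedDict

losses = {
    'loss_cls': torch.tensor(0.7),
    'loss_bbox': [torch.tensor(0.2), torch.tensor(0.1)],  # -> 0.3
    'acc': torch.tensor(0.9),                             # logged, not back-propagated
}

log_vars = OrderedDict()
for name, value in losses.items():
    if isinstance(value, torch.Tensor):
        log_vars[name] = value.mean()
    else:
        log_vars[name] = sum(v.mean() for v in value)

loss = sum(v for k, v in log_vars.items() if 'loss' in k)
print(float(loss))   # ~1.0
```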
+ + Args: + points (list[list[torch.Tensor]]): the outer list indicates + test-time augmentations and inner torch.Tensor should have a + shape BXNxC, which contains all points in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(points, 'points'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got {type(var)}') + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(points)}) != ' + f'num of image meta ({len(img_metas)})') + + if num_augs == 1: + return self.simple_test(points[0], img_metas[0], **kwargs) + else: + return self.aug_test(points, img_metas, **kwargs) + + @auto_fp16(apply_to=('points')) + def forward(self, return_loss=True, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note this setting will change the expected inputs. When + `return_loss=True`, point and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, point and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + if return_loss: + return self.forward_train(**kwargs) + else: + return self.forward_test(**kwargs) + + def show_results(self, + data, + result, + palette=None, + out_dir=None, + ignore_index=None): + """Results visualization. + + Args: + data (list[dict]): Input points and the information of the sample. + result (list[dict]): Prediction results. + palette (list[list[int]]] | np.ndarray | None): The palette of + segmentation map. If None is given, random palette will be + generated. Default: None + out_dir (str): Output directory of visualization result. + ignore_index (int, optional): The label index to be ignored, e.g. + unannotated points. If None is given, set to len(self.CLASSES). + Defaults to None. + """ + assert out_dir is not None, 'Expect out_dir, got none.' 
+ if palette is None: + if self.PALETTE is None: + palette = np.random.randint( + 0, 255, size=(len(self.CLASSES), 3)) + else: + palette = self.PALETTE + palette = np.array(palette) + for batch_id in range(len(result)): + if isinstance(data['points'][0], DC): + points = data['points'][0]._data[0][batch_id].numpy() + elif is_list_of(data['points'][0], torch.Tensor): + points = data['points'][0][batch_id] + else: + ValueError(f"Unsupported data type {type(data['points'][0])} " + f'for visualization!') + if isinstance(data['img_metas'][0], DC): + pts_filename = data['img_metas'][0]._data[0][batch_id][ + 'pts_filename'] + elif is_list_of(data['img_metas'][0], dict): + pts_filename = data['img_metas'][0][batch_id]['pts_filename'] + else: + ValueError( + f"Unsupported data type {type(data['img_metas'][0])} " + f'for visualization!') + file_name = osp.split(pts_filename)[-1].split('.')[0] + + pred_sem_mask = result[batch_id]['semantic_mask'].cpu().numpy() + + show_seg_result( + points, + None, + pred_sem_mask, + out_dir, + file_name, + palette, + ignore_index, + show=True) \ No newline at end of file diff --git a/mmcv/models/utils/__init__.py b/mmcv/models/utils/__init__.py new file mode 100644 index 0000000..6b9c3c9 --- /dev/null +++ b/mmcv/models/utils/__init__.py @@ -0,0 +1,25 @@ +from .builder import build_linear_layer, build_transformer +from .positional_encoding import (LearnedPositionalEncoding, + SinePositionalEncoding) +from .res_layer import ResLayer, SimplifiedBasicBlock +from .transformer import (DetrTransformerDecoder, DetrTransformerDecoderLayer, + DynamicConv, Transformer) +from .grid_mask import GridMask +from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, + KaimingInit, NormalInit, PretrainedInit, + TruncNormalInit, UniformInit, XavierInit, + bias_init_with_prob, caffe2_xavier_init, + constant_init, initialize, kaiming_init, normal_init, + trunc_normal_init, uniform_init, xavier_init) +from .fuse_conv_bn import fuse_conv_bn + + +# __all__ = [ +# 'ResLayer', 'gaussian_radius', 'gen_gaussian_target', +# 'DetrTransformerDecoderLayer', 'DetrTransformerDecoder', 'Transformer', +# 'build_transformer', 'build_linear_layer', 'SinePositionalEncoding', +# 'LearnedPositionalEncoding', 'DynamicConv', 'SimplifiedBasicBlock', +# 'NormedLinear', 'NormedConv2d', 'make_divisible', 'InvertedResidual', +# 'SELayer','clip_sigmoid', 'MLP', 'run_time', 'GridMask', 'SelfAttentionBlock', +# 'UpConvBlock', 'InvertedResidualV3', 'DropPath', 'trunc_normal_' +# ] diff --git a/mmcv/models/utils/builder.py b/mmcv/models/utils/builder.py new file mode 100644 index 0000000..fdcff09 --- /dev/null +++ b/mmcv/models/utils/builder.py @@ -0,0 +1,46 @@ +import torch.nn as nn +from mmcv.utils import Registry, build_from_cfg + +TRANSFORMER = Registry('Transformer') +LINEAR_LAYERS = Registry('linear layers') + + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) + + +LINEAR_LAYERS.register_module('Linear', module=nn.Linear) + + +def build_linear_layer(cfg, *args, **kwargs): + """Build linear layer. + Args: + cfg (None or dict): The linear layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an linear layer. + args (argument list): Arguments passed to the `__init__` + method of the corresponding linear layer. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding linear layer. 
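[Editor's aside: a hedged usage sketch for `build_linear_layer`, whose implementation continues below. The import path is assumed from the file location in this diff (`mmcv/models/utils/builder.py`).]

```python
# cfg=None falls back to a plain nn.Linear; an explicit cfg dict selects the
# registered layer type by name.
import torch
from mmcv.models.utils.builder import build_linear_layer  # path assumed from this diff

fc = build_linear_layer(None, 256, 10)
print(type(fc).__name__, fc(torch.randn(2, 256)).shape)   # Linear torch.Size([2, 10])

fc2 = build_linear_layer(dict(type='Linear'), 256, 81)
```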
+ Returns: + nn.Module: Created linear layer. + """ + if cfg is None: + cfg_ = dict(type='Linear') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in LINEAR_LAYERS: + raise KeyError(f'Unrecognized linear type {layer_type}') + else: + linear_layer = LINEAR_LAYERS.get(layer_type) + + layer = linear_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/mmcv/models/utils/functional.py b/mmcv/models/utils/functional.py new file mode 100644 index 0000000..b4ae933 --- /dev/null +++ b/mmcv/models/utils/functional.py @@ -0,0 +1,141 @@ +import math +import torch +from einops import rearrange, repeat + +def bivariate_gaussian_activation(ip): + """ + Activation function to output parameters of bivariate Gaussian distribution. + + Args: + ip (torch.Tensor): Input tensor. + + Returns: + torch.Tensor: Output tensor containing the parameters of the bivariate Gaussian distribution. + """ + mu_x = ip[..., 0:1] + mu_y = ip[..., 1:2] + sig_x = ip[..., 2:3] + sig_y = ip[..., 3:4] + rho = ip[..., 4:5] + sig_x = torch.exp(sig_x) + sig_y = torch.exp(sig_y) + rho = torch.tanh(rho) + out = torch.cat([mu_x, mu_y, sig_x, sig_y, rho], dim=-1) + return out + +def norm_points(pos, pc_range): + """ + Normalize the end points of a given position tensor. + + Args: + pos (torch.Tensor): Input position tensor. + pc_range (List[float]): Point cloud range. + + Returns: + torch.Tensor: Normalized end points tensor. + """ + x_norm = (pos[..., 0] - pc_range[0]) / (pc_range[3] - pc_range[0]) + y_norm = (pos[..., 1] - pc_range[1]) / (pc_range[4] - pc_range[1]) + return torch.stack([x_norm, y_norm], dim=-1) + +def pos2posemb2d(pos, num_pos_feats=128, temperature=10000): + """ + Convert 2D position into positional embeddings. + + Args: + pos (torch.Tensor): Input 2D position tensor. + num_pos_feats (int, optional): Number of positional features. Default is 128. + temperature (int, optional): Temperature factor for positional embeddings. Default is 10000. + + Returns: + torch.Tensor: Positional embeddings tensor. + """ + scale = 2 * math.pi + pos = pos * scale + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + pos_x = pos[..., 0, None] / dim_t + pos_y = pos[..., 1, None] / dim_t + pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) + pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) + posemb = torch.cat((pos_y, pos_x), dim=-1) + return posemb + +def rot_2d(yaw): + """ + Compute 2D rotation matrix for a given yaw angle tensor. + + Args: + yaw (torch.Tensor): Input yaw angle tensor. + + Returns: + torch.Tensor: 2D rotation matrix tensor. + """ + sy, cy = torch.sin(yaw), torch.cos(yaw) + out = torch.stack([torch.stack([cy, -sy]), torch.stack([sy, cy])]).permute([2,0,1]) + return out + +def anchor_coordinate_transform(anchors, bbox_results, with_translation_transform=True, with_rotation_transform=True): + """ + Transform anchor coordinates with respect to detected bounding boxes in the batch. + + Args: + anchors (torch.Tensor): A tensor containing the k-means anchor values. + bbox_results (List[Tuple[torch.Tensor]]): A list of tuples containing the bounding box results for each image in the batch. + with_translate (bool, optional): Whether to perform translation transformation. 
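[Editor's aside: a tiny sanity check for the batched 2-D rotation helper `rot_2d` defined above, re-implemented inline so the snippet runs standalone.]

```python
# rot_2d maps a batch of yaw angles (N,) to rotation matrices (N, 2, 2);
# rotating the x-axis by pi/2 lands on the y-axis (up to float error).
import torch

def rot_2d(yaw):
    sy, cy = torch.sin(yaw), torch.cos(yaw)
    return torch.stack([torch.stack([cy, -sy]), torch.stack([sy, cy])]).permute([2, 0, 1])

yaw = torch.tensor([0.0, torch.pi / 2])
R = rot_2d(yaw)                      # (2, 2, 2): one rotation matrix per angle
x_axis = torch.tensor([1.0, 0.0])
print(R[1] @ x_axis)                 # ~tensor([0., 1.])
```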
Defaults to True. + with_rot (bool, optional): Whether to perform rotation transformation. Defaults to True. + + Returns: + torch.Tensor: A tensor containing the transformed anchor coordinates. + """ + batch_size = len(bbox_results) + batched_anchors = [] + transformed_anchors = anchors[None, ...] # expand num agents: num_groups, num_modes, 12, 2 -> 1, ... + for i in range(batch_size): + bboxes, scores, labels, bbox_index, mask = bbox_results[i] + yaw = bboxes.yaw.to(transformed_anchors.device) + bbox_centers = bboxes.gravity_center.to(transformed_anchors.device) + if with_rotation_transform: + angle = yaw - 3.1415953 # num_agents, 1 + rot_yaw = rot_2d(angle) # num_agents, 2, 2 + rot_yaw = rot_yaw[:, None, None,:, :] # num_agents, 1, 1, 2, 2 + transformed_anchors = rearrange(transformed_anchors, 'b g m t c -> b g m c t') # 1, num_groups, num_modes, 12, 2 -> 1, num_groups, num_modes, 2, 12 + transformed_anchors = torch.matmul(rot_yaw, transformed_anchors)# -> num_agents, num_groups, num_modes, 12, 2 + transformed_anchors = rearrange(transformed_anchors, 'b g m c t -> b g m t c') + if with_translation_transform: + transformed_anchors = bbox_centers[:, None, None, None, :2] + transformed_anchors + batched_anchors.append(transformed_anchors) + return torch.stack(batched_anchors) + + +def trajectory_coordinate_transform(trajectory, bbox_results, with_translation_transform=True, with_rotation_transform=True): + """ + Transform trajectory coordinates with respect to detected bounding boxes in the batch. + Args: + trajectory (torch.Tensor): predicted trajectory. + bbox_results (List[Tuple[torch.Tensor]]): A list of tuples containing the bounding box results for each image in the batch. + with_translate (bool, optional): Whether to perform translation transformation. Defaults to True. + with_rot (bool, optional): Whether to perform rotation transformation. Defaults to True. + + Returns: + torch.Tensor: A tensor containing the transformed trajectory coordinates. + """ + batch_size = len(bbox_results) + batched_trajectories = [] + for i in range(batch_size): + bboxes, scores, labels, bbox_index, mask = bbox_results[i] + yaw = bboxes.yaw.to(trajectory.device) + bbox_centers = bboxes.gravity_center.to(trajectory.device) + transformed_trajectory = trajectory[i,...] + if with_rotation_transform: + # we take negtive here, to reverse the trajectory back to ego centric coordinate + angle = -(yaw - 3.1415953) + rot_yaw = rot_2d(angle) + rot_yaw = rot_yaw[:,None, None,:, :] # A, 1, 1, 2, 2 + transformed_trajectory = rearrange(transformed_trajectory, 'a g p t c -> a g p c t') # A, G, P, 12 ,2 -> # A, G, P, 2, 12 + transformed_trajectory = torch.matmul(rot_yaw, transformed_trajectory)# -> A, G, P, 12, 2 + transformed_trajectory = rearrange(transformed_trajectory, 'a g p c t -> a g p t c') + if with_translation_transform: + transformed_trajectory = bbox_centers[:, None, None, None, :2] + transformed_trajectory + batched_trajectories.append(transformed_trajectory) + return torch.stack(batched_trajectories) \ No newline at end of file diff --git a/mmcv/models/utils/fuse_conv_bn.py b/mmcv/models/utils/fuse_conv_bn.py new file mode 100644 index 0000000..cb7076f --- /dev/null +++ b/mmcv/models/utils/fuse_conv_bn.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +def _fuse_conv_bn(conv, bn): + """Fuse conv and bn into one module. + + Args: + conv (nn.Module): Conv to be fused. + bn (nn.Module): BN to be fused. + + Returns: + nn.Module: Fused module. 
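[Editor's aside: a standalone numerical check of the standard conv+BN folding that `_fuse_conv_bn` performs (its body follows below). No repo imports are needed; in eval mode the fused conv matches conv followed by BN.]

```python
# Fold BN running statistics into the conv weights/bias and compare outputs.
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, 3, padding=1, bias=False)
bn = nn.BatchNorm2d(8)
conv.eval(); bn.eval()
bn.running_mean.uniform_(-1, 1); bn.running_var.uniform_(0.5, 2.0)

x = torch.randn(1, 3, 16, 16)
ref = bn(conv(x))

factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
fused = nn.Conv2d(3, 8, 3, padding=1, bias=True)
with torch.no_grad():
    fused.weight.copy_(conv.weight * factor.reshape(-1, 1, 1, 1))
    fused.bias.copy_((torch.zeros(8) - bn.running_mean) * factor + bn.bias)

print(torch.allclose(fused(x), ref, atol=1e-5))   # True
```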
+ """ + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_conv_bn(module): + """Recursively fuse conv and bn in a module. + + During inference, the functionary of batch norm layers is turned off + but only the mean and var alone channels are used, which exposes the + chance to fuse it with the preceding conv layers to save computations and + simplify network structures. + + Args: + module (nn.Module): Module to be fused. + + Returns: + nn.Module: Fused module. + """ + last_conv = None + last_conv_name = None + + for name, child in module.named_children(): + if isinstance(child, + (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = _fuse_conv_bn(last_conv, child) + module._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. + module._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_conv_bn(child) + return module diff --git a/mmcv/models/utils/grid_mask.py b/mmcv/models/utils/grid_mask.py new file mode 100755 index 0000000..2986e52 --- /dev/null +++ b/mmcv/models/utils/grid_mask.py @@ -0,0 +1,124 @@ +import torch +import torch.nn as nn +import numpy as np +from PIL import Image +from mmcv.utils import force_fp32, auto_fp16 + +class Grid(object): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode=mode + self.st_prob = prob + self.prob = prob + + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch + + def __call__(self, img, label): + if np.random.rand() > self.prob: + return img, label + h = img.size(1) + w = img.size(2) + self.d1 = 2 + self.d2 = min(h, w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(self.d1, self.d2) + if self.ratio == 1: + self.l = np.random.randint(1, d) + else: + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).float() + if self.mode == 1: + mask = 1-mask + + mask = mask.expand_as(img) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = (1 - mask) * offset + img = img * mask + offset + else: + img = img * mask + + return img, label + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.fp16_enable 
= False + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + @auto_fp16() + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n,c,h,w = x.size() + x = x.view(-1,h,w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(2, h) + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).to(x.dtype).cuda() + if self.mode == 1: + mask = 1-mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n,c,h,w) \ No newline at end of file diff --git a/mmcv/models/utils/positional_encoding.py b/mmcv/models/utils/positional_encoding.py new file mode 100644 index 0000000..785516f --- /dev/null +++ b/mmcv/models/utils/positional_encoding.py @@ -0,0 +1,162 @@ +import math + +import torch +import torch.nn as nn +from mmcv.models.bricks.transformer import POSITIONAL_ENCODING +from mmcv.models.backbones.base_module import BaseModule + + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding(BaseModule): + """Position encoding with sine and cosine functions. + + See `End-to-End Object Detection with Transformers + `_ for details. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__(self, + num_feats, + temperature=10000, + normalize=False, + scale=2 * math.pi, + eps=1e-6, + offset=0., + init_cfg=None): + super(SinePositionalEncoding, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), 'when normalize is set,' \ + 'scale should be provided and in float or int type, ' \ + f'found {type(scale)}' + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. 
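[Editor's aside: a hedged usage sketch for `SinePositionalEncoding` as defined above; the import path is assumed from the file location in this diff, and an all-zero mask means every position is valid.]

```python
# The returned embedding has num_feats * 2 channels at the mask's resolution.
import torch
from mmcv.models.utils.positional_encoding import SinePositionalEncoding  # path assumed

pe = SinePositionalEncoding(num_feats=128, normalize=True)
mask = torch.zeros(1, 25, 50, dtype=torch.bool)    # (bs, h, w), 0 = valid pixel
pos = pe(mask)
print(pos.shape)                                   # torch.Size([1, 256, 25, 50])
```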
+ """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + y_embed = (y_embed + self.offset) / \ + (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / \ + (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange( + self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, H, W = mask.size() + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), + dim=4).view(B, H, W, -1) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'temperature={self.temperature}, ' + repr_str += f'normalize={self.normalize}, ' + repr_str += f'scale={self.scale}, ' + repr_str += f'eps={self.eps})' + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class LearnedPositionalEncoding(BaseModule): + """Position embedding with learnable embedding weights. + + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, + num_feats, + row_num_embed=50, + col_num_embed=50, + init_cfg=dict(type='Uniform', layer='Embedding')): + super(LearnedPositionalEncoding, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. 
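[Editor's aside: the learned variant above follows the same mask-in, embedding-out contract; a short hedged sketch (import path assumed from this diff). Feature-map sizes must not exceed `row_num_embed` / `col_num_embed`.]

```python
# Only the mask's shape and device are used by the forward pass.
import torch
from mmcv.models.utils.positional_encoding import LearnedPositionalEncoding  # path assumed

pe = LearnedPositionalEncoding(num_feats=128, row_num_embed=50, col_num_embed=50)
mask = torch.zeros(2, 32, 40)
pos = pe(mask)
print(pos.shape)                      # torch.Size([2, 256, 32, 40])
```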
+ """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = torch.cat( + (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( + 1, w, 1)), + dim=-1).permute(2, 0, + 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f'(num_feats={self.num_feats}, ' + repr_str += f'row_num_embed={self.row_num_embed}, ' + repr_str += f'col_num_embed={self.col_num_embed})' + return repr_str diff --git a/mmcv/models/utils/res_layer.py b/mmcv/models/utils/res_layer.py new file mode 100644 index 0000000..03ca1f2 --- /dev/null +++ b/mmcv/models/utils/res_layer.py @@ -0,0 +1,191 @@ +from ..bricks.conv import build_conv_layer +from ..bricks.norm import build_norm_layer +# from ..bricks import build_conv_layer, build_norm_layer +from mmcv.models.backbones.base_module import BaseModule, Sequential +from torch import nn as nn + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. Default: True + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + downsample_first=True, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + + else: # downsample_first=False is for HourglassModule + for _ in range(num_blocks - 1): + layers.append( + block( + inplanes=inplanes, + planes=inplanes, + stride=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super(ResLayer, self).__init__(*layers) + + +class SimplifiedBasicBlock(BaseModule): + """Simplified version of original basic residual block. This is used in + `SCNet `_. 
+ + - Norm layer is now optional + - Last ReLU in forward function is removed + """ + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None, + init_fg=None): + super(SimplifiedBasicBlock, self).__init__(init_fg) + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + assert not with_cp, 'Not implemented yet.' + self.with_norm = norm_cfg is not None + with_bias = True if norm_cfg is None else False + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=with_bias) + if self.with_norm: + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, planes, postfix=1) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=with_bias) + if self.with_norm: + self.norm2_name, norm2 = build_norm_layer( + norm_cfg, planes, postfix=2) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) if self.with_norm else None + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) if self.with_norm else None + + def forward(self, x): + """Forward function.""" + + identity = x + + out = self.conv1(x) + if self.with_norm: + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + if self.with_norm: + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out diff --git a/mmcv/models/utils/transformer.py b/mmcv/models/utils/transformer.py new file mode 100644 index 0000000..7d6c0c3 --- /dev/null +++ b/mmcv/models/utils/transformer.py @@ -0,0 +1,800 @@ +import math +import warnings + +import torch +import torch.nn as nn +from ..bricks.activation import build_activation_layer +from ..bricks.norm import build_norm_layer +from .weight_init import xavier_init +# from mmcv.models import build_activation_layer, build_norm_layer, xavier_init +from mmcv.models.bricks.registry import (TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE) +from mmcv.models.bricks.transformer import (BaseTransformerLayer, + TransformerLayerSequence, + build_transformer_layer_sequence) +from mmcv.models.backbones.base_module import BaseModule +from torch.nn.init import normal_ + +from mmcv.models.utils.builder import TRANSFORMER + +from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention + +try: + from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention + +except ImportError: + warnings.warn( + '`MultiScaleDeformableAttention` in MMCV has been moved to ' + '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') + from ..bricks.transformer import MultiScaleDeformableAttention + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. 
+ """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +@TRANSFORMER_LAYER.register_module() +class DetrTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__(self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + ffn_num_fcs=2, + with_cp=True, + **kwargs): + super(DetrTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs) + assert len(operation_order) == 6 + assert set(operation_order) == set( + ['self_attn', 'norm', 'cross_attn', 'ffn']) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. + + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs): + super(DetrTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer( + post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + assert not self.pre_norm, f'Use prenorm in ' \ + f'{self.__class__.__name__},' \ + f'Please specify post_norm_cfg' + self.post_norm = None + + def forward(self, *args, **kwargs): + """Forward function for `TransformerCoder`. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(DetrTransformerEncoder, self).forward(*args, **kwargs) + if self.post_norm is not None: + x = self.post_norm(x) + return x + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, + *args, + post_norm_cfg=dict(type='LN'), + return_intermediate=False, + **kwargs): + + super(DetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, + self.embed_dims)[1] + else: + self.post_norm = None + + def forward(self, query, *args, **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. 
+ + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. + """ + if not self.return_intermediate: + x = super().forward(query, *args, **kwargs) + if self.post_norm: + x = self.post_norm(x)[None] + return x + + intermediate = [] + for layer in self.layers: + query = layer(query, *args, **kwargs) + if self.return_intermediate: + if self.post_norm is not None: + intermediate.append(self.post_norm(query)) + else: + intermediate.append(query) + return torch.stack(intermediate) + + +@TRANSFORMER.register_module() +class Transformer(BaseModule): + """Implements the DETR transformer. + + Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + + See `paper: End-to-End Object Detection with Transformers + `_ for details. + + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. + """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None): + super(Transformer, self).__init__(init_cfg=init_cfg) + self.encoder = build_transformer_layer_sequence(encoder) + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.encoder.embed_dims + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, 'weight') and m.weight.dim() > 1: + xavier_init(m, distribution='uniform') + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed): + """Forward function for `Transformer`. + + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. 
+ """ + bs, c, h, w = x.shape + # use `view` instead of `flatten` for dynamically exporting to ONNX + x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c] + pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat( + 1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] + mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w] + memory = self.encoder( + query=x, + key=None, + value=None, + query_pos=pos_embed, + query_key_padding_mask=mask) + target = torch.zeros_like(query_embed) + # out_dec: [num_layers, num_query, bs, dim] + out_dec = self.decoder( + query=target, + key=memory, + value=memory, + key_pos=pos_embed, + query_pos=query_embed, + key_padding_mask=mask) + out_dec = out_dec.transpose(1, 2) + memory = memory.permute(1, 2, 0).reshape(bs, c, h, w) + return out_dec, memory + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class DeformableDetrTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + + Args: + return_intermediate (bool): Whether to return intermediate outputs. + coder_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, return_intermediate=False, **kwargs): + + super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + + def forward(self, + query, + *args, + reference_points=None, + valid_ratios=None, + reg_branches=None, + **kwargs): + """Forward function for `TransformerDecoder`. + + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + reference_points (Tensor): The reference + points of offset. has shape + (bs, num_query, 4) when as_two_stage, + otherwise has shape ((bs, num_query, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + reg_branch: (obj:`nn.ModuleList`): Used for + refining the regression results. Only would + be passed when with_box_refine is True, + otherwise would be passed a `None`. + + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
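+
+        Example:
+            >>> # Sketch of the per-layer refinement applied below when
+            >>> # ``reg_branches`` is given and the references are 4-d boxes:
+            >>> ref = torch.tensor([[0.3, 0.7, 0.2, 0.2]])   # normalized references
+            >>> tmp = torch.tensor([[0.1, -0.2, 0.0, 0.0]])  # reg_branches[lid] output
+            >>> new_ref = (tmp + inverse_sigmoid(ref)).sigmoid()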
+ """ + output = query + intermediate = [] + intermediate_reference_points = [] + for lid, layer in enumerate(self.layers): + if reference_points.shape[-1] == 4: + reference_points_input = reference_points[:, :, None] * \ + torch.cat([valid_ratios, valid_ratios], -1)[:, None] + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * \ + valid_ratios[:, None] + output = layer( + output, + *args, + reference_points=reference_points_input, + **kwargs) + output = output.permute(1, 0, 2) + + if reg_branches is not None: + tmp = reg_branches[lid](output) + if reference_points.shape[-1] == 4: + new_reference_points = tmp + inverse_sigmoid( + reference_points) + new_reference_points = new_reference_points.sigmoid() + else: + assert reference_points.shape[-1] == 2 + new_reference_points = tmp + new_reference_points[..., :2] = tmp[ + ..., :2] + inverse_sigmoid(reference_points) + new_reference_points = new_reference_points.sigmoid() + reference_points = new_reference_points.detach() + + output = output.permute(1, 0, 2) + if self.return_intermediate: + intermediate.append(output) + intermediate_reference_points.append(reference_points) + + if self.return_intermediate: + return torch.stack(intermediate), torch.stack( + intermediate_reference_points) + + return output, reference_points + + +@TRANSFORMER.register_module() +class DeformableDetrTransformer(Transformer): + """Implements the DeformableDETR transformer. + + Args: + as_two_stage (bool): Generate query from encoder features. + Default: False. + num_feature_levels (int): Number of feature maps from FPN: + Default: 4. + two_stage_num_proposals (int): Number of proposals when set + `as_two_stage` as True. Default: 300. + """ + + def __init__(self, + as_two_stage=False, + num_feature_levels=4, + two_stage_num_proposals=300, + **kwargs): + super(DeformableDetrTransformer, self).__init__(**kwargs) + self.as_two_stage = as_two_stage + self.num_feature_levels = num_feature_levels + self.two_stage_num_proposals = two_stage_num_proposals + self.embed_dims = self.encoder.embed_dims + self.init_layers() + + def init_layers(self): + """Initialize layers of the DeformableDetrTransformer.""" + self.level_embeds = nn.Parameter( + torch.Tensor(self.num_feature_levels, self.embed_dims)) + + if self.as_two_stage: + self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) + self.enc_output_norm = nn.LayerNorm(self.embed_dims) + self.pos_trans = nn.Linear(self.embed_dims * 2, + self.embed_dims * 2) + self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2) + else: + self.reference_points = nn.Linear(self.embed_dims, 2) + + def init_weights(self): + """Initialize the transformer weights.""" + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MultiScaleDeformableAttention): + m.init_weights() + if not self.as_two_stage: + xavier_init(self.reference_points, distribution='uniform', bias=0.) + normal_(self.level_embeds) + + def gen_encoder_output_proposals(self, memory, memory_padding_mask, + spatial_shapes): + """Generate proposals from encoded memory. + + Args: + memory (Tensor) : The output of encoder, + has shape (bs, num_key, embed_dim). num_key is + equal the number of points on feature map from + all level. + memory_padding_mask (Tensor): Padding mask for memory. + has shape (bs, num_key). + spatial_shapes (Tensor): The shape of all feature maps. + has shape (num_level, 2). + + Returns: + tuple: A tuple of feature map and bbox prediction. 
+ + - output_memory (Tensor): The input of decoder, \ + has shape (bs, num_key, embed_dim). num_key is \ + equal the number of points on feature map from \ + all levels. + - output_proposals (Tensor): The normalized proposal \ + after a inverse sigmoid, has shape \ + (bs, num_keys, 4). + """ + + N, S, C = memory.shape + proposals = [] + _cur = 0 + for lvl, (H, W) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view( + N, H, W, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = torch.meshgrid( + torch.linspace( + 0, H - 1, H, dtype=torch.float32, device=memory.device), + torch.linspace( + 0, W - 1, W, dtype=torch.float32, device=memory.device)) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_W.unsqueeze(-1), + valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + proposal = torch.cat((grid, wh), -1).view(N, -1, 4) + proposals.append(proposal) + _cur += (H * W) + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & + (output_proposals < 0.99)).all( + -1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float('inf')) + output_proposals = output_proposals.masked_fill( + ~output_proposals_valid, float('inf')) + + output_memory = memory + output_memory = output_memory.masked_fill( + memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, + float(0)) + output_memory = self.enc_output_norm(self.enc_output(output_memory)) + return output_memory, output_proposals + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """Get the reference points used in decoder. + + Args: + spatial_shapes (Tensor): The shape of all + feature maps, has shape (num_level, 2). + valid_ratios (Tensor): The radios of valid + points on the feature map, has shape + (bs, num_levels, 2) + device (obj:`device`): The device where + reference_points should be. + + Returns: + Tensor: reference points used in decoder, has \ + shape (bs, num_keys, num_levels, 2). 
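+
+        Example:
+            >>> # Shape sketch only: two levels (4x4 and 2x2), batch size 1,
+            >>> # fully valid feature maps.
+            >>> spatial_shapes = torch.tensor([[4, 4], [2, 2]])
+            >>> valid_ratios = torch.ones(1, 2, 2)
+            >>> ref = DeformableDetrTransformer.get_reference_points(
+            ...     spatial_shapes, valid_ratios, device='cpu')
+            >>> ref.shape  # (bs, 16 + 4 keys, num_levels, 2)
+            torch.Size([1, 20, 2, 2])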
+ """ + reference_points_list = [] + for lvl, (H, W) in enumerate(spatial_shapes): + # TODO check this 0.5 + ref_y, ref_x = torch.meshgrid( + torch.linspace( + 0.5, H - 0.5, H, dtype=torch.float32, device=device), + torch.linspace( + 0.5, W - 0.5, W, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 1] * H) + ref_x = ref_x.reshape(-1)[None] / ( + valid_ratios[:, None, lvl, 0] * W) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def get_valid_ratio(self, mask): + """Get the valid radios of feature maps of all level.""" + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def get_proposal_pos_embed(self, + proposals, + num_pos_feats=128, + temperature=10000): + """Get the position embedding of proposal.""" + scale = 2 * math.pi + dim_t = torch.arange( + num_pos_feats, dtype=torch.float32, device=proposals.device) + dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) + # N, L, 4 + proposals = proposals.sigmoid() * scale + # N, L, 4, 128 + pos = proposals[:, :, :, None] / dim_t + # N, L, 4, 64, 2 + pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), + dim=4).flatten(2) + return pos + + def forward(self, + mlvl_feats, + mlvl_masks, + query_embed, + mlvl_pos_embeds, + reg_branches=None, + cls_branches=None, + **kwargs): + """Forward function for `Transformer`. + + Args: + mlvl_feats (list(Tensor)): Input queries from + different level. Each element has shape + [bs, embed_dims, h, w]. + mlvl_masks (list(Tensor)): The key_padding_mask from + different level used for encoder and decoder, + each element has shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, + with shape [num_query, c]. + mlvl_pos_embeds (list(Tensor)): The positional encoding + of feats from different level, has the shape + [bs, embed_dims, h, w]. + reg_branches (obj:`nn.ModuleList`): Regression heads for + feature maps from each decoder layer. Only would + be passed when + `with_box_refine` is True. Default to None. + cls_branches (obj:`nn.ModuleList`): Classification heads + for feature maps from each decoder layer. Only would + be passed when `as_two_stage` + is True. Default to None. + + + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + + - inter_states: Outputs from decoder. If + return_intermediate_dec is True output has shape \ + (num_dec_layers, bs, num_query, embed_dims), else has \ + shape (1, bs, num_query, embed_dims). + - init_reference_out: The initial value of reference \ + points, has shape (bs, num_queries, 4). + - inter_references_out: The internal value of reference \ + points in decoder, has shape \ + (num_dec_layers, bs,num_query, embed_dims) + - enc_outputs_class: The classification score of \ + proposals generated from \ + encoder's feature maps, has shape \ + (batch, h*w, num_classes). \ + Only would be returned when `as_two_stage` is True, \ + otherwise None. + - enc_outputs_coord_unact: The regression results \ + generated from encoder's feature maps., has shape \ + (batch, h*w, 4). Only would \ + be returned when `as_two_stage` is True, \ + otherwise None. 
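+
+        Example:
+            >>> # Illustrative bookkeeping for two levels (8x8 and 4x4), mirroring
+            >>> # the ``level_start_index`` computation in the body below:
+            >>> spatial_shapes = torch.tensor([[8, 8], [4, 4]])
+            >>> torch.cat((spatial_shapes.new_zeros((1, )),
+            ...            spatial_shapes.prod(1).cumsum(0)[:-1]))
+            tensor([ 0, 64])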
+ """ + assert self.as_two_stage or query_embed is not None + + feat_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (feat, mask, pos_embed) in enumerate( + zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): + bs, c, h, w = feat.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + feat = feat.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + feat_flatten.append(feat) + mask_flatten.append(mask) + feat_flatten = torch.cat(feat_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=feat_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros( + (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack( + [self.get_valid_ratio(m) for m in mlvl_masks], 1) + + reference_points = \ + self.get_reference_points(spatial_shapes, + valid_ratios, + device=feat.device) + + feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) + lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute( + 1, 0, 2) # (H*W, bs, embed_dims) + memory = self.encoder( + query=feat_flatten, + key=None, + value=None, + query_pos=lvl_pos_embed_flatten, + query_key_padding_mask=mask_flatten, + spatial_shapes=spatial_shapes, + reference_points=reference_points, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + **kwargs) + + memory = memory.permute(1, 0, 2) + bs, _, c = memory.shape + if self.as_two_stage: + output_memory, output_proposals = \ + self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes) + enc_outputs_class = cls_branches[self.decoder.num_layers]( + output_memory) + enc_outputs_coord_unact = \ + reg_branches[ + self.decoder.num_layers](output_memory) + output_proposals + + topk = self.two_stage_num_proposals + topk_proposals = torch.topk( + enc_outputs_class[..., 0], topk, dim=1)[1] + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, + topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = topk_coords_unact.detach() + reference_points = topk_coords_unact.sigmoid() + init_reference_out = reference_points + pos_trans_out = self.pos_trans_norm( + self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) + query_pos, query = torch.split(pos_trans_out, c, dim=2) + else: + query_pos, query = torch.split(query_embed, c, dim=1) + query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1) + query = query.unsqueeze(0).expand(bs, -1, -1) + reference_points = self.reference_points(query_pos).sigmoid() + init_reference_out = reference_points + + # decoder + query = query.permute(1, 0, 2) + memory = memory.permute(1, 0, 2) + query_pos = query_pos.permute(1, 0, 2) + inter_states, inter_references = self.decoder( + query=query, + key=None, + value=memory, + query_pos=query_pos, + key_padding_mask=mask_flatten, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + valid_ratios=valid_ratios, + reg_branches=reg_branches, + **kwargs) + + inter_references_out = inter_references + if self.as_two_stage: + return inter_states, init_reference_out,\ + inter_references_out, enc_outputs_class,\ + enc_outputs_coord_unact + return inter_states, init_reference_out, \ + inter_references_out, None, None + + 
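+
+# Usage sketch (illustrative only; the concrete encoder/decoder settings live in
+# the model configs, not in this file): the transformers above are normally
+# instantiated from config dicts through the registries, e.g.
+#     TRANSFORMER.build(
+#         dict(type='DeformableDetrTransformer',
+#              encoder=dict(type='DetrTransformerEncoder', ...),
+#              decoder=dict(type='DeformableDetrTransformerDecoder', ...)))
+
+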
+@TRANSFORMER.register_module() +class DynamicConv(BaseModule): + """Implements Dynamic Convolution. + + This module generate parameters for each sample and + use bmm to implement 1*1 convolution. Code is modified + from the `official github repo `_ . + + Args: + in_channels (int): The input feature channel. + Defaults to 256. + feat_channels (int): The inner feature channel. + Defaults to 64. + out_channels (int, optional): The output feature channel. + When not specified, it will be set to `in_channels` + by default + input_feat_shape (int): The shape of input feature. + Defaults to 7. + act_cfg (dict): The activation config for DynamicConv. + norm_cfg (dict): Config dict for normalization layer. Default + layer normalization. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=256, + feat_channels=64, + out_channels=None, + input_feat_shape=7, + act_cfg=dict(type='ReLU', inplace=True), + norm_cfg=dict(type='LN'), + init_cfg=None): + super(DynamicConv, self).__init__(init_cfg) + self.in_channels = in_channels + self.feat_channels = feat_channels + self.out_channels_raw = out_channels + self.input_feat_shape = input_feat_shape + self.act_cfg = act_cfg + self.norm_cfg = norm_cfg + self.out_channels = out_channels if out_channels else in_channels + + self.num_params_in = self.in_channels * self.feat_channels + self.num_params_out = self.out_channels * self.feat_channels + self.dynamic_layer = nn.Linear( + self.in_channels, self.num_params_in + self.num_params_out) + + self.norm_in = build_norm_layer(norm_cfg, self.feat_channels)[1] + self.norm_out = build_norm_layer(norm_cfg, self.out_channels)[1] + + self.activation = build_activation_layer(act_cfg) + + num_output = self.out_channels * input_feat_shape**2 + self.fc_layer = nn.Linear(num_output, self.out_channels) + self.fc_norm = build_norm_layer(norm_cfg, self.out_channels)[1] + + def forward(self, param_feature, input_feature): + """Forward function for `DynamicConv`. + + Args: + param_feature (Tensor): The feature can be used + to generate the parameter, has shape + (num_all_proposals, in_channels). + input_feature (Tensor): Feature that + interact with parameters, has shape + (num_all_proposals, in_channels, H, W). + + Returns: + Tensor: The output feature has shape + (num_all_proposals, out_channels). 
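+
+        Example:
+            >>> # Shape sketch with the default 256/64/7 configuration:
+            >>> layer = DynamicConv()
+            >>> param_feature = torch.rand(100, 256)        # one vector per proposal
+            >>> input_feature = torch.rand(100, 256, 7, 7)  # pooled RoI features
+            >>> layer(param_feature, input_feature).shape
+            torch.Size([100, 256])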
+ """ + num_proposals = param_feature.size(0) + input_feature = input_feature.view(num_proposals, self.in_channels, + -1).permute(2, 0, 1) + + input_feature = input_feature.permute(1, 0, 2) + parameters = self.dynamic_layer(param_feature) + + param_in = parameters[:, :self.num_params_in].view( + -1, self.in_channels, self.feat_channels) + param_out = parameters[:, -self.num_params_out:].view( + -1, self.feat_channels, self.out_channels) + + # input_feature has shape (num_all_proposals, H*W, in_channels) + # param_in has shape (num_all_proposals, in_channels, feat_channels) + # feature has shape (num_all_proposals, H*W, feat_channels) + features = torch.bmm(input_feature, param_in) + features = self.norm_in(features) + features = self.activation(features) + + # param_out has shape (batch_size, feat_channels, out_channels) + features = torch.bmm(features, param_out) + features = self.norm_out(features) + features = self.activation(features) + + features = features.flatten(1) + features = self.fc_layer(features) + features = self.fc_norm(features) + features = self.activation(features) + + return features diff --git a/mmcv/models/utils/weight_init.py b/mmcv/models/utils/weight_init.py new file mode 100644 index 0000000..c347f29 --- /dev/null +++ b/mmcv/models/utils/weight_init.py @@ -0,0 +1,683 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from mmcv.utils import Registry, build_from_cfg, get_logger, print_log + +INITIALIZERS = Registry('initializer') + + +def update_init_info(module, init_info): + """Update the `_params_init_info` in the module if the value of parameters + are changed. + + Args: + module (obj:`nn.Module`): The module of PyTorch with a user-defined + attribute `_params_init_info` which records the initialization + information. + init_info (str): The string that describes the initialization. + """ + assert hasattr( + module, + '_params_init_info'), f'Can not find `_params_init_info` in {module}' + for name, param in module.named_parameters(): + + assert param in module._params_init_info, ( + f'Find a new :obj:`Parameter` ' + f'named `{name}` during executing the ' + f'`init_weights` of ' + f'`{module.__class__.__name__}`. ' + f'Please do not add or ' + f'replace parameters during executing ' + f'the `init_weights`. 
') + + # The parameter has been changed during executing the + # `init_weights` of module + mean_value = param.data.mean() + if module._params_init_info[param]['tmp_mean_value'] != mean_value: + module._params_init_info[param]['init_info'] = init_info + module._params_init_info[param]['tmp_mean_value'] = mean_value + + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def trunc_normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + bias: float = 0) -> None: + if hasattr(module, 'weight') and module.weight is not None: + trunc_normal_(module.weight, mean, std, a, b) # type: ignore + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) # type: ignore + + +def uniform_init(module, a=0, b=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init( + module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +def _get_bases_name(m): + return [b.__name__ for b in m.__class__.__bases__] + + +class BaseInit(object): + + def __init__(self, *, bias=0, bias_prob=None, layer=None): + self.wholemodule = False + if not isinstance(bias, (int, float)): + raise TypeError(f'bias must be a number, but got a {type(bias)}') + + if bias_prob is not None: + if not isinstance(bias_prob, float): + raise TypeError(f'bias_prob type must be float, \ + but got {type(bias_prob)}') + + if layer is not None: + if not isinstance(layer, (str, list)): + raise TypeError(f'layer must be a str or a list of str, \ + but got a {type(layer)}') + else: + layer = [] + + if bias_prob is not None: + self.bias 
= bias_init_with_prob(bias_prob) + else: + self.bias = bias + self.layer = [layer] if isinstance(layer, str) else layer + + def _get_init_info(self): + info = f'{self.__class__.__name__}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Constant') +class ConstantInit(BaseInit): + """Initialize module parameters with constant values. + + Args: + val (int | float): the value to fill the weights in the module with + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, val, **kwargs): + super().__init__(**kwargs) + self.val = val + + def __call__(self, module): + + def init(m): + if self.wholemodule: + constant_init(m, self.val, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + constant_init(m, self.val, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Xavier') +class XavierInit(BaseInit): + r"""Initialize module parameters with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks - Glorot, X. & Bengio, Y. (2010). + `_ + + Args: + gain (int | float): an optional scaling factor. Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` + or ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, gain=1, distribution='normal', **kwargs): + super().__init__(**kwargs) + self.gain = gain + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + xavier_init(m, self.gain, self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + xavier_init(m, self.gain, self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: gain={self.gain}, ' \ + f'distribution={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Normal') +class NormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + mean (int | float):the mean of the normal distribution. Defaults to 0. + std (int | float): the standard deviation of the normal distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
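+
+    Example:
+        >>> # Illustrative config: N(0, 0.01) weights for every Conv2d layer and
+        >>> # the bias derived from a prior probability via bias_init_with_prob.
+        >>> init_cfg = dict(type='Normal', layer='Conv2d', std=0.01,
+        >>>                 bias_prob=0.01)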
+ + """ + + def __init__(self, mean=0, std=1, **kwargs): + super().__init__(**kwargs) + self.mean = mean + self.std = std + + def __call__(self, module): + + def init(m): + if self.wholemodule: + normal_init(m, self.mean, self.std, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + normal_init(m, self.mean, self.std, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: mean={self.mean},' \ + f' std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='TruncNormal') +class TruncNormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values + outside :math:`[a, b]`. + + Args: + mean (float): the mean of the normal distribution. Defaults to 0. + std (float): the standard deviation of the normal distribution. + Defaults to 1. + a (float): The minimum cutoff value. + b ( float): The maximum cutoff value. + bias (float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + + """ + + def __init__(self, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + **kwargs) -> None: + super().__init__(**kwargs) + self.mean = mean + self.std = std + self.a = a + self.b = b + + def __call__(self, module: nn.Module) -> None: + + def init(m): + if self.wholemodule: + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ + f' mean={self.mean}, std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Uniform') +class UniformInit(BaseInit): + r"""Initialize module parameters with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + a (int | float): the lower bound of the uniform distribution. + Defaults to 0. + b (int | float): the upper bound of the uniform distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, a=0, b=1, **kwargs): + super().__init__(**kwargs) + self.a = a + self.b = b + + def __call__(self, module): + + def init(m): + if self.wholemodule: + uniform_init(m, self.a, self.b, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + uniform_init(m, self.a, self.b, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a},' \ + f' b={self.b}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Kaiming') +class KaimingInit(BaseInit): + r"""Initialize module parameters with the values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015). + `_ + + Args: + a (int | float): the negative slope of the rectifier used after this + layer (only used with ``'leaky_relu'``). Defaults to 0. + mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing + ``'fan_in'`` preserves the magnitude of the variance of the weights + in the forward pass. Choosing ``'fan_out'`` preserves the + magnitudes in the backwards pass. Defaults to ``'fan_out'``. + nonlinearity (str): the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` . + Defaults to 'relu'. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` or + ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, + a=0, + mode='fan_out', + nonlinearity='relu', + distribution='normal', + **kwargs): + super().__init__(**kwargs) + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ + f'nonlinearity={self.nonlinearity}, ' \ + f'distribution ={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Caffe2Xavier') +class Caffe2XavierInit(KaimingInit): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + def __init__(self, **kwargs): + super().__init__( + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform', + **kwargs) + + def __call__(self, module): + super().__call__(module) + + +@INITIALIZERS.register_module(name='Pretrained') +class PretrainedInit(object): + """Initialize module by loading a pretrained model. + + Args: + checkpoint (str): the checkpoint file of the pretrained model should + be load. + prefix (str, optional): the prefix of a sub-module in the pretrained + model. 
it is for loading a part of the pretrained model to + initialize. For example, if we would like to only load the + backbone of a detector model, we can set ``prefix='backbone.'``. + Defaults to None. + map_location (str): map tensors into proper locations. + """ + + def __init__(self, checkpoint, prefix=None, map_location=None): + self.checkpoint = checkpoint + self.prefix = prefix + self.map_location = map_location + + def __call__(self, module): + from mmcv.utils import load_checkpoint + logger = get_logger('mmcv') + if self.prefix is None: + print_log(f'load model from: {self.checkpoint}', logger=logger) + load_checkpoint( + module, + self.checkpoint, + map_location=self.map_location, + strict=False, + logger=logger) + else: + print_log( + f'load {self.prefix} in model from: {self.checkpoint}', + logger=logger) + state_dict = _load_checkpoint_with_prefix( + self.prefix, self.checkpoint, map_location=self.map_location) + load_state_dict(module, state_dict, strict=False, logger=logger) + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: load from {self.checkpoint}' + return info + + +def _initialize(module, cfg, wholemodule=False): + func = build_from_cfg(cfg, INITIALIZERS) + # wholemodule flag is for override mode, there is no layer key in override + # and initializer will give init values for the whole module with the name + # in override. + func.wholemodule = wholemodule + func(module) + + +def _initialize_override(module, override, cfg): + if not isinstance(override, (dict, list)): + raise TypeError(f'override must be a dict or a list of dict, \ + but got {type(override)}') + + override = [override] if isinstance(override, dict) else override + + for override_ in override: + + cp_override = copy.deepcopy(override_) + name = cp_override.pop('name', None) + if name is None: + raise ValueError('`override` must contain the key "name",' + f'but got {cp_override}') + # if override only has name key, it means use args in init_cfg + if not cp_override: + cp_override.update(cfg) + # if override has name key and other args except type key, it will + # raise error + elif 'type' not in cp_override.keys(): + raise ValueError( + f'`override` need "type" key, but got {cp_override}') + + if hasattr(module, name): + _initialize(getattr(module, name), cp_override, wholemodule=True) + else: + raise RuntimeError(f'module did not have attribute {name}, ' + f'but init_cfg is {cp_override}.') + + +def initialize(module, init_cfg): + """Initialize a module. + + Args: + module (``torch.nn.Module``): the module will be initialized. + init_cfg (dict | list[dict]): initialization configuration dict to + define initializer. OpenMMLab has implemented 6 initializers + including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, + ``Kaiming``, and ``Pretrained``. 
+ Example: + >>> module = nn.Linear(2, 3, bias=True) + >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) + >>> initialize(module, init_cfg) + + >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + >>> # define key ``'layer'`` for initializing layer with different + >>> # configuration + >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Linear', val=2)] + >>> initialize(module, init_cfg) + + >>> # define key``'override'`` to initialize some specific part in + >>> # module + >>> class FooNet(nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.feat = nn.Conv2d(3, 16, 3) + >>> self.reg = nn.Conv2d(16, 10, 3) + >>> self.cls = nn.Conv2d(16, 5, 3) + >>> model = FooNet() + >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', + >>> override=dict(type='Constant', name='reg', val=3, bias=4)) + >>> initialize(model, init_cfg) + + >>> model = ResNet(depth=50) + >>> # Initialize weights with the pretrained model. + >>> init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + >>> initialize(model, init_cfg) + + >>> # Initialize weights of a sub-module with the specific part of + >>> # a pretrained model by using "prefix". + >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + >>> 'retinanet_r50_fpn_1x_coco/'\ + >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + >>> init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + """ + if not isinstance(init_cfg, (dict, list)): + raise TypeError(f'init_cfg must be a dict or a list of dict, \ + but got {type(init_cfg)}') + + if isinstance(init_cfg, dict): + init_cfg = [init_cfg] + + for cfg in init_cfg: + # should deeply copy the original config because cfg may be used by + # other modules, e.g., one init_cfg shared by multiple bottleneck + # blocks, the expected cfg will be changed after pop and will change + # the initialization behavior of other modules + cp_cfg = copy.deepcopy(cfg) + override = cp_cfg.pop('override', None) + _initialize(module, cp_cfg) + + if override is not None: + cp_cfg.pop('layer', None) + _initialize_override(module, override, cp_cfg) + else: + # All attributes in module have same initialization. + pass + + +def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, + b: float) -> Tensor: + # Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Modified from + # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [lower, upper], then translate + # to [2lower-1, 2upper-1]. 
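+        # (erfinv below expects inputs in (-1, 1); mapping the CDF values
+        # through 2 * u - 1 keeps the samples inside that domain.)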
+ tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Modified from + https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + + Args: + tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. + mean (float): the mean of the normal distribution. + std (float): the standard deviation of the normal distribution. + a (float): the minimum cutoff value. + b (float): the maximum cutoff value. + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/mmcv/models/vad_utils/CD_loss.py b/mmcv/models/vad_utils/CD_loss.py new file mode 100644 index 0000000..193b628 --- /dev/null +++ b/mmcv/models/vad_utils/CD_loss.py @@ -0,0 +1,710 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss + +from mmcv.models.builder import LOSSES +from mmcv.models.losses.utils import weighted_loss +import torch.nn.functional as F +from mmcv.core.bbox.match_costs.builder import MATCH_COST +import functools + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + +def custom_weight_dir_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): num_sample, num_dir + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + raise ValueError('avg_factor should not be none for OrderedPtsL1Loss') + # loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # import pdb;pdb.set_trace() + # loss = loss.permute(1,0,2,3).contiguous() + loss = loss.sum() + loss = loss / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def custom_weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. 
+ + Args: + loss (Tensor): num_sample, num_order, num_pts, num_coords + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + raise ValueError('avg_factor should not be none for OrderedPtsL1Loss') + # loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + # import pdb;pdb.set_trace() + loss = loss.permute(1,0,2,3).contiguous() + loss = loss.sum((1,2,3)) + loss = loss / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + +def custom_weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = custom_weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + + +def custom_weighted_dir_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) 
+ >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = custom_weight_dir_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper + +@custom_weighted_loss +def ordered_pts_smooth_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1) + assert pred.size() == target.size() + loss =smooth_l1_loss(pred,target, reduction='none') + # import pdb;pdb.set_trace() + return loss + +@weighted_loss +def pts_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + +@custom_weighted_loss +def ordered_pts_l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): shape [num_samples, num_pts, num_coords] + target (torch.Tensor): shape [num_samples, num_order, num_pts, num_coords] + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + pred = pred.unsqueeze(1).repeat(1, target.size(1),1,1) + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + +@custom_weighted_dir_loss +def pts_dir_cos_loss(pred, target): + """ Dir cosine similiarity loss + pred (torch.Tensor): shape [num_samples, num_dir, num_coords] + target (torch.Tensor): shape [num_samples, num_dir, num_coords] + + """ + if target.numel() == 0: + return pred.sum() * 0 + # import pdb;pdb.set_trace() + num_samples, num_dir, num_coords = pred.shape + loss_func = torch.nn.CosineEmbeddingLoss(reduction='none') + tgt_param = target.new_ones((num_samples, num_dir)) + tgt_param = tgt_param.flatten(0) + loss = loss_func(pred.flatten(0,1), target.flatten(0,1), tgt_param) + loss = loss.view(num_samples, num_dir) + return loss + +@LOSSES.register_module() +class OrderedPtsSmoothL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsSmoothL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
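+
+        Example:
+            >>> # Shape sketch: 2 samples, 4 candidate point orderings, 20 points;
+            >>> # the reduced loss keeps one entry per ordering, i.e. shape (4,).
+            >>> criterion = OrderedPtsSmoothL1Loss()
+            >>> pred = torch.rand(2, 20, 2)
+            >>> target = torch.rand(2, 4, 20, 2)
+            >>> out = criterion(pred, target, torch.ones(2, 4, 20, 2), avg_factor=4)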
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * ordered_pts_smooth_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + +@LOSSES.register_module() +class PtsDirCosLoss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsDirCosLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_dir = self.loss_weight * pts_dir_cos_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_dir + + + +@LOSSES.register_module() +class PtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(PtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@LOSSES.register_module() +class OrderedPtsL1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction='mean', loss_weight=1.0): + super(OrderedPtsL1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. 
+ target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + # import pdb;pdb.set_trace() + loss_bbox = self.loss_weight * ordered_pts_l1_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox + + + + +@MATCH_COST.register_module() +class OrderedPtsSmoothL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1).unsqueeze(1).repeat(1,num_gts*num_orders,1) + gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1).unsqueeze(0).repeat(bbox_pred.size(0),1,1) + # import pdb;pdb.set_trace() + bbox_cost = smooth_l1_loss(bbox_pred, gt_bboxes, reduction='none').sum(-1) + # bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class PtsL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1) + gt_bboxes = gt_bboxes.view(num_gts,-1) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class OrderedPtsL1Cost(object): + """OrderedPtsL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (x, y), which are all in range [0, 1]. Shape + [num_query, num_pts, 2]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x,y). + Shape [num_gt, num_ordered, num_pts, 2]. 
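+            Note:
+                Each ground truth contributes one column per point ordering,
+                so the returned cost matrix has shape
+                [num_query, num_gt * num_ordered].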
+ Returns: + torch.Tensor: bbox_cost value with weight + """ + num_gts, num_orders, num_pts, num_coords = gt_bboxes.shape + # import pdb;pdb.set_trace() + bbox_pred = bbox_pred.view(bbox_pred.size(0),-1) + gt_bboxes = gt_bboxes.flatten(2).view(num_gts*num_orders,-1) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + +@MATCH_COST.register_module() +class MyChamferDistanceCost: + def __init__(self, loss_src_weight=1., loss_dst_weight=1.): + # assert mode in ['smooth_l1', 'l1', 'l2'] + # self.mode = mode + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def __call__(self, src, dst,src_weight=1.0,dst_weight=1.0,): + """ + pred_pts (Tensor): normed coordinate(x,y), shape (num_q, num_pts_M, 2) + gt_pts (Tensor): normed coordinate(x,y), shape (num_gt, num_pts_N, 2) + """ + # criterion_mode = self.mode + # if criterion_mode == 'smooth_l1': + # criterion = smooth_l1_loss + # elif criterion_mode == 'l1': + # criterion = l1_loss + # elif criterion_mode == 'l2': + # criterion = mse_loss + # else: + # raise NotImplementedError + # import pdb;pdb.set_trace() + src_expand = src.unsqueeze(1).repeat(1,dst.shape[0],1,1) + dst_expand = dst.unsqueeze(0).repeat(src.shape[0],1,1,1) + # src_expand = src.unsqueeze(2).unsqueeze(1).repeat(1,dst.shape[0], 1, dst.shape[1], 1) + # dst_expand = dst.unsqueeze(1).unsqueeze(0).repeat(src.shape[0],1, src.shape[1], 1, 1) + distance = torch.cdist(src_expand, dst_expand) + src2dst_distance = torch.min(distance, dim=3)[0] # (num_q, num_gt, num_pts_N) + dst2src_distance = torch.min(distance, dim=2)[0] # (num_q, num_gt, num_pts_M) + loss_src = (src2dst_distance * src_weight).mean(-1) + loss_dst = (dst2src_distance * dst_weight).mean(-1) + loss = loss_src*self.loss_src_weight + loss_dst * self.loss_dst_weight + return loss + +def chamfer_distance(src, + dst, + src_weight=1.0, + dst_weight=1.0, + # criterion_mode='l1', + reduction='mean', + avg_factor=None): + """Calculate Chamfer Distance of two sets. + + Args: + src (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + dst (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor or float): Weight of source loss. + dst_weight (torch.Tensor or float): Weight of destination loss. + criterion_mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + + Returns: + tuple: Source and Destination loss with the corresponding indices. + + - loss_src (torch.Tensor): The min distance \ + from source to destination. + - loss_dst (torch.Tensor): The min distance \ + from destination to source. + - indices1 (torch.Tensor): Index the min distance point \ + for each point in source to destination. + - indices2 (torch.Tensor): Index the min distance point \ + for each point in destination to source. 
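+
+    Example (illustrative only; random point sets with B=2, N=10, M=15)::
+
+        >>> import torch
+        >>> src = torch.rand(2, 10, 2)
+        >>> dst = torch.rand(2, 15, 2)
+        >>> loss_src, loss_dst, idx1, idx2 = chamfer_distance(
+        ...     src, dst, reduction='mean')
+
+    Note that ``reduction='none'`` is only allowed together with an explicit
+    ``avg_factor``; otherwise a ValueError is raised below.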
+ """ + + # if criterion_mode == 'smooth_l1': + # criterion = smooth_l1_loss + # elif criterion_mode == 'l1': + # criterion = l1_loss + # elif criterion_mode == 'l2': + # criterion = mse_loss + # else: + # raise NotImplementedError + + # src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1) + # dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1) + # import pdb;pdb.set_trace() + distance = torch.cdist(src, dst) + src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N) + dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M) + # import pdb;pdb.set_trace() + #TODO this may be wrong for misaligned src_weight, now[N,fixed_num] + # should be [N], then view + loss_src = (src2dst_distance * src_weight) + loss_dst = (dst2src_distance * dst_weight) + if avg_factor is None: + reduction_enum = F._Reduction.get_enum(reduction) + if reduction_enum == 0: + raise ValueError('MyCDLoss can not be used with reduction=`none`') + elif reduction_enum == 1: + loss_src = loss_src.mean(-1).mean() + loss_dst = loss_dst.mean(-1).mean() + elif reduction_enum == 2: + loss_src = loss_src.mean(-1).sum() + loss_dst = loss_dst.mean(-1).sum() + else: + raise NotImplementedError + else: + if reduction == 'mean': + eps = torch.finfo(torch.float32).eps + loss_src = loss_src.mean(-1).sum() / (avg_factor + eps) + loss_dst = loss_dst.mean(-1).sum() / (avg_factor + eps) + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + + return loss_src, loss_dst, indices1, indices2 + + +@LOSSES.register_module() +class MyChamferDistance(nn.Module): + """Calculate Chamfer Distance of two sets. + + Args: + mode (str): Criterion mode to calculate distance. + The valid modes are smooth_l1, l1 or l2. + reduction (str): Method to reduce losses. + The valid reduction method are none, sum or mean. + loss_src_weight (float): Weight of loss_source. + loss_dst_weight (float): Weight of loss_target. + """ + + def __init__(self, + # mode='l1', + reduction='mean', + loss_src_weight=1.0, + loss_dst_weight=1.0): + super(MyChamferDistance, self).__init__() + + # assert mode in ['smooth_l1', 'l1', 'l2'] + assert reduction in ['none', 'sum', 'mean'] + # self.mode = mode + self.reduction = reduction + self.loss_src_weight = loss_src_weight + self.loss_dst_weight = loss_dst_weight + + def forward(self, + source, + target, + src_weight=1.0, + dst_weight=1.0, + avg_factor=None, + reduction_override=None, + return_indices=False, + **kwargs): + """Forward function of loss calculation. + + Args: + source (torch.Tensor): Source set with shape [B, N, C] to + calculate Chamfer Distance. + target (torch.Tensor): Destination set with shape [B, M, C] to + calculate Chamfer Distance. + src_weight (torch.Tensor | float, optional): + Weight of source loss. Defaults to 1.0. + dst_weight (torch.Tensor | float, optional): + Weight of destination loss. Defaults to 1.0. + reduction_override (str, optional): Method to reduce losses. + The valid reduction method are 'none', 'sum' or 'mean'. + Defaults to None. + return_indices (bool, optional): Whether to return indices. + Defaults to False. + + Returns: + tuple[torch.Tensor]: If ``return_indices=True``, return losses of \ + source and target with their corresponding indices in the \ + order of ``(loss_source, loss_target, indices1, indices2)``. \ + If ``return_indices=False``, return \ + ``(loss_source, loss_target)``. 
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + loss_source, loss_target, indices1, indices2 = chamfer_distance( + source, target, src_weight, dst_weight, reduction, + avg_factor=avg_factor) + + loss_source *= self.loss_src_weight + loss_target *= self.loss_dst_weight + + loss_pts = loss_source + loss_target + + if return_indices: + return loss_pts, indices1, indices2 + else: + return loss_pts diff --git a/mmcv/models/vad_utils/__init__.py b/mmcv/models/vad_utils/__init__.py new file mode 100644 index 0000000..126bf07 --- /dev/null +++ b/mmcv/models/vad_utils/__init__.py @@ -0,0 +1,7 @@ +from .map_utils import normalize_2d_bbox, normalize_2d_pts, denormalize_2d_bbox, denormalize_2d_pts +from .CD_loss import ( + MyChamferDistance, MyChamferDistanceCost, + OrderedPtsL1Cost, PtsL1Cost, OrderedPtsSmoothL1Cost, + OrderedPtsL1Loss, PtsL1Loss, PtsDirCosLoss +) +from .plan_loss import PlanMapBoundLoss, PlanCollisionLoss, PlanMapDirectionLoss \ No newline at end of file diff --git a/mmcv/models/vad_utils/map_utils.py b/mmcv/models/vad_utils/map_utils.py new file mode 100644 index 0000000..0b3f6b3 --- /dev/null +++ b/mmcv/models/vad_utils/map_utils.py @@ -0,0 +1,41 @@ +from mmcv.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy + +def normalize_2d_bbox(bboxes, pc_range): + + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes) + cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0] + cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1] + factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h]) + + normalized_bboxes = cxcywh_bboxes / factor + return normalized_bboxes + +def normalize_2d_pts(pts, pc_range): + patch_h = pc_range[4]-pc_range[1] + patch_w = pc_range[3]-pc_range[0] + new_pts = pts.clone() + new_pts[...,0:1] = pts[..., 0:1] - pc_range[0] + new_pts[...,1:2] = pts[...,1:2] - pc_range[1] + factor = pts.new_tensor([patch_w, patch_h]) + normalized_pts = new_pts / factor + return normalized_pts + +def denormalize_2d_bbox(bboxes, pc_range): + + bboxes = bbox_cxcywh_to_xyxy(bboxes) + bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] - + pc_range[0]) + pc_range[0]) + bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] - + pc_range[1]) + pc_range[1]) + + return bboxes + +def denormalize_2d_pts(pts, pc_range): + new_pts = pts.clone() + new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] - + pc_range[0]) + pc_range[0]) + new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] - + pc_range[1]) + pc_range[1]) + return new_pts \ No newline at end of file diff --git a/mmcv/models/vad_utils/plan_loss.py b/mmcv/models/vad_utils/plan_loss.py new file mode 100644 index 0000000..7792da8 --- /dev/null +++ b/mmcv/models/vad_utils/plan_loss.py @@ -0,0 +1,440 @@ +import math +import torch +from torch import nn as nn +from mmcv.models.losses.utils import weighted_loss +from mmcv.models.builder import LOSSES + + +@LOSSES.register_module() +class PlanMapBoundLoss(nn.Module): + """Planning constraint to push ego vehicle away from the lane boundary. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + map_thresh (float, optional): confidence threshold to filter map predictions. + lane_bound_cls_idx (float, optional): lane_boundary class index. 
+ dis_thresh (float, optional): distance threshold between ego vehicle and lane bound. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + map_thresh=0.5, + lane_bound_cls_idx=2, + dis_thresh=1.0, + point_cloud_range=[-15.0, -30.0, -2.0, 15.0, 30.0, 2.0], + perception_detach=False + ): + super(PlanMapBoundLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.map_thresh = map_thresh + self.lane_bound_cls_idx = lane_bound_cls_idx + self.dis_thresh = dis_thresh + self.pc_range = point_cloud_range + self.perception_detach = perception_detach + + def forward(self, + ego_fut_preds, + lane_preds, + lane_score_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + if self.perception_detach: + lane_preds = lane_preds.detach() + lane_score_preds = lane_score_preds.detach() + + # filter lane element according to confidence score and class + not_lane_bound_mask = lane_score_preds[..., self.lane_bound_cls_idx] < self.map_thresh + # denormalize map pts + lane_bound_preds = lane_preds.clone() + lane_bound_preds[...,0:1] = (lane_bound_preds[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + lane_bound_preds[...,1:2] = (lane_bound_preds[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + # pad not-lane-boundary cls and low confidence preds + lane_bound_preds[not_lane_bound_mask] = 1e6 + + loss_bbox = self.loss_weight * plan_map_bound_loss(ego_fut_preds, lane_bound_preds, + weight=weight, dis_thresh=self.dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@weighted_loss +def plan_map_bound_loss(pred, target, dis_thresh=1.0): + """Planning map bound constraint (L1 distance). + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): lane_bound_preds, [B, num_vec, num_pts, 2]. 
+ weight (torch.Tensor): [B, fut_ts] + + Returns: + torch.Tensor: Calculated loss [B, fut_ts] + """ + pred = pred.cumsum(dim=-2) + ego_traj_starts = pred[:, :-1, :] + ego_traj_ends = pred + B, T, _ = ego_traj_ends.size() + padding_zeros = torch.zeros((B, 1, 2), dtype=pred.dtype, device=pred.device) # initial position + ego_traj_starts = torch.cat((padding_zeros, ego_traj_starts), dim=1) + _, V, P, _ = target.size() + ego_traj_expanded = ego_traj_ends.unsqueeze(2).unsqueeze(3) # [B, T, 1, 1, 2] + maps_expanded = target.unsqueeze(1) # [1, 1, M, P, 2] + dist = torch.linalg.norm(ego_traj_expanded - maps_expanded, dim=-1) # [B, T, M, P] + dist = dist.min(dim=-1, keepdim=False)[0] + min_inst_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + bd_target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) + min_bd_insts = bd_target[batch_idxs, ts_idxs, min_inst_idxs] # [B, T, P, 2] + bd_inst_starts = min_bd_insts[:, :, :-1, :].flatten(0, 2) + bd_inst_ends = min_bd_insts[:, :, 1:, :].flatten(0, 2) + ego_traj_starts = ego_traj_starts.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) + ego_traj_ends = ego_traj_ends.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) + + intersect_mask = segments_intersect(ego_traj_starts, ego_traj_ends, + bd_inst_starts, bd_inst_ends) + intersect_mask = intersect_mask.reshape(B, T, P-1) + intersect_mask = intersect_mask.any(dim=-1) + intersect_idx = (intersect_mask == True).nonzero() + + target = target.view(target.shape[0], -1, target.shape[-1]) + # [B, fut_ts, num_vec*num_pts] + dist = torch.linalg.norm(pred[:, :, None, :] - target[:, None, :, :], dim=-1) + min_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + min_dist = dist[batch_idxs, ts_idxs, min_idxs] + loss = min_dist + safe_idx = loss > dis_thresh + unsafe_idx = loss <= dis_thresh + loss[safe_idx] = 0 + loss[unsafe_idx] = dis_thresh - loss[unsafe_idx] + + for i in range(len(intersect_idx)): + loss[intersect_idx[i, 0], intersect_idx[i, 1]:] = 0 + + return loss + + +def segments_intersect(line1_start, line1_end, line2_start, line2_end): + # Calculating the differences + dx1 = line1_end[:, 0] - line1_start[:, 0] + dy1 = line1_end[:, 1] - line1_start[:, 1] + dx2 = line2_end[:, 0] - line2_start[:, 0] + dy2 = line2_end[:, 1] - line2_start[:, 1] + + # Calculating determinants + det = dx1 * dy2 - dx2 * dy1 + det_mask = det != 0 + + # Checking if lines are parallel or coincident + parallel_mask = torch.logical_not(det_mask) + + # Calculating intersection parameters + t1 = ((line2_start[:, 0] - line1_start[:, 0]) * dy2 + - (line2_start[:, 1] - line1_start[:, 1]) * dx2) / det + t2 = ((line2_start[:, 0] - line1_start[:, 0]) * dy1 + - (line2_start[:, 1] - line1_start[:, 1]) * dx1) / det + + # Checking intersection conditions + intersect_mask = torch.logical_and( + torch.logical_and(t1 >= 0, t1 <= 1), + torch.logical_and(t2 >= 0, t2 <= 1) + ) + + # Handling parallel or coincident lines + intersect_mask[parallel_mask] = False + + return intersect_mask + + +@LOSSES.register_module() +class PlanCollisionLoss(nn.Module): + """Planning constraint to push ego vehicle away from other agents. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. 
+ agent_thresh (float, optional): confidence threshold to filter agent predictions. + x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. + y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + agent_thresh=0.5, + x_dis_thresh=1.5, + y_dis_thresh=3.0, + point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] + ): + super(PlanCollisionLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.agent_thresh = agent_thresh + self.x_dis_thresh = x_dis_thresh + self.y_dis_thresh = y_dis_thresh + self.pc_range = point_cloud_range + + def forward(self, + ego_fut_preds, + agent_preds, + agent_fut_preds, + agent_score_preds, + agent_fut_cls_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + agent_preds (Tensor): [B, num_agent, 2] + agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2] + agent_fut_cls_preds (Tensor): [B, num_agent, fut_mode] + agent_score_preds (Tensor): [B, num_agent, 10] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + # filter agent element according to confidence score + agent_max_score_preds, agent_max_score_idxs = agent_score_preds.max(dim=-1) + not_valid_agent_mask = agent_max_score_preds < self.agent_thresh + # filter low confidence preds + agent_fut_preds[not_valid_agent_mask] = 1e6 + # filter not vehicle preds + not_veh_pred_mask = agent_max_score_idxs > 4 # veh idxs are 0-4 + agent_fut_preds[not_veh_pred_mask] = 1e6 + # only use best mode pred + best_mode_idxs = torch.argmax(agent_fut_cls_preds, dim=-1).tolist() + batch_idxs = [[i] for i in range(agent_fut_cls_preds.shape[0])] + agent_num_idxs = [[i for i in range(agent_fut_cls_preds.shape[1])] for j in range(agent_fut_cls_preds.shape[0])] + agent_fut_preds = agent_fut_preds[batch_idxs, agent_num_idxs, best_mode_idxs] + + loss_bbox = self.loss_weight * plan_col_loss(ego_fut_preds, agent_preds, + agent_fut_preds=agent_fut_preds, weight=weight, + x_dis_thresh=self.x_dis_thresh, + y_dis_thresh=self.y_dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@weighted_loss +def plan_col_loss( + pred, + target, + agent_fut_preds, + x_dis_thresh=1.5, + y_dis_thresh=3.0, + dis_thresh=3.0 +): + """Planning ego-agent collsion constraint. + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): agent_preds, [B, num_agent, 2]. + agent_fut_preds (Tensor): [B, num_agent, fut_ts, 2]. + weight (torch.Tensor): [B, fut_ts, 2]. + x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. + y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. + dis_thresh (float, optional): distance threshold to filter distant agents. 
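+
+    Note:
+        ``pred`` and ``agent_fut_preds`` are treated as per-step offsets and
+        accumulated with ``cumsum`` before the distance check; agent positions
+        farther than ``dis_thresh`` from the ego position at the same timestep
+        are masked out and excluded from the collision cost.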
+ + Returns: + torch.Tensor: Calculated loss [B, fut_mode, fut_ts, 2] + """ + pred = pred.cumsum(dim=-2) + agent_fut_preds = agent_fut_preds.cumsum(dim=-2) + target = target[:, :, None, :] + agent_fut_preds + # filter distant agents from ego vehicle + dist = torch.linalg.norm(pred[:, None, :, :] - target, dim=-1) + dist_mask = dist > dis_thresh + target[dist_mask] = 1e6 + + # [B, num_agent, fut_ts] + x_dist = torch.abs(pred[:, None, :, 0] - target[..., 0]) + y_dist = torch.abs(pred[:, None, :, 1] - target[..., 1]) + x_min_idxs = torch.argmin(x_dist, dim=1).tolist() + y_min_idxs = torch.argmin(y_dist, dim=1).tolist() + batch_idxs = [[i] for i in range(y_dist.shape[0])] + ts_idxs = [[i for i in range(y_dist.shape[-1])] for j in range(y_dist.shape[0])] + + # [B, fut_ts] + x_min_dist = x_dist[batch_idxs, x_min_idxs, ts_idxs] + y_min_dist = y_dist[batch_idxs, y_min_idxs, ts_idxs] + x_loss = x_min_dist + safe_idx = x_loss > x_dis_thresh + unsafe_idx = x_loss <= x_dis_thresh + x_loss[safe_idx] = 0 + x_loss[unsafe_idx] = x_dis_thresh - x_loss[unsafe_idx] + y_loss = y_min_dist + safe_idx = y_loss > y_dis_thresh + unsafe_idx = y_loss <= y_dis_thresh + y_loss[safe_idx] = 0 + y_loss[unsafe_idx] = y_dis_thresh - y_loss[unsafe_idx] + loss = torch.cat([x_loss.unsqueeze(-1), y_loss.unsqueeze(-1)], dim=-1) + + return loss + + +@LOSSES.register_module() +class PlanMapDirectionLoss(nn.Module): + """Planning loss to force the ego heading angle consistent with lane direction. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". + loss_weight (float, optional): The weight of loss. + theta_thresh (float, optional): angle diff thresh between ego and lane. + point_cloud_range (list, optional): point cloud range. + """ + + def __init__( + self, + reduction='mean', + loss_weight=1.0, + map_thresh=0.5, + dis_thresh=2.0, + lane_div_cls_idx=0, + point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] + ): + super(PlanMapDirectionLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + self.map_thresh = map_thresh + self.dis_thresh = dis_thresh + self.lane_div_cls_idx = lane_div_cls_idx + self.pc_range = point_cloud_range + + def forward(self, + ego_fut_preds, + lane_preds, + lane_score_preds, + weight=None, + avg_factor=None, + reduction_override=None): + """Forward function. + + Args: + ego_fut_preds (Tensor): [B, fut_ts, 2] + lane_preds (Tensor): [B, num_vec, num_pts, 2] + lane_score_preds (Tensor): [B, num_vec, 3] + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. 
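+
+            Example (illustrative only; B=1, fut_ts=6, num_vec=10 and
+            num_pts=20 are arbitrary placeholder sizes)::
+
+                >>> import torch
+                >>> loss_fn = PlanMapDirectionLoss(loss_weight=1.0)
+                >>> ego_fut_preds = torch.rand(1, 6, 2)
+                >>> lane_preds = torch.rand(1, 10, 20, 2)
+                >>> lane_score_preds = torch.rand(1, 10, 3)
+                >>> loss = loss_fn(ego_fut_preds, lane_preds, lane_score_preds)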
+ """ + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + + # filter lane element according to confidence score and class + not_lane_div_mask = lane_score_preds[..., self.lane_div_cls_idx] < self.map_thresh + # denormalize map pts + lane_div_preds = lane_preds.clone() + lane_div_preds[...,0:1] = (lane_div_preds[..., 0:1] * (self.pc_range[3] - + self.pc_range[0]) + self.pc_range[0]) + lane_div_preds[...,1:2] = (lane_div_preds[..., 1:2] * (self.pc_range[4] - + self.pc_range[1]) + self.pc_range[1]) + # pad not-lane-divider cls and low confidence preds + lane_div_preds[not_lane_div_mask] = 1e6 + + loss_bbox = self.loss_weight * plan_map_dir_loss(ego_fut_preds, lane_div_preds, + weight=weight, dis_thresh=self.dis_thresh, + reduction=reduction, avg_factor=avg_factor) + return loss_bbox + +@weighted_loss +def plan_map_dir_loss(pred, target, dis_thresh=2.0): + """Planning ego-map directional loss. + + Args: + pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. + target (torch.Tensor): lane_div_preds, [B, num_vec, num_pts, 2]. + weight (torch.Tensor): [B, fut_ts] + + Returns: + torch.Tensor: Calculated loss [B, fut_ts] + """ + num_map_pts = target.shape[2] + pred = pred.cumsum(dim=-2) + traj_dis = torch.linalg.norm(pred[:, -1, :] - pred[:, 0, :], dim=-1) + static_mask = traj_dis < 1.0 + target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) + + # find the closest map instance for ego at each timestamp + dist = torch.linalg.norm(pred[:, :, None, None, :] - target, dim=-1) + dist = dist.min(dim=-1, keepdim=False)[0] + min_inst_idxs = torch.argmin(dist, dim=-1).tolist() + batch_idxs = [[i] for i in range(dist.shape[0])] + ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] + target_map_inst = target[batch_idxs, ts_idxs, min_inst_idxs] # [B, fut_ts, num_pts, 2] + + # calculate distance + dist = torch.linalg.norm(pred[:, :, None, :] - target_map_inst, dim=-1) + min_pts_idxs = torch.argmin(dist, dim=-1) + min_pts_next_idxs = min_pts_idxs.clone() + is_end_point = (min_pts_next_idxs == num_map_pts-1) + not_end_point = (min_pts_next_idxs != num_map_pts-1) + min_pts_next_idxs[is_end_point] = num_map_pts - 2 + min_pts_next_idxs[not_end_point] = min_pts_next_idxs[not_end_point] + 1 + min_pts_idxs = min_pts_idxs.tolist() + min_pts_next_idxs = min_pts_next_idxs.tolist() + traj_yaw = torch.atan2(torch.diff(pred[..., 1]), torch.diff(pred[..., 0])) # [B, fut_ts-1] + # last ts yaw assume same as previous + traj_yaw = torch.cat([traj_yaw, traj_yaw[:, [-1]]], dim=-1) # [B, fut_ts] + min_pts = target_map_inst[batch_idxs, ts_idxs, min_pts_idxs] + dist = torch.linalg.norm(min_pts - pred, dim=-1) + dist_mask = dist > dis_thresh + min_pts = min_pts.unsqueeze(2) + min_pts_next = target_map_inst[batch_idxs, ts_idxs, min_pts_next_idxs].unsqueeze(2) + map_pts = torch.cat([min_pts, min_pts_next], dim=2) + lane_yaw = torch.atan2(torch.diff(map_pts[..., 1]).squeeze(-1), torch.diff(map_pts[..., 0]).squeeze(-1)) # [B, fut_ts] + yaw_diff = traj_yaw - lane_yaw + yaw_diff[yaw_diff > math.pi] = yaw_diff[yaw_diff > math.pi] - math.pi + yaw_diff[yaw_diff > math.pi/2] = yaw_diff[yaw_diff > math.pi/2] - math.pi + yaw_diff[yaw_diff < -math.pi] = yaw_diff[yaw_diff < -math.pi] + math.pi + yaw_diff[yaw_diff < -math.pi/2] = yaw_diff[yaw_diff < -math.pi/2] + math.pi + yaw_diff[dist_mask] = 0 # loss = 0 if no lane around ego + yaw_diff[static_mask] = 0 # loss = 0 if ego is static + + loss = torch.abs(yaw_diff) + + return loss 
# [B, fut_ts] diff --git a/mmcv/models/vad_utils/traj_lr_warmup.py b/mmcv/models/vad_utils/traj_lr_warmup.py new file mode 100644 index 0000000..0b5ba1b --- /dev/null +++ b/mmcv/models/vad_utils/traj_lr_warmup.py @@ -0,0 +1,13 @@ +import torch + +def get_traj_warmup_loss_weight( + cur_epoch, + tot_epoch, + start_pos=0.3, + end_pos=0.35, + scale_weight=1.1 +): + epoch_percentage = cur_epoch / tot_epoch + sigmoid_input = 5 / (end_pos-start_pos) * epoch_percentage - 2.5 * (end_pos+start_pos) / (end_pos - start_pos) + + return scale_weight * torch.sigmoid(torch.tensor(sigmoid_input)) diff --git a/mmcv/ops/__init__.py b/mmcv/ops/__init__.py new file mode 100644 index 0000000..0e4638e --- /dev/null +++ b/mmcv/ops/__init__.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .modulated_deform_conv import (ModulatedDeformConv2d, + ModulatedDeformConv2dPack, + modulated_deform_conv2d) +from .multi_scale_deform_attn import MultiScaleDeformableAttention +from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_batch, + points_in_boxes_cpu, points_in_boxes_gpu) +from .roi_align import RoIAlign, roi_align +from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev +from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, + sigmoid_focal_loss, softmax_focal_loss) +from .voxelize import Voxelization, voxelization +from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms +from .masked_conv import MaskedConv2d, masked_conv2d +from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d + + +# __all__ = [ +# 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', +# 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', +# 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', +# 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', +# 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', +# 'get_compiler_version', 'get_compiling_cuda_version', +# 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d', +# 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', +# 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', +# 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', +# 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', +# 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', +# 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', +# 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', +# 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', +# 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', +# 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', +# 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', +# 'border_align', 'gather_points', 'furthest_point_sample', +# 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', +# 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', +# 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', +# 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all', +# 'soft_nms', 'get_compiler_version', +# 'get_compiling_cuda_version', 'NaiveSyncBatchNorm1d', +# 'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization', +# 'dynamic_scatter', 'DynamicScatter', +# 'SparseBasicBlock', 'SparseBottleneck', +# 'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu', +# 'make_sparse_convmodule', 'ball_query', 'knn', 'furthest_point_sample', +# 
'furthest_point_sample_with_dist', 'three_interpolate', 'three_nn', +# 'gather_points', 'grouping_operation', 'group_points', 'GroupAll', +# 'QueryAndGroup', 'PointSAModule', 'PointSAModuleMSG', 'PointFPModule', +# 'points_in_boxes_batch', 'assign_score_withk', +# 'Points_Sampler', 'build_sa_module', +# 'PAConv', 'PAConvCUDA', 'PAConvSAModuleMSG', 'PAConvSAModule', +# 'PAConvCUDASAModule', 'PAConvCUDASAModuleMSG', +# 'Upsample', 'resize', 'Encoding' +# ] diff --git a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp new file mode 100644 index 0000000..67190dc --- /dev/null +++ b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp @@ -0,0 +1,343 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h +#pragma once +#include +#include + +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +namespace { + +template +struct RotatedBox { + T x_ctr, y_ctr, w, h, a; +}; + +template +struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point& p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point& operator+=(const Point& p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point& p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { + return A.x * B.x + A.y * B.y; +} + +template +HOST_DEVICE_INLINE T cross_2d(const Point& A, const Point& B) { + return A.x * B.y - B.x * A.y; +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox& box, + Point (&pts)[4]) { + // M_PI / 180. 
== 0.01745329251 + // double theta = box.a * 0.01745329251; + // MODIFIED + double theta = box.a; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template +HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], + const Point (&pts2)[4], + Point (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template +HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], + const int& num_in, Point (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
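+  // (The lowest point is always a vertex of the hull, so it is a safe pivot
+  // for the angular sort in Step 3.)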
+ int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + +#ifdef __CUDACC__ + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + std::sort(q + 1, q + num_in, + [](const Point& A, const Point& B) -> bool { + T temp = cross_2d(A, B); + if (fabs(temp) < 1e-6) { + return dot_2d(A, A) < dot_2d(B, B); + } else { + return temp > 0; + } + }); +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { + m--; + } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. + if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template +HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox& box1, + const RotatedBox& box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. 
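+  // shift_to_zero=true keeps the hull points relative to the pivot vertex;
+  // polygon_area only uses relative offsets, so shifting back is unnecessary
+  // here.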
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +} // namespace + +template +HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, + T const* const box2_raw, + const int mode_flag) { + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + const T area1 = box1.w * box1.h; + const T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + const T intersection = rotated_boxes_intersection(box1, box2); + T baseS = 1.0; + if (mode_flag == 0) { + baseS = (area1 + area2 - intersection); + } else if (mode_flag == 1) { + baseS = area1; + } + const T iou = intersection / baseS; + return iou; +} diff --git a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh new file mode 100644 index 0000000..bf0abf7 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh @@ -0,0 +1,112 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH +#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K) +// output: fout(B,O,N) +// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j) +// i(k) = idx(b,i,k) +// sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j) +// avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k +// max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j))) + +template +__global__ void assign_score_withk_forward_cuda_kernel( + const int B, const int N0, const int N1, const int M, const int K, + const int O, const int aggregate, const T* points, const T* centers, + const T* scores, const int64_t* knn_idx, T* output) { + // ----- parallel loop for B, N1, K and O --------- + long i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= B * N1 * K * O) return; + // ------- loop for M ---------- + const int b = (int)(i / (O * N1 * K)); + const int o = (int)(i % (O * N1 * K) / (N1 * K)); + const int n = (int)(i % (N1 * K) / K); + const int k = (int)(i % K); + const int cn = (int)knn_idx[b * K * N1 + n * K + + 0]; // The first neighbor is the center point + const int kn = (int)knn_idx[b * K * N1 + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + assert(b < B); + assert(kn < N0); + assert(cn < N0); + assert(o < O); + assert(n < N1); + const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; + T val = output[out_idx]; + for (int m = 0; m < M; m++) { + val += points[b * N0 * M * O + kn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m] - + centers[b * N0 * M * O + cn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m]; + } + output[out_idx] = val; +} + +template +__global__ void assign_score_withk_points_backward_cuda_kernel( + const int B, const int N0, const int N, const int M, const int K, + const int O, const int aggregate, 
const T* grad_out, const T* scores, + const int64_t* knn_idx, T* grad_points, T* grad_centers) { + // ----- parallel loop for B, M, O --------- + long i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= B * M * O) return; + int b = (int)(i / (M * O)); + int m = (int)(i % (M * O) / O); + int o = (int)(i % O); + + // ----- loop for N,K --------- + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + int kn = knn_idx[b * N * K + n * K + k]; + int cn = knn_idx[b * N * K + n * K + 0]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + continue; + } + atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, + scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, + -scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + } + } +} + +template +__global__ void assign_score_withk_scores_backward_cuda_kernel( + const int B, const int N0, const int N, const int M, const int K, + const int O, const int aggregate, const T* grad_out, const T* points, + const T* centers, const int64_t* knn_idx, T* grad_scores) { + // ----- parallel loop for B, N, K, M --------- + long i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= B * N * K * M) return; + const int b = (int)(i / (N * M * K)); + const int n = (int)(i % (N * M * K) / M / K); + const int k = (int)(i % (M * K) / M); + const int m = (int)(i % M); + const int cn = knn_idx[b * N * K + n * K + 0]; + const int kn = knn_idx[b * N * K + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + + // -------------- loop for O ------------------------ + const int out_idx = b * N * K * M + n * K * M + k * M + m; + T val = grad_scores[out_idx]; + for (int o = 0; o < O; o++) { + val += (points[b * N0 * M * O + kn * M * O + m * O + o] - + centers[b * N0 * M * O + cn * M * O + m * O + o]) * + grad_out[b * O * N * K + o * N * K + n * K + k]; + } + grad_scores[out_idx] = val; +} + +#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh new file mode 100644 index 0000000..2d88c63 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh @@ -0,0 +1,53 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu +#ifndef BALL_QUERY_CUDA_KERNEL_CUH +#define BALL_QUERY_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void ball_query_forward_cuda_kernel(int b, int n, int m, + float min_radius, + float max_radius, int nsample, + const T* new_xyz, const T* xyz, + int* idx) { + // new_xyz: (B, M, 3) + // xyz: (B, N, 3) + // output: + // idx: (B, M, nsample) + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= m) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + + float max_radius2 = max_radius * max_radius; + float min_radius2 = min_radius * min_radius; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; + + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = xyz[k * 3 + 0]; + T y = xyz[k * 3 + 1]; + T z = xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[l] = k; + } + } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; + } + } +} + +#endif // BALL_QUERY_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh new file mode 100644 index 0000000..27e2c70 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh @@ -0,0 +1,80 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH +#define BBOX_OVERLAPS_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, + T* ious, const int num_bbox1, + const int num_bbox2, const int mode, + const bool aligned, + const int offset) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, num_bbox1) { + int b1 = index; + int b2 = index; + + int base1 = b1 * 4; + T b1_x1 = bbox1[base1]; + T b1_y1 = bbox1[base1 + 1]; + T b1_x2 = bbox1[base1 + 2]; + T b1_y2 = bbox1[base1 + 3]; + T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + int base2 = b2 * 4; + T b2_x1 = bbox2[base2]; + T b2_y1 = bbox2[base2 + 1]; + T b2_x2 = bbox2[base2 + 2]; + T b2_y2 = bbox2[base2 + 3]; + T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + T width = fmaxf(right - left + offset, 0.f); + T height = fmaxf(bottom - top + offset, 0.f); + T interS = width * height; + T baseS = 1.0; + if (mode == 0) { + baseS = fmaxf(b1_area + b2_area - interS, T(offset)); + } else if (mode == 1) { + baseS = fmaxf(b1_area, T(offset)); + } + ious[index] = interS / baseS; + } + } else { + CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { + int b1 = index / num_bbox2; + int b2 = index % num_bbox2; + + int base1 = b1 * 4; + T b1_x1 = bbox1[base1]; + T b1_y1 = bbox1[base1 + 1]; + T b1_x2 = bbox1[base1 + 2]; + T b1_y2 = bbox1[base1 + 3]; + T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + int base2 = b2 * 4; + T b2_x1 = bbox2[base2]; + T b2_y1 = bbox2[base2 + 1]; + T b2_x2 = bbox2[base2 + 2]; + T b2_y2 = bbox2[base2 + 3]; + T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, 
b2_x2); + T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + T width = fmaxf(right - left + offset, 0.f); + T height = fmaxf(bottom - top + offset, 0.f); + T interS = width * height; + T baseS = 1.0; + if (mode == 0) { + baseS = fmaxf(b1_area + b2_area - interS, T(offset)); + } else if (mode == 1) { + baseS = fmaxf(b1_area, T(offset)); + } + ious[index] = interS / baseS; + } + } +} + +#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh new file mode 100644 index 0000000..49c7877 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh @@ -0,0 +1,196 @@ +// Copyright (c) OpenMMLab. All rights reserved +// modified from +// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu. +// the main difference: (1) use `argmax_idx` for fast computing of gradient +// during the backward. (2) `wh` is directly computed by `boxes`, rather than +// passing it as argument to forward or backward functions. + +#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH +#define BORDER_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_WITH_TRT + +enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 }; + +/*** Forward ***/ +template +__global__ void border_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* boxes, T* output, + int* argmax_idx, const int channels, const int box_size, const int height, + const int width, const int pool_size) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (batch_idx, c_idx, box_idx) is an element paralleled for computing + // output, and `extreme_idx` is in range [0,3] + int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx; + const T *offset_box, *offset_input, *offset_box_x; + T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y, + val, maxval; + + extreme_idx = threadIdx.y; + // shape (N, C, box_size, 4) for output + batch_idx = index / channels / box_size; + // shape (N, box_size, 4) for boxes + box_idx = index % box_size + batch_idx * box_size; + c_idx = (index / box_size) % channels; + + offset_box = boxes + box_idx * 4; + box_width = *(offset_box + 2) - *offset_box; + box_height = *(offset_box + 3) - *(offset_box + 1); + offset_output = output + index * 4 + extreme_idx; + offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; + // shape (N, 4C, h, w) for input. 
+ // [0,C) for top feature, [C,2C) for left feature, + // [2C,3C) for bottom feature, [3C,4C) for right feature + offset_input = + input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * + height * width; + + // extreme_idx in [0,1] -> offset_box_x indexed at x1 + // extreme_idx in [2,3] -> offset_box_x indexed at x2 + offset_box_x = offset_box + extreme_idx / 2 * 2; + + // (x1,y1) or (x2,y2) for (x,y) + x = *offset_box_x; + y = *(offset_box_x + 1); + + switch (extreme_idx) { + // top + case BorderMode::Top: + stride = box_width / pool_size; + x_stride = stride; + y_stride = 0; + break; + // left + case BorderMode::Left: + stride = box_height / pool_size; + x_stride = 0; + y_stride = stride; + break; + // bottom + case BorderMode::Bottom: + stride = box_width / pool_size; + x_stride = -stride; + y_stride = 0; + break; + // right + case BorderMode::Right: + stride = box_height / pool_size; + x_stride = 0; + y_stride = -stride; + break; + } + + // initialize maxval and maxidx with the start position (e.g. (x1,y1) or + // (x2,y2)) + maxval = bilinear_interpolate(offset_input, height, width, y, x, index); + maxidx = 0; + + // do max_pool along the border + for (int i = 1; i <= pool_size; i++) { + x += x_stride; + y += y_stride; + val = bilinear_interpolate(offset_input, height, width, y, x, index); + if (val > maxval) { + maxval = val; + maxidx = i; + } + } + + // update output and argmax_idx + *offset_output = maxval; + *offset_argmax_idx = maxidx; + } +} + +/*** Backward ***/ +template +__global__ void border_align_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* boxes, + const int* argmax_idx, T* grad_input, const int channels, + const int box_size, const int height, const int width, + const int pool_size) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (batch_idx, c_idx, box_idx) is an element paralleled for computing + // output, and `extreme_idx` is in range [0,3] + int batch_idx, c_idx, box_idx, extreme_idx; + const int* offset_argmax_idx; + const T *offset_grad_output, *offset_box, *offset_box_x; + T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x, + y; + + extreme_idx = threadIdx.y; + batch_idx = index / channels / box_size; + box_idx = index % box_size + batch_idx * box_size; + c_idx = (index / box_size) % channels; + + offset_box = boxes + box_idx * 4; + box_width = *(offset_box + 2) - *offset_box; + box_height = *(offset_box + 3) - *(offset_box + 1); + offset_grad_output = grad_output + index * 4 + extreme_idx; + offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; + // [0,C) for top feature grad, [C,2C) for left feature grad, + // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad + offset_grad_input = grad_input + (batch_idx * channels * 4 + + extreme_idx * channels + c_idx) * + height * width; + + // extreme_idx in [0,1] -> offset_box_x indexed at x1 + // extreme_idx in [2,3] -> offset_box_x indexed at x2 + offset_box_x = offset_box + extreme_idx / 2 * 2; + + switch (extreme_idx) { + // top + case BorderMode::Top: + stride = box_width / pool_size; + x_stride = stride; + y_stride = 0; + break; + // left + case BorderMode::Left: + stride = box_height / pool_size; + x_stride = 0; + y_stride = stride; + break; + // bottom + case BorderMode::Bottom: + stride = box_width / pool_size; + x_stride = -stride; + y_stride = 0; + break; + // right + case BorderMode::Right: + stride = box_height / pool_size; + x_stride = 0; + y_stride = -stride; + break; + } + + // get position (x,y) which has maximum value 
during forward + x = *offset_box_x; + y = *(offset_box_x + 1); + x += x_stride * (T)(*offset_argmax_idx); + y += y_stride * (T)(*offset_argmax_idx); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, + x_high, y_low, y_high, index); + + // update grad_output + atomicAdd(offset_grad_input + y_low * width + x_low, + *offset_grad_output * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + *offset_grad_output * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + *offset_grad_output * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + *offset_grad_output * w4); + } +} + +#endif // BORDER_ALIGN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh new file mode 100644 index 0000000..e7171e0 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh @@ -0,0 +1,78 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu +#ifndef BOX_IOU_ROTATED_CUDA_CUH +#define BOX_IOU_ROTATED_CUDA_CUH + + +#include "pytorch_cuda_helper.hpp" +#include "box_iou_rotated_utils.hpp" + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +template +__global__ void box_iou_rotated_cuda_kernel( + const int n_boxes1, const int n_boxes2, const T* dev_boxes1, + const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { + if (aligned) { + CUDA_1D_KERNEL_LOOP(index, n_boxes1) { + int b1 = index; + int b2 = index; + + int base1 = b1 * 5; + + float block_boxes1[5]; + float block_boxes2[5]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + + int base2 = b2 * 5; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + + dev_ious[index] = + single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); + } + } else { + CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { + int b1 = index / n_boxes2; + int b2 = index % n_boxes2; + + int base1 = b1 * 5; + + float block_boxes1[5]; + float block_boxes2[5]; + + block_boxes1[0] = dev_boxes1[base1 + 0]; + block_boxes1[1] = dev_boxes1[base1 + 1]; + block_boxes1[2] = dev_boxes1[base1 + 2]; + block_boxes1[3] = dev_boxes1[base1 + 3]; + block_boxes1[4] = dev_boxes1[base1 + 4]; + + int base2 = b2 * 5; + + block_boxes2[0] = dev_boxes2[base2 + 0]; + block_boxes2[1] = dev_boxes2[base2 + 1]; + block_boxes2[2] = dev_boxes2[base2 + 2]; + block_boxes2[3] = dev_boxes2[base2 + 3]; + block_boxes2[4] = dev_boxes2[base2 + 4]; + + dev_ious[index] = + single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); + } + } +} + +#endif diff --git a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh new file mode 100644 index 0000000..d77f87c --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh @@ -0,0 +1,328 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef CARAFE_CUDA_KERNEL_CUH +#define CARAFE_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +#ifdef HIP_DIFF +#define WARP_SIZE 64 +#else +#define WARP_SIZE 32 +#endif +#define THREADS_PER_PIXEL 32 +#define MAX_SHARED_MEMORY 49152 +#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 +#define MAXIMIZE_KERNEL_SIZE true +#define kTileDim 32 +#define kBlockRows 8 +#define FULL_MASK 0xffffffff + +inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } + +__device__ inline int Loc2Index(const int n, const int c, const int h, + const int w, const int channel_num, + const int height, const int width) { + int index = w + (h + (c + n * channel_num) * height) * width; + return index; +} +#ifndef HIP_DIFF +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} +#endif +template +__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) +#ifdef HIP_DIFF + val += __shfl_down(val, offset); +#else + val += __shfl_down_sync(FULL_MASK, val, offset); +#endif + return val; +} + +template <> +__device__ __forceinline__ phalf warpReduceSum(phalf val) { + for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) +#ifdef HIP_DIFF + __PHALF(val) += __shfl_down(FULL_MASK, val, offset); +#else + __PHALF(val) += + __shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset); +#endif + return val; +} + +// Splits the original matrix into submatrices with size 32 * 32. +// Each block transposes one submatrix by loading it into shared memory. +// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTranspose2DCUDAKernel(const int N, const int H, + const int W, const int dh, + const int dw, + const scalar_t *__restrict__ X, + scalar_t *__restrict__ Y) { + __shared__ scalar_t tile[kTileDim][kTileDim + 1]; + const int n = blockIdx.x / (dh * dw); + const int k = blockIdx.x % (dh * dw); + const int r = k / dw; + const int c = k % dw; + const int offset = n * H * W; + int x = c * kTileDim + threadIdx.x; + int y = r * kTileDim + threadIdx.y; + if (x < W) { + for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { + tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; + } + } + __syncthreads(); + x = r * kTileDim + threadIdx.x; + y = c * kTileDim + threadIdx.y; + if (x < H) { + for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { + Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; + } + } +} +template +__global__ void CARAFEForward( + const int num_kernels, const scalar_t *__restrict__ bottom_data, + const scalar_t *__restrict__ bottom_masks, const int kernel_size, + const int group_size, const int scale_factor, const int channels, + const int down_height, const int down_width, const int height, + const int width, const int mask_channels, scalar_t *__restrict__ top_data) { +#if MAXIMIZE_KERNEL_SIZE + __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; +#else + __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; +#endif + + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + index = index / THREADS_PER_PIXEL; + const int pw = index % 
width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int down_pw = pw / scale_factor; + const int down_ph = ph / scale_factor; + + const int start_w = down_pw - (kernel_size - 1) / 2; + const int end_w = down_pw + (kernel_size - 1) / 2 + 1; + const int start_h = down_ph - (kernel_size - 1) / 2; + const int end_h = down_ph + (kernel_size - 1) / 2 + 1; + for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { + int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels); + shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; + } + __syncthreads(); + + const int channels_per_group = ceilf(channels / (float)group_size); +#pragma unroll + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + int mask_group = c / channels_per_group; + scalar_t output_val = 0; +#pragma unroll + for (int iy = start_h; iy < end_h; iy++) { +#pragma unroll + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = + Loc2Index(n, iy, ix, c, down_height, down_width, channels); + + output_val += bottom_data[feat_index] * + shared_mask[mask_c * WARP_SIZE + pixel_id]; + } + } + + int top_index = Loc2Index(n, ph, pw, c, height, width, channels); + top_data[top_index] = output_val; + } +} + +template +__global__ void CARAFEBackward_Feature( + const int num_kernels, const scalar_t *__restrict__ top_diff, + const scalar_t *__restrict__ bottom_masks, const int kernel_size, + const int group_size, const int scale_factor, const int channels, + const int down_height, const int down_width, const int height, + const int width, const int mask_channels, + scalar_t *__restrict__ bottom_diff) { +#if MAXIMIZE_KERNEL_SIZE + __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; +#else + __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; +#endif + + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + + const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + // (n, c, ph, pw) is an element in the bottom_data + index = index / THREADS_PER_PIXEL; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int start_w = pw - (kernel_size - 1) * scale_factor / 2; + const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1; + const int start_h = ph - (kernel_size - 1) * scale_factor / 2; + const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1; + for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { + const int mask_w = (c % kernel_size) * scale_factor; + const int mask_h = (c / kernel_size % kernel_size) * scale_factor; + const int mask_x = start_w + mask_w; + const int mask_y = start_h + mask_h; + if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) { + shared_mask[c * WARP_SIZE + pixel_id] = 0; + continue; + } + const int mask_group = c / (kernel_size * kernel_size); + const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1; + int mask_index = + Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width); + shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; + } + __syncthreads(); + const int channels_per_group = 
ceilf(channels / (float)group_size); +#pragma unroll + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + int mask_group = c / channels_per_group; + int top_index = Loc2Index(n, ph, pw, c, height, width, channels); + scalar_t output_val = 0; +#pragma unroll + for (int iy = start_h; iy < end_h; iy += scale_factor) { +#pragma unroll + for (int ix = start_w; ix < end_w; ix += scale_factor) { + if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) { + continue; + } + int mask_iy = + (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor; + int mask_ix = + (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = Loc2Index(n, iy, ix, c, height, width, channels); + output_val += + shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index]; + } + } + bottom_diff[top_index] = output_val; + } +} + +template +__global__ void FeatureSum(const int num_kernels, + const scalar_t *__restrict__ input_data, + const int scale_factor, const int channels, + const int height, const int width, + scalar_t *__restrict__ output_data) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + const int split_id = threadIdx.x % THREADS_PER_PIXEL; + index = index / THREADS_PER_PIXEL; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { + scalar_t output_val = 0; + for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) { + for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) { + int input_id = Loc2Index(n, iy, ix, c, height * scale_factor, + width * scale_factor, channels); + output_val += input_data[input_id]; + } + } + const int output_id = Loc2Index(n, ph, pw, c, height, width, channels); + output_data[output_id] = output_val; + } +} + +template +__global__ void CARAFEBackward_Mask(const int num_kernels, + const scalar_t *__restrict__ top_diff, + const scalar_t *__restrict__ bottom_data, + const int kernel_size, const int group_size, + const int scale_factor, const int channels, + const int down_height, const int down_width, + const int height, const int width, + const int mask_channels, + scalar_t *__restrict__ mask_diff) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index > num_kernels - 1) { + return; + } + + const int lane_id = index % WARP_SIZE; + index = index / WARP_SIZE; + const int mask_c = index % mask_channels; + // (n, c, ph, pw) is an element in the bottom_data + index = index / mask_channels; + const int pw = index % width; + const int ph = (index / width) % height; + const int n = index / width / height; + + const int down_pw = pw / scale_factor; + const int down_ph = ph / scale_factor; + + const int mask_group = mask_c / (kernel_size * kernel_size); + const int mask_loc = mask_c % (kernel_size * kernel_size); + + const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2; + const int offset_y = + mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2; + + const int down_x = down_pw + offset_x; + const int down_y = down_ph + offset_y; + + scalar_t output_val = 0; + + if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 && + down_x <= down_width - 1) { + const int channels_per_mask = ceilf(channels / (float)group_size); + const int start = channels_per_mask * mask_group; + const int end = min(channels_per_mask * (mask_group + 1), 
channels); + for (int c = start + lane_id; c < end; c += WARP_SIZE) { + int bottom_id = + Loc2Index(n, down_y, down_x, c, down_height, down_width, channels); + int top_id = Loc2Index(n, ph, pw, c, height, width, channels); + output_val += top_diff[top_id] * bottom_data[bottom_id]; + } + } +#ifdef HIP_DIFF + __syncthreads(); +#else + __syncwarp(); +#endif + output_val = warpReduceSum(output_val); + if (lane_id == 0) { + const int mask_id = + Loc2Index(n, ph, pw, mask_c, height, width, mask_channels); + mask_diff[mask_id] = output_val; + } +} + +#endif // CARAFE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh new file mode 100644 index 0000000..0a4ab87 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh @@ -0,0 +1,107 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH +#define CARAFE_NAIVE_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +__device__ inline int Loc2Index(const int n, const int c, const int h, + const int w, const int channel_num, + const int height, const int width) { + int index = w + (h + (c + n * channel_num) * height) * width; + return index; +} + +template +__global__ void carafe_naive_forward_cuda_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size, + const int group_size, const int scale_factor, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the bottom_data + int pw = index % width; + int ph = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + int mask_channels = kernel_size * kernel_size * group_size; + int mask_group = c / (channels / group_size); + + int down_pw = pw / scale_factor; + int down_ph = ph / scale_factor; + int down_width = width / scale_factor; + int down_height = height / scale_factor; + int start_w = down_pw - (kernel_size - 1) / 2; + int end_w = down_pw + (kernel_size - 1) / 2 + 1; + int start_h = down_ph - (kernel_size - 1) / 2; + int end_h = down_ph + (kernel_size - 1) / 2 + 1; + + scalar_t output_val = 0; + for (int iy = start_h; iy < end_h; iy++) { + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = + Loc2Index(n, c, iy, ix, channels, down_height, down_width); + int mask_index = + Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); + output_val += bottom_data[feat_index] * bottom_masks[mask_index]; + } + } + top_data[index] = output_val; + } +} + +template +__global__ void carafe_naive_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data, + const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff, + const int kernel_size, const int group_size, const int scale_factor, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the bottom_data + int pw = index % width; + int ph = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + int mask_channels = 
kernel_size * kernel_size * group_size; + int mask_group = c / (channels / group_size); + + int down_pw = pw / scale_factor; + int down_ph = ph / scale_factor; + int down_width = width / scale_factor; + int down_height = height / scale_factor; + int start_w = down_pw - (kernel_size - 1) / 2; + int end_w = down_pw + (kernel_size - 1) / 2 + 1; + int start_h = down_ph - (kernel_size - 1) / 2; + int end_h = down_ph + (kernel_size - 1) / 2 + 1; + + for (int iy = start_h; iy < end_h; iy++) { + for (int ix = start_w; ix < end_w; ix++) { + if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { + continue; + } + int mask_iy = iy - down_ph + (kernel_size - 1) / 2; + int mask_ix = ix - down_pw + (kernel_size - 1) / 2; + int mask_c = + (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; + int feat_index = + Loc2Index(n, c, iy, ix, channels, down_height, down_width); + int mask_index = + Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); + atomicAdd(bottom_diff + feat_index, + bottom_masks[mask_index] * top_diff[index]); + atomicAdd(mask_diff + mask_index, + bottom_data[feat_index] * top_diff[index]); + } + } + } +} + +#endif // CARAFE_NAIVE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp new file mode 100644 index 0000000..dc5df17 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp @@ -0,0 +1,112 @@ +#ifndef COMMON_CUDA_HELPER +#define COMMON_CUDA_HELPER + +#include + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +#define THREADS_PER_BLOCK 512 + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +inline int GET_BLOCKS(const int N) { + int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + int max_block_num = 4096; + return min(optimal_block_num, max_block_num); +} + +template +__device__ T bilinear_interpolate(const T* input, const int height, + const int width, T y, T x, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + T v1 = input[y_low * width + x_low]; + T v2 = input[y_low * width + x_high]; + T v3 = input[y_high * width + x_low]; + T v4 = input[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__device__ void bilinear_interpolate_gradient( + const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, + int& x_low, int& x_high, int& y_low, int& y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} +#endif // COMMON_CUDA_HELPER diff --git a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh new file mode 100644 index 0000000..0ef3fae --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh @@ -0,0 +1,227 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu +// Original licence: Under MIT License + +#ifndef CORRELATION_CUDA +#define CORRELATION_CUDA + +#include "pytorch_cuda_helper.hpp" + +#include +#include +// Using is recommended in the official documentation in +// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. +// However, we use for compatibility with CUDA 9.0 +// Read https://github.com/pytorch/extension-cpp/issues/35 for more details. 
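+// ---------------------------------------------------------------------------
+// Editorial sketch (not part of the upstream mmcv source): the forward kernel
+// in this file is written for a launch that assigns one CUDA block to each
+// output location (n, h, w) and THREADS_FORWARD threads to the channel axis,
+// roughly:
+//   dim3 grid(batch_size, output_height, output_width);   // placeholder names
+//   dim3 block(THREADS_FORWARD);
+//   correlation_forward_cuda_kernel<scalar_t>
+//       <<<grid, block, 0, stream>>>(/* packed accessors and geometry args */);
+// Each thread accumulates a partial dot product over its slice of channels in
+// the shared array prod_sum[]; thread 0 then reduces prod_sum[] and writes
+// output[n][ph][pw][h][w] for every (ph, pw) patch displacement.
+// ---------------------------------------------------------------------------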
+#include + +#include +#include + +using namespace torch; + +#define TensorAcc4R PackedTensorAccessor32 +#define TensorAcc5R PackedTensorAccessor32 +#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W) + +#define THREADS_FORWARD 32 +#define THREADS_BACKWARD 16 + +template +__global__ void correlation_forward_cuda_kernel( + const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output, + int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, + int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW) { + const int iH = rInput1.size(1); + const int iW = rInput1.size(2); + const int C = rInput1.size(3); + + const int n = blockIdx.x; + const int h = blockIdx.y; + const int w = blockIdx.z; + const int thread = threadIdx.x; + + const int start_i = -padH + h * dH; + const int start_j = -padW + w * dW; + + const int patchRadH = dilation_patchH * (patchH - 1) / 2; + const int patchRadW = dilation_patchW * (patchW - 1) / 2; + + __shared__ scalar_t prod_sum[THREADS_FORWARD]; + + for (int ph = 0; ph < patchH; ++ph) { + int ph_dilated = ph * dilation_patchH - patchRadH; + for (int pw = 0; pw < patchW; ++pw) { + int pw_dilated = pw * dilation_patchW - patchRadW; + prod_sum[thread] = 0; + for (int i = 0; i < kH; ++i) { + int i1 = start_i + i * dilationH; + int i2 = i1 + ph_dilated; + if + WITHIN_BOUNDS(i1, i2, iH, iH) { + for (int j = 0; j < kW; ++j) { + int j1 = start_j + j * dilationW; + int j2 = j1 + pw_dilated; + if + WITHIN_BOUNDS(j1, j2, iW, iW) { + for (int c = thread; c < C; c += THREADS_FORWARD) { + scalar_t v1 = rInput1[n][i1][j1][c]; + scalar_t v2 = rInput2[n][i2][j2][c]; + prod_sum[thread] += v1 * v2; + } + } + } + } + } + // accumulate + __syncthreads(); + if (thread == 0) { + scalar_t reduce_sum = 0; + for (int index = 0; index < THREADS_FORWARD; ++index) { + reduce_sum += prod_sum[index]; + } + output[n][ph][pw][h][w] = reduce_sum; + } + } + } +} + +template +__global__ void correlation_backward_cuda_kernel_input1( + const TensorAcc5R grad_output, const TensorAcc4R input2, + TensorAcc4R grad_input1, const int kH, const int kW, const int patchH, + const int patchW, const int padH, const int padW, const int dilationH, + const int dilationW, const int dilation_patchH, const int dilation_patchW, + const int dH, const int dW, const int batch) { + const int iH = input2.size(2); + const int iW = input2.size(3); + + const int H = grad_output.size(3); + const int W = grad_output.size(4); + + const int patchRadH = (patchH - 1) / 2; + const int patchRadW = (patchW - 1) / 2; + + const int n = batch; + const int c = blockIdx.x; + const int h = blockIdx.y; + const int w = blockIdx.z; + const int ph_off = threadIdx.x; + const int pw_off = threadIdx.y; + + const int h_2 = h + padH; + const int w_2 = w + padW; + const int min_h = h_2 - kH * dilationH; + const int min_w = w_2 - kW * dilationW; + + __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; + prod_sum[ph_off][pw_off] = 0; + + for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { + int i1 = h + dilation_patchH * (ph - patchRadH); + for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { + int j1 = w + dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t val = input2[n][c][i1][j1]; + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if + WITHIN_BOUNDS(i2, j2, H, 
W) { + prod_sum[ph_off][pw_off] += + grad_output[n][ph][pw][i2][j2] * val; + } + } + } + } + } + } + + __syncthreads(); + + if (ph_off == 0 && pw_off == 0) { + scalar_t reduce_sum = 0; + for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { + for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { + reduce_sum += prod_sum[ph][pw]; + } + } + grad_input1[n][c][h][w] = reduce_sum; + } +} + +template +__global__ void correlation_backward_cuda_kernel_input2( + const TensorAcc5R grad_output, const TensorAcc4R input1, + TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW, int batch) { + const int iH = input1.size(2); + const int iW = input1.size(3); + + const int patchRadH = (patchH - 1) / 2; + const int patchRadW = (patchW - 1) / 2; + + const int H = grad_output.size(3); + const int W = grad_output.size(4); + + const int dilatedKH = kH * dilationH; + const int dilatedKW = kW * dilationW; + + const int n = batch; + const int c = blockIdx.x; + const int h = blockIdx.y; + const int w = blockIdx.z; + const int ph_off = threadIdx.x; + const int pw_off = threadIdx.y; + + __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; + prod_sum[ph_off][pw_off] = 0; + + for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { + int i1 = h - dilation_patchH * (ph - patchRadH); + for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { + int j1 = w - dilation_patchW * (pw - patchRadW); + if + WITHIN_BOUNDS(i1, j1, iH, iW) { + scalar_t val = input1[n][c][i1][j1]; + + const int h_2 = i1 + padH; + const int w_2 = j1 + padW; + const int min_h = h_2 - dilatedKH; + const int min_w = w_2 - dilatedKW; + + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if + WITHIN_BOUNDS(i2, j2, H, W) { + prod_sum[ph_off][pw_off] += + grad_output[n][ph][pw][i2][j2] * val; + } + } + } + } + } + } + + __syncthreads(); + + if (ph_off == 0 && pw_off == 0) { + scalar_t reduce_sum = 0; + for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { + for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { + reduce_sum += prod_sum[ph][pw]; + } + } + grad_input2[n][c][h][w] = reduce_sum; + } +} +#endif diff --git a/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh new file mode 100644 index 0000000..98e1e7a --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh @@ -0,0 +1,363 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer + ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. 
+ * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer + ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#ifndef DEFORM_CONV_CUDA_KERNEL_CUH +#define DEFORM_CONV_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_WITH_TRT + +template +__device__ T deformable_im2col_bilinear(const T *input, const int data_width, + const int height, const int width, T h, + T w) { + if (h <= -1 || height <= h || w <= -1 || width <= w) { + return 0; + } + + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height, + const int width, const T *im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if 
(argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel( + const int n, const T *data_im, const T *data_offset, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, + h_im, w_im); + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const T *data_col, const T *data_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % 
width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int offset_channels, const int deformable_group, const int height_col, + const int width_col, T *grad_offset) { + CUDA_1D_KERNEL_LOOP(index, n) { + T val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) 
* width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + const T weight = get_coordinate_weight(inv_h, inv_w, height, width, + data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +#endif // DEFORM_CONV_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh new file mode 100644 index 0000000..ac95b35 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh @@ -0,0 +1,182 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH +#define DEFORM_ROI_POOL_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void deform_roi_pool_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, const T* offset, + T* output, const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, const T gamma, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_rois[1] * spatial_scale - 0.5; + T roi_start_h = offset_rois[2] * spatial_scale - 0.5; + T roi_end_w = offset_rois[3] * spatial_scale - 0.5; + T roi_end_h = offset_rois[4] * spatial_scale - 0.5; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? 
sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // Compute roi offset + if (offset != NULL) { + const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw; + T offset_roi_w = gamma * roi_width * offset_cur_w[0]; + T offset_roi_h = + gamma * roi_height * offset_cur_w[pooled_width * pooled_height]; + roi_start_w += offset_roi_w; + roi_start_h += offset_roi_h; + } + + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = bilinear_interpolate(offset_input, height, width, y, x, index); + output_val += val; + } + } + output[index] = output_val / count; + } +} + +template +__global__ void deform_roi_pool_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* input, const T* rois, + const T* offset, T* grad_input, T* grad_offset, const int pooled_height, + const int pooled_width, const T spatial_scale, const int sampling_ratio, + const T gamma, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + const T* offset_input = + input + ((roi_batch_ind * channels + c) * height * width); + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_rois[1] * spatial_scale - 0.5; + T roi_start_h = offset_rois[2] * spatial_scale - 0.5; + T roi_end_w = offset_rois[3] * spatial_scale - 0.5; + T roi_end_h = offset_rois[4] * spatial_scale - 0.5; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // Compute roi offset + if (offset != NULL) { + const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw; + T offset_roi_w = gamma * roi_width * offset_cur_w[0]; + T offset_roi_h = + gamma * roi_height * offset_cur_w[pooled_width * pooled_height]; + roi_start_w += offset_roi_w; + roi_start_h += offset_roi_h; + } + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + const T grad_output_this_bin = grad_output[index] / count; + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4); + if (offset != NULL) { + T input_00 = offset_input[y_low * width + x_low]; + T input_10 = offset_input[y_low * width + x_high]; + T input_01 = offset_input[y_high * width + x_low]; + T input_11 = offset_input[y_high * width + x_high]; + T ogx = gamma * roi_width * grad_output_this_bin * + (input_11 * (y - y_low) + input_10 * (y_high - y) + + input_01 * (y_low - y) + input_00 * (y - y_high)); + T ogy = gamma * roi_height * grad_output_this_bin * + (input_11 * (x - x_low) + input_01 * (x_high - x) + + input_10 * (x_low - x) + input_00 * (x - x_high)); + atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + + ph * pooled_width + pw, + ogx); + atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + + pooled_width * pooled_height + ph * pooled_width + pw, + ogy); + } + } + } + } + } +} + +#endif // DEFORM_ROI_POOL_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh new file mode 100644 index 0000000..c23278a --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh @@ -0,0 +1,148 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH +#define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, + int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? 
i2 : i1; +} + +template +__global__ void furthest_point_sampling_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, 3) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + // if (mag <= 1e-3) + // continue; + + float d = + (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + +#pragma unroll + for (int block_size_thres = 1024; block_size_thres >= 2; + block_size_thres >>= 1) { + const int tid_thres = block_size_thres / 2; + if (block_size >= block_size_thres && tid < tid_thres) { + __update(dists, dists_i, tid, tid + tid_thres); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +// Modified from +// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu +template +__global__ void furthest_point_sampling_with_dist_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + // dataset: (B, N, N) + // tmp: (B, N) + // output: + // idx: (B, M) + + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * n; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + // float x1 = dataset[old * 3 + 0]; + // float y1 = dataset[old * 3 + 1]; + // float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + // float x2, y2, z2; + // x2 = dataset[k * 3 + 0]; + // y2 = dataset[k * 3 + 1]; + // z2 = dataset[k * 3 + 2]; + + // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * + // (z2 - z1); + float d = dataset[old * n + k]; + + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? 
d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + +#pragma unroll + for (int block_size_thres = 1024; block_size_thres >= 2; + block_size_thres >>= 1) { + const int tid_thres = block_size_thres / 2; + if (block_size >= block_size_thres && tid < tid_thres) { + __update(dists, dists_i, tid, tid + tid_thres); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +#endif // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh new file mode 100644 index 0000000..0008453 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh @@ -0,0 +1,52 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef GATHER_POINTS_CUDA_KERNEL_CUH +#define GATHER_POINTS_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +#define TOTAL_THREADS 1024 + +template +__global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, + const T *points, + const int *__restrict__ idx, + T *out) { + // points: (B, C, N) + // idx: (B, M) + // output: + // out: (B, C, M) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + + out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + points += bs_idx * c * n + c_idx * n; + out[0] = points[idx[0]]; +} + +template +__global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, + const T *grad_out, + const int *__restrict__ idx, + T *grad_points) { + // grad_out: (B, C, M) + // idx: (B, M) + // output: + // grad_points: (B, C, N) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + + grad_out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + grad_points += bs_idx * c * n + c_idx * n; + + atomicAdd(grad_points + idx[0], grad_out[0]); +} + +#endif // GATHER_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh new file mode 100644 index 0000000..ffbc1f9 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh @@ -0,0 +1,59 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
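+// ---------------------------------------------------------------------------
+// Editorial note (illustrative, not part of the upstream mmcv source): the
+// forward kernel below performs a gather
+//   out[b][c][p][s] = points[b][c][ idx[b][p][s] ]
+// for every (batch b, channel c, query point p, neighbour slot s), and the
+// backward kernel scatters gradients to the same indices with atomicAdd, so
+// neighbours that appear in several slots accumulate their gradients correctly.
+// ---------------------------------------------------------------------------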
+// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu +#ifndef GROUP_POINTS_CUDA_KERNEL_CUH +#define GROUP_POINTS_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void group_points_forward_cuda_kernel(int b, int c, int n, + int npoints, int nsample, + const T *points, + const int *__restrict__ idx, + T *out) { + // points: (B, C, N) + // idx: (B, npoints, nsample) + // output: + // out: (B, C, npoints, nsample) + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + + int sample_idx = index % nsample; + + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int in_idx = bs_idx * c * n + c_idx * n + idx[0]; + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + + out[out_idx] = points[in_idx]; +} + +template +__global__ void group_points_backward_cuda_kernel(int b, int c, int n, + int npoints, int nsample, + const T *grad_out, + const int *__restrict__ idx, + T *grad_points) { + // grad_out: (B, C, npoints, nsample) + // idx: (B, npoints, nsample) + // output: + // grad_points: (B, C, N) + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + + int sample_idx = index % nsample; + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + + atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); +} + +#endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh new file mode 100644 index 0000000..c85ca64 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh @@ -0,0 +1,365 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef IOU3D_CUDA_KERNEL_CUH +#define IOU3D_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +const int THREADS_PER_BLOCK_IOU3D = 16; +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +__device__ const float EPS = 1e-8; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y) { x = _x, y = _y; } + + __device__ void set(float _x, float _y) { + x = _x; + y = _y; + } + + __device__ Point operator+(const Point &b) const { + return Point(x + b.x, y + b.y); + } + + __device__ Point operator-(const Point &b) const { + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b) { + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, + const Point &p0) { + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, + const Point &q1, const Point &q2) { + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && + min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && + min(q1.y, q2.y) <= max(p1.y, p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p) { + // params: box (5) [x1, y1, x2, y2, angle] + const float MARGIN = 1e-5; + + float center_x = (box[0] + box[2]) / 2; + float center_y = (box[1] + box[3]) / 2; + float angle_cos = cos(-box[4]), + angle_sin = + sin(-box[4]); // rotate the point in the opposite direction of box + float rot_x = + (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x; + float rot_y = + (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; + + return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && + rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, + const Point &q1, const Point &q0, + Point &ans_point) { + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if (fabs(s5 - s1) > EPS) { + ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } else { + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans_point.x = (b0 * c1 - b1 * c0) / D; + ans_point.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, + const float angle_cos, + const float angle_sin, Point &p) { + float new_x = + (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; + float new_y = + (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, + const Point ¢er) { + return atan2(a.y - center.y, a.x - center.x) > + atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, const float *box_b) { + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + + float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], + a_angle 
= box_a[4]; + float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], + b_angle = box_b[4]; + + Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); + Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++) { + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + flag = intersection(box_a_corners[i + 1], box_a_corners[i], + box_b_corners[j + 1], box_b_corners[j], + cross_points[cnt]); + if (flag) { + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++) { + if (check_in_box2d(box_a, box_b_corners[k])) { + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])) { + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++) { + for (int i = 0; i < cnt - j - 1; i++) { + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++) { + area += cross(cross_points[k] - cross_points[0], + cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b) { + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); + float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( + const int num_a, const float *boxes_a, const int num_b, + const float *boxes_b, float *ans_overlap) { + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + const float *cur_box_a = boxes_a + a_idx * 5; + const float *cur_box_b = boxes_b + b_idx * 5; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; +} + +__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a, + const float *boxes_a, + const int num_b, + const float *boxes_b, + float *ans_iou) { + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * 
THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + + const float *cur_box_a = boxes_a + a_idx * 5; + const float *cur_box_b = boxes_b + b_idx * 5; + float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void nms_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 5) [x1, y1, x2, y2, ry] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +__device__ inline float iou_normal(float const *const a, float const *const b) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0]) * (a[3] - a[1]); + float Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / fmaxf(Sa + Sb - interS, EPS); +} + +__global__ void nms_normal_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 5) [x1, y1, x2, y2, ry] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * 
col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +#endif // IOU3D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh new file mode 100644 index 0000000..9a48cb0 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh @@ -0,0 +1,87 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap +#ifndef KNN_CUDA_KERNEL_CUH +#define KNN_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +inline __device__ void swap_float(float *x, float *y) { + float tmp = *x; + *x = *y; + *y = tmp; +} + +inline __device__ void swap_int(int *x, int *y) { + int tmp = *x; + *x = *y; + *y = tmp; +} + +__device__ void reheap(float *dist, int *idx, int k) { + int root = 0; + int child = root * 2 + 1; + while (child < k) { + if (child + 1 < k && dist[child + 1] > dist[child]) child++; + if (dist[root] > dist[child]) return; + swap_float(&dist[root], &dist[child]); + swap_int(&idx[root], &idx[child]); + root = child; + child = root * 2 + 1; + } +} + +__device__ void heap_sort(float *dist, int *idx, int k) { + int i; + for (i = k - 1; i > 0; i--) { + swap_float(&dist[0], &dist[i]); + swap_int(&idx[0], &idx[i]); + reheap(dist, idx, i); + } +} + +// input: xyz (b, n, 3) new_xyz (b, m, 3) +// output: idx (b, m, nsample) dist2 (b, m, nsample) +template +__global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, + const T *xyz, const T *new_xyz, + int *__restrict__ idx, T *dist2) { + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= m) return; + + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + dist2 += bs_idx * m * nsample + pt_idx * nsample; + + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; + + float best_dist[100]; + int best_idx[100]; + for (int i = 0; i < nsample; i++) { + best_dist[i] = 1e10; + best_idx[i] = 0; + } + for (int i = 0; i < n; i++) { + T x = xyz[i * 3 + 0]; + T y = xyz[i * 3 + 1]; + T z = xyz[i * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]) { + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); + } + } + heap_sort(best_dist, best_idx, nsample); + for (int i = 0; i < nsample; i++) { + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; + } +} + +#endif // KNN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh new file mode 100644 index 0000000..b11b3cd --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef MASKED_CONV2D_CUDA_KERNEL_CUH +#define MASKED_CONV2D_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im, + const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int64_t *mask_h_idx, + const int64_t *mask_w_idx, + const int mask_cnt, scalar_t *data_col) { + // mask_cnt * channels + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_col = mask_h_idx[m_index]; + const int w_col = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col - pad_h; + const int w_offset = w_col - pad_w; + scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; + for (int i = 0; i < kernel_h; ++i) { + int h_im = h_offset + i; + for (int j = 0; j < kernel_w; ++j) { + int w_im = w_offset + j; + if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + *data_col_ptr = + (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; + } else { + *data_col_ptr = 0.0; + } + data_col_ptr += mask_cnt; + } + } + } +} + +template +__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, + const int height, const int width, + const int channels, + const int64_t *mask_h_idx, + const int64_t *mask_w_idx, + const int mask_cnt, scalar_t *data_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int m_index = index % mask_cnt; + const int h_im = mask_h_idx[m_index]; + const int w_im = mask_w_idx[m_index]; + const int c_im = index / mask_cnt; + // compute the start and end of the output + data_im[(c_im * height + h_im) * width + w_im] = data_col[index]; + } +} + +#endif // MASKED_CONV2D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh new file mode 100644 index 0000000..b29c74e --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh @@ -0,0 +1,395 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer + ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer + ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH +#define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_WITH_TRT + +template +__device__ T dmcn_im2col_bilinear(const T *input, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == 
argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w, + const int height, const int width, + const T *im_data, const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel( + const int n, const T *data_im, const T *data_offset, const T *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const T *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * 
kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, + w_im); + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel( + const int n, const T *data_col, const T *data_offset, const T *data_mask, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void 
modulated_deformable_col2im_coord_gpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const T *data_mask, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, T *grad_offset, T *grad_mask) { + CUDA_1D_KERNEL_LOOP(index, n) { + T val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + else + mval += data_col_ptr[col_pos] * + dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + const T weight = dmcn_get_coordinate_weight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + 
kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + +#endif // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh new file mode 100644 index 0000000..aff1ea2 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh @@ -0,0 +1,800 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ +#ifndef DEFORM_ATTN_CUDA_KERNEL +#define DEFORM_ATTN_CUDA_KERNEL + +#include "common_cuda_helper.hpp" +#include "pytorch_cuda_helper.hpp" + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) { + return (N + num_threads - 1) / num_threads; +} + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ void ms_deform_attn_col2im_bilinear( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const 
int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + +template +__device__ void ms_deform_attn_col2im_bilinear_gm( + const scalar_t *&bottom_data, const int &height, const int &width, + const int &nheads, const int &channels, const scalar_t &h, + const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, + const scalar_t &attn_weight, scalar_t *&grad_value, + scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + const int h_low = floorf(h); + const int w_low = floorf(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value + ptr1, w1 * top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value + ptr2, w2 * top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) { + const int ptr3 
= h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value + ptr3, w3 * top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value + ptr4, w4 * top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + +template +__global__ void ms_deformable_im2col_gpu_kernel( + const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, const int batch_size, + const int spatial_size, const int num_heads, const int channels, + const int num_levels, const int num_query, const int num_point, + scalar_t *data_col) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = + data_value + + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, + spatial_w, num_heads, channels, + h_im, w_im, m_col, c_col) * + weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % 
channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockSize; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; 
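+    // Descriptive note on the index math below: the flat thread index enumerates
+    // output columns laid out as (batch, query, head, channel), slowest to fastest.
+    // The repeated '%' / '/=' operations peel off c_col (channel), then m_col
+    // (attention head), then the query dimension, leaving b_col (batch).
+    // Each thread accumulates its per-sampling-point gradient contribution into the
+    // shared-memory caches declared above; the blockSize/2, /4, ... tree-reduction
+    // loop further down folds those partial sums into grad_sampling_loc and
+    // grad_attn_weight before moving on to the next level/point.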
+ _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= 
num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + if (tid == 0) { + scalar_t _grad_w = cache_grad_sampling_loc[0], + _grad_h = cache_grad_sampling_loc[1], + _grad_a = cache_grad_attn_weight[0]; + int sid = 2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = 
grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = 
cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight + threadIdx.x) = 0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc + (threadIdx.x << 1), + cache_grad_attn_weight + threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; + s >>= 1, spre >>= 1) { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) { + cache_grad_attn_weight[tid] += + cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += + cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += + cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm( + const int n, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int 
num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + CUDA_1D_KERNEL_LOOP(index, n) { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col = 0; l_col < num_levels; ++l_col) { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = + data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col = 0; p_col < num_point; ++p_col) { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, + w_im, m_col, c_col, top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} +#endif // DEFORM_ATTN_CUDA_KERNEL diff --git a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh new file mode 100644 index 0000000..2bd4ef8 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh @@ -0,0 +1,70 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef NMS_CUDA_KERNEL_CUH +#define NMS_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_WITH_TRT + +int const threadsPerBlock = sizeof(unsigned long long int) * 8; + +__device__ inline bool devIoU(float const *const a, float const *const b, + const int offset, const float threshold) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left + offset, 0.f), + height = fmaxf(bottom - top + offset, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); + float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); + return interS > threshold * (Sa + Sb - interS); +} + +__global__ void nms_cuda(const int n_boxes, const float iou_threshold, + const int offset, const float *dev_boxes, + unsigned long long *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + const int tid = threadIdx.x; + + if (row_start > col_start) return; + + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + } + __syncthreads(); + + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + tid; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; + } +} +#endif // NMS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh new file mode 100644 index 0000000..cf4863e --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh @@ -0,0 +1,131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +#ifndef NMS_ROTATED_CUDA_CUH +#define NMS_ROTATED_CUDA_CUH + +#include "pytorch_cuda_helper.hpp" +#include "box_iou_rotated_utils.hpp" + +__host__ __device__ inline int divideUP(const int x, const int y) { + return (((x) + (y)-1) / (y)); +} + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_rotated_cuda_kernel(const int n_boxes, + const float iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask, + const int multi_label) { + // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel + + if (multi_label == 1) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. + __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 6 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; + block_boxes[threadIdx.x * 6 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; + block_boxes[threadIdx.x * 6 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; + block_boxes[threadIdx.x * 6 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; + block_boxes[threadIdx.x * 6 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; + block_boxes[threadIdx.x * 6 + 5] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 6; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from + // box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 6, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } else { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. 
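+    // Orientation sketch (assumed usage, not shown in this header): the
+    // launch grid is expected to be (col_blocks, col_blocks) with
+    // col_blocks = divideUP(n_boxes, threadsPerBlock), and each thread
+    // below writes one 64-bit word whose set bits mark which boxes of the
+    // current column tile are suppressed by its row box. The masks are then
+    // typically walked on the host in score order, roughly:
+    //
+    //   std::vector<unsigned long long> remv(col_blocks, 0);
+    //   for (int i = 0; i < n_boxes; i++) {
+    //     int blk = i / threadsPerBlock, bit = i % threadsPerBlock;
+    //     if (!(remv[blk] & (1ULL << bit))) {
+    //       keep.push_back(i);  // box i survives
+    //       for (int j = blk; j < col_blocks; j++)
+    //         remv[j] |= mask_host[i * col_blocks + j];
+    //     }
+    //   }
+    //
+    // `mask_host`, `keep`, and `remv` are illustrative names only.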
+ __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from + // box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = divideUP(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } + } +} + +#endif diff --git a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh new file mode 100644 index 0000000..011f5f7 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh @@ -0,0 +1,89 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH +#define POINT_IN_BOXES_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, + // cz in the bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / + 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > z_size / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); + return in_flag; +} + +template +__global__ void points_in_boxes_part_forward_cuda_kernel( + int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box DO NOT overlaps params pts: + // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: + // (B, npoints), default -1 + + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= batch_size || pt_idx >= pts_num) return; + + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num + pt_idx; + + T local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; 
k++) { + cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[0] = k; + break; + } + } +} + +template +__global__ void points_in_boxes_all_forward_cuda_kernel( + int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box DO NOT overlaps params pts: + // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: + // (B, npoints), default -1 + + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= batch_size || pt_idx >= pts_num) return; + + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; + + T local_x = 0, local_y = 0; + for (int k = 0; k < boxes_num; k++) { + const int cur_in_flag = + check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[k] = 1; + } + } +} + +#endif // POINT_IN_BOXES_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh new file mode 100644 index 0000000..523d71a --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh @@ -0,0 +1,137 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef PSAMASK_CUDA_KERNEL_CUH +#define PSAMASK_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +// CUDA: grid stride looping +#ifndef CUDA_KERNEL_LOOP +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) +#endif + +template +__global__ void psamask_collect_forward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* mask_data, T* buffer_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + buffer_data[(n * h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)) * + h_feature * w_feature + + h * w_feature + w] = mask_data + [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * + w_feature + + w]; + } + } + } +} + +template +__global__ void psamask_distribute_forward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* mask_data, T* buffer_data) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int 
wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + buffer_data[(n * h_feature * w_feature + h * w_feature + w) * + h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)] = mask_data + [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * + w_feature + + w]; + } + } + } +} + +template +__global__ void psamask_collect_backward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* buffer_diff, T* mask_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + + h) * + w_feature + + w] = buffer_diff[(n * h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)) * + h_feature * w_feature + + h * w_feature + w]; + } + } + } +} + +template +__global__ void psamask_distribute_backward_cuda( + const int nthreads, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask, const T* buffer_diff, T* mask_diff) { + CUDA_KERNEL_LOOP(index, nthreads) { + const int w = index % w_feature; + const int h = (index / w_feature) % h_feature; + const int n = index / w_feature / h_feature; + // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + + h) * + w_feature + + w] = + buffer_diff[(n * h_feature * w_feature + h * w_feature + w) * + h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)]; + } + } + } +} + +#endif // PSAMASK_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh new file mode 100644 index 0000000..8b90ee6 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh @@ -0,0 +1,208 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef ROI_ALIGN_CUDA_KERNEL_CUH +#define ROI_ALIGN_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_WITH_TRT + +/*** Forward ***/ +template +__global__ void roi_align_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, + T* argmax_x, const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not using rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? 
sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + if (pool_mode == 0) { + // We do max pooling inside a bin + T maxval = -FLT_MAX; + T maxidx_y = -1.f, maxidx_x = -1.f; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = + bilinear_interpolate(offset_input, height, width, y, x, index); + if (val > maxval) { + maxval = val; + maxidx_y = y; + maxidx_x = x; + } + } + } + output[index] = maxval; + argmax_y[index] = maxidx_y; + argmax_x[index] = maxidx_x; + } else if (pool_mode == 1) { + // We do average pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + T val = + bilinear_interpolate(offset_input, height, width, y, x, index); + output_val += val; + } + } + output[index] = output_val / count; + } + } +} + +/*** Backward ***/ +template +__global__ void roi_align_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* rois, const T* argmax_y, + const T* argmax_x, T* grad_input, const int pooled_height, + const int pooled_width, const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T grad_output_this_bin = grad_output[index]; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + if (pool_mode == 0) { + T y = argmax_y[index], x = argmax_x[index]; + if (y != -1.f) { + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4); + } + } + } else if (pool_mode == 1) { + // Do not using rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_height / pooled_height)); + int roi_bin_grid_w = + (sampling_ratio > 0) + ? sampling_ratio + : static_cast(ceilf(roi_width / pooled_width)); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_grad_input + y_low * width + x_low, + grad_output_this_bin * w1 / count); + atomicAdd(offset_grad_input + y_low * width + x_high, + grad_output_this_bin * w2 / count); + atomicAdd(offset_grad_input + y_high * width + x_low, + grad_output_this_bin * w3 / count); + atomicAdd(offset_grad_input + y_high * width + x_high, + grad_output_this_bin * w4 / count); + } + } + } + } + } +} + +#endif // ROI_ALIGN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh new file mode 100644 index 0000000..0978f40 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh @@ -0,0 +1,198 @@ +// Modified from +// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH +#define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_WITH_TRT +#include "common_cuda_helper.hpp" +#else // MMCV_WITH_TRT +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_WITH_TRT + +/*** Forward ***/ +template +__global__ void roi_align_rotated_forward_cuda_kernel( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sample_num > 0) + ? sample_num + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void roi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int sample_num, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + if (!aligned) { // for backward-compatibility only + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + } + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sample_num > 0) + ? sample_num + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +#endif // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh new file mode 100644 index 0000000..39c7cb1 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh @@ -0,0 +1,89 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ROI_POOL_CUDA_KERNEL_CUH +#define ROI_POOL_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void roi_pool_forward_cuda_kernel( + const int nthreads, const T* input, const T* rois, T* output, int* argmax, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + // calculate the roi region on feature maps + T roi_x1 = offset_rois[1] * spatial_scale; + T roi_y1 = offset_rois[2] * spatial_scale; + T roi_x2 = (offset_rois[3] + 1) * spatial_scale; + T roi_y2 = (offset_rois[4] + 1) * spatial_scale; + + // force malformed rois to be 1x1 + T roi_w = roi_x2 - roi_x1; + T roi_h = roi_y2 - roi_y1; + if (roi_w <= 0 || roi_h <= 0) continue; + + T bin_size_w = roi_w / static_cast(pooled_width); + T bin_size_h = roi_h / static_cast(pooled_height); + + // the corresponding bin region + int bin_x1 = floorf(static_cast(pw) * bin_size_w + roi_x1); + int bin_y1 = floorf(static_cast(ph) * bin_size_h + roi_y1); + int bin_x2 = ceilf(static_cast(pw + 1) * bin_size_w + roi_x1); + int bin_y2 = ceilf(static_cast(ph + 1) * bin_size_h + roi_y1); + + // add roi offsets and clip to input boundaries + bin_x1 = min(max(bin_x1, 0), width); + bin_y1 = min(max(bin_y1, 0), height); + bin_x2 = min(max(bin_x2, 0), width); + bin_y2 = min(max(bin_y2, 0), height); + bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * 
width; + // Define an empty pooling region to be zero + // If nothing is pooled, argmax = -1 causes nothing to be backprop'd + T max_val = is_empty ? 0 : -FLT_MAX; + int max_idx = -1; + for (int h = bin_y1; h < bin_y2; ++h) { + for (int w = bin_x1; w < bin_x2; ++w) { + int offset = h * width + w; + if (offset_input[offset] > max_val) { + max_val = offset_input[offset]; + max_idx = offset; + } + } + } + output[index] = max_val; + if (argmax != NULL) argmax[index] = max_idx; + } +} + +template +__global__ void roi_pool_backward_cuda_kernel( + const int nthreads, const T* grad_output, const T* rois, const int* argmax, + T* grad_input, const int pooled_height, const int pooled_width, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c) is an element in the pooled output + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + int roi_batch_ind = rois[n * 5]; + T* grad_input_offset = + grad_input + ((roi_batch_ind * channels + c) * height * width); + int argmax_index = argmax[index]; + + if (argmax_index != -1) { + atomicAdd(grad_input_offset + argmax_index, grad_output[index]); + } + } +} + +#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh new file mode 100644 index 0000000..4d56943 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh @@ -0,0 +1,264 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ROIAWARE_POOL3D_CUDA_KERNEL_CUH +#define ROIAWARE_POOL3D_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, + // cz in the bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / + 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > z_size / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); + return in_flag; +} + +template +__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, + int out_x, int out_y, int out_z, + const T *rois, const T *pts, + int *pts_mask) { + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, + // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, + // y_idxs, z_idxs) by binary bit + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + if (pt_idx >= pts_num || box_idx >= boxes_num) return; + + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; + + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); + + pts_mask[0] = -1; + if (cur_in_flag > 0) { + T local_z = pts[2] - rois[2]; + T x_size = 
rois[3], y_size = rois[4], z_size = rois[5]; + + T x_res = x_size / out_x; + T y_res = y_size / out_y; + T z_res = z_size / out_z; + + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int y_idx = int((local_y + y_size / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); + + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); + + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; + + pts_mask[0] = idx_encoding; + } +} + +template +__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, + int max_pts_each_voxel, int out_x, + int out_y, int out_z, + const int *pts_mask, + T *pts_idx_of_voxels) { + // params pts_mask: (N, npoints) 0 or 1 + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + + int box_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (box_idx >= boxes_num) return; + + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; + } + } + } +} + +template +__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const T *pts_feature, + const int *pts_idx_of_voxels, + T *pooled_features, int *argmax) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; + } + } + + if (argmax_idx != -1) { + pooled_features[0] = max_val; + } + argmax[0] = argmax_idx; +} + +template +__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, 
const T *pts_feature, + const int *pts_idx_of_voxels, + T *pooled_features) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + float sum_val = 0; + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + } + + if (total_pts > 0) { + pooled_features[0] = sum_val / total_pts; + } +} + +template +__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, + int out_x, int out_y, int out_z, + const int *argmax, + const T *grad_out, T *grad_in) { + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + if (argmax[0] == -1) return; + + atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); +} + +template +__global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, + int out_x, int out_y, int out_z, + int max_pts_each_voxel, + const int *pts_idx_of_voxels, + const T *grad_out, T *grad_in) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int total_pts = pts_idx_of_voxels[0]; + float cur_grad = 1 / fmaxf(float(total_pts), 1.0); + for (int k = 1; k 
<= total_pts; k++) { + atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, + grad_out[0] * cur_grad); + } +} + +#endif // ROIAWARE_POOL3D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh new file mode 100644 index 0000000..bef665a --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh @@ -0,0 +1,140 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef ROIPOINT_POOL3D_CUDA_KERNEL_CUH +#define ROIPOINT_POOL3D_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, + T &local_x, T &local_y) { + T cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +template +__device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, + T &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the + // bottom center + T x = pt[0], y = pt[1], z = pt[2]; + T cx = box3d[0], cy = box3d[1], cz = box3d[2]; + T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6]; + cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > dz / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) & + (local_y > -dy / 2.0) & (local_y < dy / 2.0); + return in_flag; +} + +template +__global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, + const T *xyz, const T *boxes3d, + int *pts_assign) { + // params xyz: (B, N, 3) + // params boxes3d: (B, M, 7) + // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means + // background points + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + + if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) { + return; + } + int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; + + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, + local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; +} + +__global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, + int sampled_pts_num, const int *pts_assign, + int *pts_idx, int *pooled_empty_flag) { + // params xyz: (B, N, 3) + // params pts_feature: (B, N, C) + // params pts_assign: (B, N) + // params pts_idx: (B, M, 512) + // params pooled_empty_flag: (B, M) + + int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (boxes_idx >= boxes_num) { + return; + } + + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++) { + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { + if (cnt < sampled_pts_num) { + pts_idx[bs_idx * boxes_num * sampled_pts_num + + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } else + break; + } + } + + if (cnt == 0) { + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } else if (cnt < sampled_pts_num) { + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++) { + int duplicate_idx = k % cnt; + int base_offset = + bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + 
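+        // Illustrative example of the wrap-around padding below: if this box
+        // contains only cnt = 3 points with indices {p0, p1, p2} and
+        // sampled_pts_num = 8, slots 3..7 are filled via
+        // duplicate_idx = k % cnt, giving {p0, p1, p2, p0, p1, p2, p0, p1},
+        // so every non-empty box yields a fixed-size index list; boxes with
+        // cnt == 0 were already flagged in pooled_empty_flag above.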
pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + } + } +} + +template +__global__ void roipoint_pool3d_forward( + int batch_size, int pts_num, int boxes_num, int feature_in_len, + int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature, + T *pooled_features, int *pooled_empty_flag) { + // params xyz: (B, N, 3) + // params pts_idx: (B, M, 512) + // params pts_feature: (B, N, C) + // params pooled_features: (B, M, 512, 3+C) + // params pooled_empty_flag: (B, M) + + int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + int bs_idx = blockIdx.z; + + if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || + bs_idx >= batch_size) { + return; + } + + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) { + return; + } + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + + box_idx * sampled_pts_num + sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = + xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = + bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + memcpy(pooled_features + dst_feature_offset + 3, + pts_feature + src_feature_offset, feature_in_len * sizeof(T)); +} + +#endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh new file mode 100644 index 0000000..c375f24 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh @@ -0,0 +1,183 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef SCATTER_POINTS_CUDA_KERNEL_CUH +#define SCATTER_POINTS_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; +int const maxGridDim = 50000; + +__device__ __forceinline__ static void reduceMax(float *address, float val) { + int *address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, + __float_as_int(fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old || __int_as_float(old) < val); +} + +__device__ __forceinline__ static void reduceMax(double *address, double val) { + unsigned long long *address_as_ull = + reinterpret_cast(address); + unsigned long long old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS( + address_as_ull, assumed, + __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); + } while (assumed != old || __longlong_as_double(old) < val); +} + +// get rid of meaningless warnings when compiling host code +#ifdef HIP_DIFF +__device__ __forceinline__ static void reduceAdd(float *address, float val) { + atomicAdd(address, val); +} +__device__ __forceinline__ static void reduceAdd(double *address, double val) { + atomicAdd(address, val); +} +#else +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ static void reduceAdd(float *address, float val) { +#if (__CUDA_ARCH__ < 200) +#ifdef _MSC_VER +#pragma message( \ + "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32") +#else +#warning \ + "compute capability lower than 2.x. 
fall back to use CAS version of atomicAdd for float32" +#endif + int *address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + do { + assumed = old; + old = atomicCAS(address_as_i, assumed, + __float_as_int(val + __int_as_float(assumed))); + } while (assumed != old); +#else + atomicAdd(address, val); +#endif +} + +__device__ __forceinline__ static void reduceAdd(double *address, double val) { +#if (__CUDA_ARCH__ < 600) +#ifdef _MSC_VER +#pragma message( \ + "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64") +#else +#warning \ + "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64" +#endif + unsigned long long *address_as_ull = + reinterpret_cast(address); + unsigned long long old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + __longlong_as_double(assumed))); + } while (assumed != old); +#else + atomicAdd(address, val); +#endif +} +#endif // __CUDA_ARCH__ +#endif // HIP_DIFF + +template +__global__ void feats_reduce_kernel( + const T *feats, const int32_t *coors_map, + T *reduced_feats, // shall be 0 at initialization + const int num_input, const int num_feats, const reduce_t reduce_type) { + CUDA_1D_KERNEL_LOOP(x, num_input) { + int32_t reduce_to = coors_map[x]; + if (reduce_to == -1) continue; + + const T *feats_offset = feats + x * num_feats; + T *reduced_feats_offset = reduced_feats + reduce_to * num_feats; + if (reduce_type == reduce_t::MAX) { + for (int i = 0; i < num_feats; i++) { + reduceMax(&reduced_feats_offset[i], feats_offset[i]); + } + } else { + for (int i = 0; i < num_feats; i++) { + reduceAdd(&reduced_feats_offset[i], feats_offset[i]); + } + } + } +} + +template +__global__ void add_reduce_traceback_grad_kernel( + T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map, + const int32_t *reduce_count, const int num_input, const int num_feats, + const reduce_t reduce_type) { + CUDA_1D_KERNEL_LOOP(x, num_input) { + int32_t reduce_to = coors_map[x]; + if (reduce_to == -1) { + continue; + } + + const int input_offset = x * num_feats; + T *grad_feats_offset = grad_feats + input_offset; + const int reduced_offset = reduce_to * num_feats; + const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; + + if (reduce_type == reduce_t::SUM) { + for (int i = 0; i < num_feats; i++) { + grad_feats_offset[i] = grad_reduced_feats_offset[i]; + } + } else if (reduce_type == reduce_t::MEAN) { + for (int i = 0; i < num_feats; i++) { + grad_feats_offset[i] = grad_reduced_feats_offset[i] / + static_cast(reduce_count[reduce_to]); + } + } + } +} + +template +__global__ void max_reduce_traceback_scatter_idx_kernel( + const T *feats, const T *reduced_feats, int32_t *reduce_from, + const int32_t *coors_map, const int num_input, const int num_feats) { + CUDA_1D_KERNEL_LOOP(x, num_input) { + int32_t reduce_to = coors_map[x]; + + const int input_offset = x * num_feats; + const T *feats_offset = feats + input_offset; + + if (reduce_to == -1) { + continue; + } + + const int reduced_offset = reduce_to * num_feats; + const T *reduced_feats_offset = reduced_feats + reduced_offset; + int32_t *reduce_from_offset = reduce_from + reduced_offset; + + for (int i = 0; i < num_feats; i++) { + if (feats_offset[i] == reduced_feats_offset[i]) { + atomicMin(&reduce_from_offset[i], static_cast(x)); + } + } + } +} + +template +__global__ void max_reduce_scatter_grad_kernel(T *grad_feats, + const T *grad_reduced_feats, + 
const int32_t *reduce_from, + const int num_reduced, + const int num_feats) { + CUDA_1D_KERNEL_LOOP(x, num_reduced) { + const int reduced_offset = x * num_feats; + const int32_t *scatter_to_offset = reduce_from + reduced_offset; + const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; + + for (int i = 0; i < num_feats; i++) { + grad_feats[scatter_to_offset[i] * num_feats + i] = + grad_reduced_feats_offset[i]; + } + } +} + +#endif // SCATTER_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh new file mode 100644 index 0000000..d133589 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh @@ -0,0 +1,67 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH +#define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void sigmoid_focal_loss_forward_cuda_kernel( + const int nthreads, const T* input, const int64_t* target, const T* weight, + T* output, const T gamma, const T alpha, const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / num_classes; + int c = index % num_classes; + + int64_t t = target[n]; + T flag_p = (t == c); + T flag_n = (t != c); + + // p = sigmoid(x) = 1. / 1. + expf(-x) + T p = (T)1. / ((T)1. + expf(-input[index])); + + // (1 - p)**gamma * log(p) + T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN)); + // p**gamma * log(1 - p) + T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN)); + + output[index] = (T)0.; + output[index] += -flag_p * alpha * term_p; + output[index] += -flag_n * ((T)1. - alpha) * term_n; + if (weight != NULL) { + output[index] *= weight[t]; + } + } +} + +template +__global__ void sigmoid_focal_loss_backward_cuda_kernel( + const int nthreads, const T* input, const int64_t* target, const T* weight, + T* grad_input, const T gamma, const T alpha, const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / num_classes; + int c = index % num_classes; + + int64_t t = target[n]; + T flag_p = (t == c); + T flag_n = (t != c); + + // p = sigmoid(x) = 1. / 1. + expf(-x) + T p = (T)1. / ((T)1. + exp(-input[index])); + + // (1 - p)**gamma * (1 - p - gamma*p*log(p)) + T term_p = pow(((T)1. - p), gamma) * + ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN)))); + // p**gamma * (gamma * (1 - p) * log(1 - p) - p) + T term_n = pow(p, gamma) * + (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p); + + grad_input[index] = (T)0.; + grad_input[index] += -flag_p * alpha * term_p; + grad_input[index] += -flag_n * ((T)1. - alpha) * term_n; + if (weight != NULL) { + grad_input[index] *= weight[t]; + } + } +} + +#endif // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh new file mode 100644 index 0000000..64299b9 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh @@ -0,0 +1,68 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH +#define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void softmax_focal_loss_forward_cuda_kernel( + const int nthreads, const T* softmax, const int64_t* target, + const T* weight, T* output, const T gamma, const T alpha, + const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int64_t label = target[index]; + T pred = softmax[index * num_classes + label]; + + if (label >= 0) { + output[index] = + -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN)); + } else { + output[index] = 0; + } + if (weight != NULL) { + output[index] *= weight[label]; + } + } +} + +template +__global__ void softmax_focal_loss_backward_cuda1_kernel( + const int nthreads, const T* softmax, const int64_t* target, + const T* weight, T* buff, const T gamma, const T alpha, + const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int64_t label = target[index]; + T pred = softmax[index * num_classes + label]; + + if (label >= 0) { + buff[index] = alpha * (-pow((T)1. - pred, gamma) + + gamma * pow((T)1. - pred, gamma - 1) * pred * + log(max(pred, (T)FLT_MIN))); + } else { + buff[index] = 0; + } + if (weight != NULL) { + buff[index] *= weight[label]; + } + } +} + +template +__global__ void softmax_focal_loss_backward_cuda2_kernel( + const int nthreads, const T* softmax, const int64_t* target, const T* buff, + T* grad_input, const int num_classes) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int n = index / num_classes; + int c = index % num_classes; + int64_t label = target[n]; + + if (label >= 0) { + T flag = (label == c ? (T)1. : (T)0.); + grad_input[index] = buff[n] * (flag - softmax[index]); + } else { + grad_input[index] = 0; + } + } +} + +#endif // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh new file mode 100644 index 0000000..e16e637 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh @@ -0,0 +1,327 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef SYNCBN_CUDA_KERNEL_CUH +#define SYNCBN_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean, + int num, int channels, + int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer[tid] += input[index]; + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + mean[c] = buffer[0] / total; + } +} + +template <> +__global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input, + float *mean, int num, + int channels, int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer[tid] += static_cast(input[index]); + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + mean[c] = buffer[0] / total; + } +} + +template +__global__ void sync_bn_forward_var_cuda_kernel(const T *input, + const float *mean, float *var, + int num, int channels, + int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + float td = input[index] - mean[c]; + buffer[tid] += td * td; + } + __syncthreads(); + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + var[c] = buffer[0] / total; + } +} + +template <> +__global__ void sync_bn_forward_var_cuda_kernel(const phalf *input, + const float *mean, float *var, + int num, int channels, + int spatial) { + __shared__ float buffer[THREADS_PER_BLOCK]; + int tid = threadIdx.x; + int c = blockIdx.x; + buffer[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + float td = static_cast(input[index]) - mean[c]; + buffer[tid] += td * td; + } + __syncthreads(); + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer[tid] += buffer[tid + s]; + } + __syncthreads(); + } + int total = num * spatial; + if (tid == 0) { + var[c] = buffer[0] / total; + } +} + +template +__global__ void sync_bn_forward_output_cuda_kernel( + const T *input, const float *mean, const float *var, float *running_mean, + float *running_var, const float *weight, const float *bias, float *norm, + float *std, T *output, int num, int channels, int spatial, float eps, + float momentum, int group_size) { + int tid = threadIdx.x; + int c = blockIdx.x; + float mean_value = mean[c]; + float std_value = sqrt(var[c] + eps); + + if (weight != nullptr) { + float weight_value = weight[c]; + float bias_value = bias[c]; + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + 
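+          // Store the normalized value, then apply the per-channel scale and shift (weight * x_hat + bias).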
norm[index] = (input[index] - mean_value) / std_value; + output[index] = norm[index] * weight_value + bias_value; + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = + (input[index] - mean_value) / std_value * weight_value + bias_value; + } + } + } else { + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = norm[index] = (input[index] - mean_value) / std_value; + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = (input[index] - mean_value) / std_value; + } + } + } + if (tid == 0) { + if (std != nullptr) std[c] = std_value; + if (running_mean != nullptr) { + running_mean[c] = + momentum * mean_value + (1 - momentum) * running_mean[c]; + int count = num * spatial * group_size; + float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c]; + running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; + } + } +} + +template <> +__global__ void sync_bn_forward_output_cuda_kernel( + const phalf *input, const float *mean, const float *var, + float *running_mean, float *running_var, const float *weight, + const float *bias, float *norm, float *std, phalf *output, int num, + int channels, int spatial, float eps, float momentum, int group_size) { + int tid = threadIdx.x; + int c = blockIdx.x; + float mean_value = mean[c]; + float std_value = sqrt(var[c] + eps); + if (weight != nullptr) { + float weight_value = weight[c]; + float bias_value = bias[c]; + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + norm[index] = + (static_cast(input[index]) - mean_value) / std_value; + output[index] = + static_cast(norm[index] * weight_value + bias_value); + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = + static_cast((static_cast(input[index]) - mean_value) / + std_value * weight_value + + bias_value); + } + } + } else { + if (norm != nullptr) { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + norm[index] = + (static_cast(input[index]) - mean_value) / std_value; + output[index] = static_cast(norm[index]); + } + } else { + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = + (i / spatial) * channels * spatial + c * spatial + i % spatial; + output[index] = static_cast( + (static_cast(input[index]) - mean_value) / std_value); + } + } + } + if (tid == 0) { + if (std != nullptr) std[c] = std_value; + if (running_mean != nullptr) { + running_mean[c] = + momentum * mean_value + (1 - momentum) * running_mean[c]; + int count = num * spatial * group_size; + float var_unbias = count > 1 ? 
var[c] * count / (count - 1) : var[c]; + running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; + } + } +} + +template +__global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output, + const float *norm, + float *grad_weight, + float *grad_bias, int num, + int channels, int spatial) { + __shared__ float buffer1[THREADS_PER_BLOCK]; + __shared__ float buffer2[THREADS_PER_BLOCK]; + + int tid = threadIdx.x; + int c = blockIdx.x; + buffer1[tid] = buffer2[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer1[tid] += grad_output[index] * norm[index]; + buffer2[tid] += grad_output[index]; + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer1[tid] += buffer1[tid + s]; + buffer2[tid] += buffer2[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + grad_weight[c] = buffer1[0]; + grad_bias[c] = buffer2[0]; + } +} + +template <> +__global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output, + const float *norm, + float *grad_weight, + float *grad_bias, int num, + int channels, int spatial) { + __shared__ float buffer1[THREADS_PER_BLOCK]; + __shared__ float buffer2[THREADS_PER_BLOCK]; + + int tid = threadIdx.x; + int c = blockIdx.x; + buffer1[tid] = buffer2[tid] = 0; + for (int i = tid; i < num * spatial; i += blockDim.x) { + int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; + buffer1[tid] += static_cast(grad_output[index]) * norm[index]; + buffer2[tid] += static_cast(grad_output[index]); + } + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + buffer1[tid] += buffer1[tid + s]; + buffer2[tid] += buffer2[tid + s]; + } + __syncthreads(); + } + if (tid == 0) { + grad_weight[c] = buffer1[0]; + grad_bias[c] = buffer2[0]; + } +} + +template +__global__ void sync_bn_backward_data_cuda_kernel( + int output_size, const T *grad_output, const float *weight, + const float *grad_weight, const float *grad_bias, const float *norm, + const float *std, T *grad_input, int num, int channels, int spatial) { + int factor = num * spatial; + CUDA_1D_KERNEL_LOOP(index, output_size) { + int c = (index / spatial) % channels; + grad_input[index] = + weight[c] * + (grad_output[index] - + (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / + std[c]; + } +} + +template <> +__global__ void sync_bn_backward_data_cuda_kernel( + int output_size, const phalf *grad_output, const float *weight, + const float *grad_weight, const float *grad_bias, const float *norm, + const float *std, phalf *grad_input, int num, int channels, int spatial) { + int factor = num * spatial; + CUDA_1D_KERNEL_LOOP(index, output_size) { + int c = (index / spatial) % channels; + grad_input[index] = static_cast( + weight[c] * + (static_cast(grad_output[index]) - + (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / + std[c]); + } +} + +#endif // SYNCBN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh new file mode 100644 index 0000000..1346b40 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef THREE_INTERPOLATE_CUDA_KERNEL_CUH +#define THREE_INTERPOLATE_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void three_interpolate_forward_cuda_kernel( + int b, int c, int m, int n, const T *points, const int *__restrict__ idx, + const T *weight, T *out) { + // points: (B, C, M) + // idx: (B, N, 3) + // weight: (B, N, 3) + // output: + // out: (B, C, N) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + + weight += bs_idx * n * 3 + pt_idx * 3; + points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + out += bs_idx * c * n + c_idx * n; + + out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + + weight[2] * points[idx[2]]; +} + +template +__global__ void three_interpolate_backward_cuda_kernel( + int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx, + const T *weight, T *grad_points) { + // grad_out: (B, C, N) + // weight: (B, N, 3) + // output: + // grad_points: (B, C, M) + + int bs_idx = blockIdx.z; + int c_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + + grad_out += bs_idx * c * n + c_idx * n + pt_idx; + weight += bs_idx * n * 3 + pt_idx * 3; + grad_points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + + atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); + atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); + atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); +} + +#endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh new file mode 100644 index 0000000..23fa091 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh @@ -0,0 +1,62 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef THREE_NN_CUDA_KERNEL_CUH +#define THREE_NN_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void three_nn_forward_cuda_kernel(int b, int n, int m, + const T *unknown, const T *known, + T *dist2, int *__restrict__ idx) { + // unknown: (B, N, 3) + // known: (B, M, 3) + // output: + // dist2: (B, N, 3) + // idx: (B, N, 3) + + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= b || pt_idx >= n) return; + + unknown += bs_idx * n * 3 + pt_idx * 3; + known += bs_idx * m * 3; + dist2 += bs_idx * n * 3 + pt_idx * 3; + idx += bs_idx * n * 3 + pt_idx * 3; + + T ux = unknown[0]; + T uy = unknown[1]; + T uz = unknown[2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + T x = known[k * 3 + 0]; + T y = known[k * 3 + 1]; + T z = known[k * 3 + 2]; + T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } + } + dist2[0] = best1; + dist2[1] = best2; + dist2[2] = best3; + idx[0] = besti1; + idx[1] = besti2; + idx[2] = besti3; +} + +#endif // THREE_NN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh new file mode 100644 index 0000000..8b7112d --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh @@ -0,0 +1,57 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef TIN_SHIFT_CUDA_KERNEL_CUH +#define TIN_SHIFT_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +template +__global__ void tin_shift_forward_cuda_kernel( + const int nthreads, const T* input, const int* shift, T* output, + const int batch_size, const int channels, const int t_size, + const int hw_size, const int group_size, const int group_channel) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int hw_index = index % hw_size; + const int j = (index / hw_size) % channels; + + const int n_index = (index / hw_size / channels) % batch_size; + int group_id = j / group_channel; + int t_shift = shift[n_index * group_size + group_id]; + int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; + for (int i = 0; i < t_size; i++) { + int now_t = i + t_shift; + int data_id = i * hw_size * channels + offset; + if (now_t < 0 || now_t >= t_size) { + continue; + } + int out_id = now_t * hw_size * channels + offset; + output[out_id] = input[data_id]; + } + } +} + +template +__global__ void tin_shift_backward_cuda_kernel( + const int nthreads, const T* input, const int* shift, T* output, + const int batch_size, const int channels, const int t_size, + const int hw_size, const int group_size, const int group_channel) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + const int hw_index = index % hw_size; + const int j = (index / hw_size) % channels; + + const int n_index = (index / hw_size / channels) % batch_size; + int group_id = j / group_channel; + int t_shift = shift[n_index * group_size + group_id]; + int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; + for (int i = 0; i < t_size; i++) { + int now_t = i + t_shift; + int data_id = i * hw_size * channels + offset; + if (now_t < 0 || now_t >= t_size) { + continue; + } + int out_id = now_t * hw_size * channels 
+ offset; + output[out_id] = input[data_id]; + } + } +} + +#endif // TIN_SHIFT_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh new file mode 100644 index 0000000..f817662 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh @@ -0,0 +1,165 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#ifndef VOXELIZATION_CUDA_KERNEL_CUH +#define VOXELIZATION_CUDA_KERNEL_CUH + +#include "pytorch_cuda_helper.hpp" + +typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; + +template +__global__ void dynamic_voxelize_kernel( + const T* points, T_int* coors, const float voxel_x, const float voxel_y, + const float voxel_z, const float coors_x_min, const float coors_y_min, + const float coors_z_min, const float coors_x_max, const float coors_y_max, + const float coors_z_max, const int grid_x, const int grid_y, + const int grid_z, const int num_points, const int num_features, + const int NDim) { + // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; + CUDA_1D_KERNEL_LOOP(index, num_points) { + // To save some computation + auto points_offset = points + index * num_features; + auto coors_offset = coors + index * NDim; + int c_x = floor((points_offset[0] - coors_x_min) / voxel_x); + if (c_x < 0 || c_x >= grid_x) { + coors_offset[0] = -1; + continue; + } + + int c_y = floor((points_offset[1] - coors_y_min) / voxel_y); + if (c_y < 0 || c_y >= grid_y) { + coors_offset[0] = -1; + coors_offset[1] = -1; + continue; + } + + int c_z = floor((points_offset[2] - coors_z_min) / voxel_z); + if (c_z < 0 || c_z >= grid_z) { + coors_offset[0] = -1; + coors_offset[1] = -1; + coors_offset[2] = -1; + } else { + coors_offset[0] = c_z; + coors_offset[1] = c_y; + coors_offset[2] = c_x; + } + } +} + +template +__global__ void assign_point_to_voxel(const int nthreads, const T* points, + T_int* point_to_voxelidx, + T_int* coor_to_voxelidx, T* voxels, + const int max_points, + const int num_features, + const int num_points, const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; + int index = thread_idx / num_features; + + int num = point_to_voxelidx[index]; + int voxelidx = coor_to_voxelidx[index]; + if (num > -1 && voxelidx > -1) { + auto voxels_offset = + voxels + voxelidx * max_points * num_features + num * num_features; + + int k = thread_idx % num_features; + voxels_offset[k] = points[thread_idx]; + } + } +} + +template +__global__ void assign_voxel_coors(const int nthreads, T_int* coor, + T_int* point_to_voxelidx, + T_int* coor_to_voxelidx, T_int* voxel_coors, + const int num_points, const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; + // if (index >= num_points) return; + int index = thread_idx / NDim; + int num = point_to_voxelidx[index]; + int voxelidx = coor_to_voxelidx[index]; + if (num == 0 && voxelidx > -1) { + auto coors_offset = voxel_coors + voxelidx * NDim; + int k = thread_idx % NDim; + coors_offset[k] = coor[thread_idx]; + } + } +} + +template +__global__ void point_to_voxelidx_kernel(const T_int* coor, + T_int* point_to_voxelidx, + T_int* point_to_pointidx, + const int max_points, + const int max_voxels, + const int num_points, const int NDim) { + CUDA_1D_KERNEL_LOOP(index, num_points) { + auto coor_offset = coor + index * NDim; + // skip invalid points + if ((index >= num_points) || (coor_offset[0] == -1)) return; + + int num = 0; + 
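+    // Cache this point's voxel coordinate, then count how many earlier valid points fall into the same voxel; the first match is recorded in point_to_pointidx.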
int coor_x = coor_offset[0]; + int coor_y = coor_offset[1]; + int coor_z = coor_offset[2]; + // only calculate the coors before this coor[index] + for (int i = 0; i < index; ++i) { + auto prev_coor = coor + i * NDim; + if (prev_coor[0] == -1) continue; + + // Find all previous points that have the same coors + // if find the same coor, record it + if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) && + (prev_coor[2] == coor_z)) { + num++; + if (num == 1) { + // point to the same coor that first show up + point_to_pointidx[index] = i; + } else if (num >= max_points) { + // out of boundary + return; + } + } + } + if (num == 0) { + point_to_pointidx[index] = index; + } + if (num < max_points) { + point_to_voxelidx[index] = num; + } + } +} + +template +__global__ void determin_voxel_num( + // const T_int* coor, + T_int* num_points_per_voxel, T_int* point_to_voxelidx, + T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num, + const int max_points, const int max_voxels, const int num_points) { + // only calculate the coors before this coor[index] + for (int i = 0; i < num_points; ++i) { + int point_pos_in_voxel = point_to_voxelidx[i]; + // record voxel + if (point_pos_in_voxel == -1) { + // out of max_points or invalid point + continue; + } else if (point_pos_in_voxel == 0) { + // record new voxel + int voxelidx = voxel_num[0]; + if (voxel_num[0] >= max_voxels) continue; + voxel_num[0] += 1; + coor_to_voxelidx[i] = voxelidx; + num_points_per_voxel[voxelidx] = 1; + } else { + int point_idx = point_to_pointidx[i]; + int voxelidx = coor_to_voxelidx[point_idx]; + if (voxelidx != -1) { + coor_to_voxelidx[i] = voxelidx; + num_points_per_voxel[voxelidx] += 1; + } + } + } +} + +#endif // VOXELIZATION_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp new file mode 100644 index 0000000..c7f9f35 --- /dev/null +++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -0,0 +1,24 @@ +#ifndef PYTORCH_CPP_HELPER +#define PYTORCH_CPP_HELPER +#include + +#include + +using namespace at; + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CPU(x) \ + TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_CUDA_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) +#define CHECK_CPU_INPUT(x) \ + CHECK_CPU(x); \ + CHECK_CONTIGUOUS(x) + +#endif // PYTORCH_CPP_HELPER diff --git a/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp b/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp new file mode 100644 index 0000000..9869b53 --- /dev/null +++ b/mmcv/ops/csrc/common/pytorch_cuda_helper.hpp @@ -0,0 +1,19 @@ +#ifndef PYTORCH_CUDA_HELPER +#define PYTORCH_CUDA_HELPER + +#include +#include +#include + +#include +#include + +#include "common_cuda_helper.hpp" + +using at::Half; +using at::Tensor; +using phalf = at::Half; + +#define __PHALF(x) (x) + +#endif // PYTORCH_CUDA_HELPER diff --git a/mmcv/ops/csrc/common/pytorch_device_registry.hpp b/mmcv/ops/csrc/common/pytorch_device_registry.hpp new file mode 100644 index 0000000..2a32b72 --- /dev/null +++ b/mmcv/ops/csrc/common/pytorch_device_registry.hpp @@ -0,0 +1,141 @@ +#ifndef PYTORCH_DEVICE_REGISTRY_H +#define PYTORCH_DEVICE_REGISTRY_H + +// Using is recommended in the official documentation in +// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. 
+// However, we use for compatibility with CUDA 9.0 +// Read https://github.com/pytorch/extension-cpp/issues/35 for more details. +#include + +#include +#include +#include +#include + +inline std::string GetDeviceStr(const at::Device& device) { + std::string str = DeviceTypeName(device.type(), true); + if (device.has_index()) { + str.push_back(':'); + str.append(std::to_string(device.index())); + } + return str; +} + +// Registry +template +class DeviceRegistry; + +template +class DeviceRegistry { + public: + using FunctionType = Ret (*)(Args...); + static const int MAX_DEVICE_TYPES = + int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); + + void Register(at::DeviceType device, FunctionType function) { + funcs_[int8_t(device)] = function; + } + + FunctionType Find(at::DeviceType device) const { + return funcs_[int8_t(device)]; + } + + static DeviceRegistry& instance() { + static DeviceRegistry inst; + return inst; + } + + private: + DeviceRegistry() { + for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) { + funcs_[i] = nullptr; + } + }; + FunctionType funcs_[MAX_DEVICE_TYPES]; +}; + +// get device of first tensor param + +template , at::Tensor>::value, + bool> = true> +at::Device GetFirstTensorDevice(T&& t, Args&&... args) { + return std::forward(t).device(); +} +template , at::Tensor>::value, + bool> = true> +at::Device GetFirstTensorDevice(T&& t, Args&&... args) { + return GetFirstTensorDevice(std::forward(args)...); +} + +// check device consistency + +inline std::pair CheckDeviceConsistency( + const at::Device& device, int index) { + return {index, device}; +} + +template , at::Tensor>::value, + bool> = true> +std::pair CheckDeviceConsistency(const at::Device& device, + int index, T&& t, + Args&&... args); + +template , at::Tensor>::value, + bool> = true> +std::pair CheckDeviceConsistency(const at::Device& device, + int index, T&& t, + Args&&... args) { + auto new_device = std::forward(t).device(); + if (new_device.type() != device.type() || + new_device.index() != device.index()) { + return {index, new_device}; + } + return CheckDeviceConsistency(device, index + 1, std::forward(args)...); +} + +template < + typename T, typename... Args, + std::enable_if_t, at::Tensor>::value, bool>> +std::pair CheckDeviceConsistency(const at::Device& device, + int index, T&& t, + Args&&... args) { + return CheckDeviceConsistency(device, index + 1, std::forward(args)...); +} + +// dispatch + +template +auto Dispatch(const R& registry, const char* name, Args&&... args) { + auto device = GetFirstTensorDevice(std::forward(args)...); + auto inconsist = + CheckDeviceConsistency(device, 0, std::forward(args)...); + TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ", + inconsist.first, + ", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(), + " vs ", GetDeviceStr(device).c_str(), "\n") + auto f_ptr = registry.Find(device.type()); + TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ", + GetDeviceStr(device).c_str(), " not found.\n") + return f_ptr(std::forward(args)...); +} + +// helper macro + +#define DEVICE_REGISTRY(key) DeviceRegistry::instance() + +#define REGISTER_DEVICE_IMPL(key, device, value) \ + struct key##_##device##_registerer { \ + key##_##device##_registerer() { \ + DEVICE_REGISTRY(key).Register(at::k##device, value); \ + } \ + }; \ + static key##_##device##_registerer _##key##_##device##_registerer; + +#define DISPATCH_DEVICE_IMPL(key, ...) 
\ + Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__) + +#endif // PYTORCH_DEVICE_REGISTRY diff --git a/mmcv/ops/csrc/pytorch/assign_score_withk.cpp b/mmcv/ops/csrc/pytorch/assign_score_withk.cpp new file mode 100644 index 0000000..9076277 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/assign_score_withk.cpp @@ -0,0 +1,42 @@ +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O, + int aggregate, const Tensor& points, + const Tensor& centers, + const Tensor& scores, + const Tensor& knn_idx, Tensor& output) { + DISPATCH_DEVICE_IMPL(assign_score_withk_forward_impl, B, N0, N1, M, K, O, + aggregate, points, centers, scores, knn_idx, output); +} + +void assign_score_withk_backward_impl( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores) { + DISPATCH_DEVICE_IMPL(assign_score_withk_backward_impl, B, N0, N1, M, K, O, + aggregate, grad_out, points, centers, scores, knn_idx, + grad_points, grad_centers, grad_scores); +} + +void assign_score_withk_forward(const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, + Tensor& output, int B, int N0, int N1, int M, + int K, int O, int aggregate) { + assign_score_withk_forward_impl(B, N0, N1, M, K, O, aggregate, points, + centers, scores, knn_idx, output); +} + +void assign_score_withk_backward(const Tensor& grad_out, const Tensor& points, + const Tensor& centers, const Tensor& scores, + const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores, + int B, int N0, int N1, int M, int K, int O, + int aggregate) { + assign_score_withk_backward_impl(B, N0, N1, M, K, O, aggregate, grad_out, + points, centers, scores, knn_idx, + grad_points, grad_centers, grad_scores); +} diff --git a/mmcv/ops/csrc/pytorch/ball_query.cpp b/mmcv/ops/csrc/pytorch/ball_query.cpp new file mode 100644 index 0000000..1c9e7a2 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/ball_query.cpp @@ -0,0 +1,20 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void ball_query_forward_impl(int b, int n, int m, float min_radius, + float max_radius, int nsample, + const Tensor new_xyz, const Tensor xyz, + Tensor idx) { + DISPATCH_DEVICE_IMPL(ball_query_forward_impl, b, n, m, min_radius, max_radius, + nsample, new_xyz, xyz, idx); +} + +void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor, + Tensor idx_tensor, int b, int n, int m, + float min_radius, float max_radius, int nsample) { + ball_query_forward_impl(b, n, m, min_radius, max_radius, nsample, + new_xyz_tensor, xyz_tensor, idx_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp b/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp new file mode 100644 index 0000000..187216f --- /dev/null +++ b/mmcv/ops/csrc/pytorch/bbox_overlaps.cpp @@ -0,0 +1,14 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset) { + DISPATCH_DEVICE_IMPL(bbox_overlaps_impl, bboxes1, bboxes2, ious, mode, + aligned, offset); +} + +void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset) { + bbox_overlaps_impl(bboxes1, bboxes2, ious, mode, aligned, offset); +} diff --git a/mmcv/ops/csrc/pytorch/border_align.cpp b/mmcv/ops/csrc/pytorch/border_align.cpp new file mode 100644 index 0000000..565de68 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/border_align.cpp @@ -0,0 +1,30 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void border_align_forward_impl(const Tensor &input, const Tensor &boxes, + Tensor output, Tensor argmax_idx, + const int pool_size) { + DISPATCH_DEVICE_IMPL(border_align_forward_impl, input, boxes, output, + argmax_idx, pool_size); +} + +void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes, + const Tensor &argmax_idx, Tensor grad_input, + const int pool_size) { + DISPATCH_DEVICE_IMPL(border_align_backward_impl, grad_output, boxes, + argmax_idx, grad_input, pool_size); +} + +void border_align_forward(const Tensor &input, const Tensor &boxes, + Tensor output, Tensor argmax_idx, + const int pool_size) { + border_align_forward_impl(input, boxes, output, argmax_idx, pool_size); +} + +void border_align_backward(const Tensor &grad_output, const Tensor &boxes, + const Tensor &argmax_idx, Tensor grad_input, + const int pool_size) { + border_align_backward_impl(grad_output, boxes, argmax_idx, grad_input, + pool_size); +} diff --git a/mmcv/ops/csrc/pytorch/box_iou_rotated.cpp b/mmcv/ops/csrc/pytorch/box_iou_rotated.cpp new file mode 100644 index 0000000..a2a4e09 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/box_iou_rotated.cpp @@ -0,0 +1,19 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned) { + DISPATCH_DEVICE_IMPL(box_iou_rotated_impl, boxes1, boxes2, ious, mode_flag, + aligned); +} + +// Interface for Python +// inline is needed to prevent multiple function definitions when this header is +// included by different cpps +void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned) { + box_iou_rotated_impl(boxes1, boxes2, ious, mode_flag, aligned); +} diff --git a/mmcv/ops/csrc/pytorch/carafe.cpp b/mmcv/ops/csrc/pytorch/carafe.cpp new file mode 100644 index 0000000..a563aed --- /dev/null +++ b/mmcv/ops/csrc/pytorch/carafe.cpp @@ -0,0 +1,38 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures, + Tensor routput, Tensor rmasks, Tensor output, + int kernel_size, int group_size, int scale_factor) { + DISPATCH_DEVICE_IMPL(carafe_forward_impl, features, masks, rfeatures, routput, + rmasks, output, kernel_size, group_size, scale_factor); +} + +void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, + Tensor rbottom_grad, Tensor rmask_grad, + Tensor bottom_grad, Tensor mask_grad, int kernel_size, + int group_size, int scale_factor) { + DISPATCH_DEVICE_IMPL(carafe_backward_impl, top_grad, rfeatures, masks, + rtop_grad, rbottom_grad_hs, rbottom_grad, rmask_grad, + bottom_grad, mask_grad, kernel_size, group_size, + scale_factor); +} + +void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures, + Tensor routput, Tensor rmasks, Tensor output, + int kernel_size, int group_size, int scale_factor) { + carafe_forward_impl(features, masks, rfeatures, routput, rmasks, output, + kernel_size, group_size, scale_factor); +} + +void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, + Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad, + Tensor mask_grad, int kernel_size, int group_size, + int scale_factor) { + carafe_backward_impl(top_grad, rfeatures, masks, rtop_grad, rbottom_grad_hs, + rbottom_grad, rmask_grad, bottom_grad, mask_grad, + kernel_size, group_size, scale_factor); +} diff --git a/mmcv/ops/csrc/pytorch/carafe_naive.cpp b/mmcv/ops/csrc/pytorch/carafe_naive.cpp new file mode 100644 index 0000000..6e8917a --- /dev/null +++ b/mmcv/ops/csrc/pytorch/carafe_naive.cpp @@ -0,0 +1,32 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output, + int kernel_size, int group_size, + int scale_factor) { + DISPATCH_DEVICE_IMPL(carafe_naive_forward_impl, features, masks, output, + kernel_size, group_size, scale_factor); +} + +void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks, + Tensor bottom_grad, Tensor mask_grad, + int kernel_size, int group_size, + int scale_factor) { + DISPATCH_DEVICE_IMPL(carafe_naive_backward_impl, top_grad, features, masks, + bottom_grad, mask_grad, kernel_size, group_size, + scale_factor); +} + +void carafe_naive_forward(Tensor features, Tensor masks, Tensor output, + int kernel_size, int group_size, int scale_factor) { + carafe_naive_forward_impl(features, masks, output, kernel_size, group_size, + scale_factor); +} + +void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks, + Tensor bottom_grad, Tensor mask_grad, + int kernel_size, int group_size, int scale_factor) { + carafe_naive_backward_impl(top_grad, features, masks, bottom_grad, mask_grad, + kernel_size, group_size, scale_factor); +} diff --git a/mmcv/ops/csrc/pytorch/contour_expand.cpp b/mmcv/ops/csrc/pytorch/contour_expand.cpp new file mode 100755 index 0000000..7639ae5 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/contour_expand.cpp @@ -0,0 +1,112 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// It is modified from https://github.com/whai362/PSENet +#include +#include + +#include "pytorch_cpp_helper.hpp" + +using namespace std; + +class Point2d { + public: + int x; + int y; + + Point2d() : x(0), y(0) {} + Point2d(int _x, int _y) : x(_x), y(_y) {} +}; + +void kernel_dilate(const uint8_t *data, IntArrayRef data_shape, + const int *label_map, int &label_num, int &min_area, + vector> &text_line) { + std::vector area(label_num + 1); + int kernel_num = data_shape[0]; + int height = data_shape[1]; + int width = data_shape[2]; + + for (int x = 0; x < height; ++x) { + for (int y = 0; y < width; ++y) { + int label = label_map[x * width + y]; + if (label == 0) continue; + area[label] += 1; + } + } + + queue queue, next_queue; + for (int x = 0; x < height; ++x) { + vector row(width); + for (int y = 0; y < width; ++y) { + int label = label_map[x * width + y]; + if (label == 0) continue; + if (area[label] < min_area) continue; + + Point2d point(x, y); + queue.push(point); + row[y] = label; + } + text_line.emplace_back(row); + } + + int dx[] = {-1, 1, 0, 0}; + int dy[] = {0, 0, -1, 1}; + vector kernel_step(kernel_num); + std::for_each(kernel_step.begin(), kernel_step.end(), + [=](int &k) { return k * height * width; }); + + for (int kernel_id = kernel_num - 2; kernel_id >= 0; --kernel_id) { + while (!queue.empty()) { + Point2d point = queue.front(); + queue.pop(); + int x = point.x; + int y = point.y; + int label = text_line[x][y]; + + bool is_edge = true; + for (int d = 0; d < 4; ++d) { + int tmp_x = x + dx[d]; + int tmp_y = y + dy[d]; + + if (tmp_x < 0 || tmp_x >= height) continue; + if (tmp_y < 0 || tmp_y >= width) continue; + int kernel_value = data[kernel_step[kernel_id] + tmp_x * width + tmp_y]; + if (kernel_value == 0) continue; + if (text_line[tmp_x][tmp_y] > 0) continue; + + Point2d point(tmp_x, tmp_y); + queue.push(point); + text_line[tmp_x][tmp_y] = label; + is_edge = false; + } + + if (is_edge) { + next_queue.push(point); + } + } + swap(queue, next_queue); + } +} + +std::vector> contour_expand(Tensor kernel_mask, + Tensor internal_kernel_label, + int min_kernel_area, + int kernel_num) { + kernel_mask = kernel_mask.contiguous(); + internal_kernel_label = internal_kernel_label.contiguous(); + assert(kernel_mask.dim() == 3); + assert(internal_kernel_label.dim() == 2); + assert(kernel_mask.size(1) == internal_kernel_label.size(0)); + assert(kernel_mask.size(2) == internal_kernel_label.size(1)); + CHECK_CPU_INPUT(kernel_mask); + CHECK_CPU_INPUT(internal_kernel_label); + auto ptr_data = kernel_mask.data_ptr(); + IntArrayRef data_shape = kernel_mask.sizes(); + + auto data_label_map = internal_kernel_label.data_ptr(); + IntArrayRef label_map_shape = internal_kernel_label.sizes(); + vector> text_line; + + kernel_dilate(ptr_data, data_shape, data_label_map, kernel_num, + min_kernel_area, text_line); + + return text_line; +} diff --git a/mmcv/ops/csrc/pytorch/corner_pool.cpp b/mmcv/ops/csrc/pytorch/corner_pool.cpp new file mode 100644 index 0000000..732cdb0 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/corner_pool.cpp @@ -0,0 +1,240 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/princeton-vl/CornerNet-Lite/tree/master/core/models/py_utils/_cpools/src +#include "pytorch_cpp_helper.hpp" + +Tensor bottom_pool_forward(Tensor input) { + // Initialize output + Tensor output = at::zeros_like(input); + // Get height + int64_t height = input.size(2); + output.copy_(input); + + for (int64_t ind = 1; ind < height; ind <<= 1) { + Tensor max_temp = at::slice(output, 2, ind, height); + Tensor cur_temp = at::slice(output, 2, ind, height).clone(); + Tensor next_temp = at::slice(output, 2, 0, height - ind).clone(); + at::max_out(max_temp, cur_temp, next_temp); + } + + return output; +} + +Tensor bottom_pool_backward(Tensor input, Tensor grad_output) { + auto output = at::zeros_like(input); + + int32_t batch = input.size(0); + int32_t channel = input.size(1); + int32_t height = input.size(2); + int32_t width = input.size(3); + + auto max_val = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kFloat)); + auto max_ind = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kLong)); + + auto input_temp = input.select(2, 0); + max_val.copy_(input_temp); + + max_ind.fill_(0); + + auto output_temp = output.select(2, 0); + auto grad_output_temp = grad_output.select(2, 0); + output_temp.copy_(grad_output_temp); + + auto un_max_ind = max_ind.unsqueeze(2); + auto gt_mask = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kBool)); + auto max_temp = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kFloat)); + for (int32_t ind = 0; ind < height - 1; ++ind) { + input_temp = input.select(2, ind + 1); + at::gt_out(gt_mask, input_temp, max_val); + + at::masked_select_out(max_temp, input_temp, gt_mask); + max_val.masked_scatter_(gt_mask, max_temp); + max_ind.masked_fill_(gt_mask, ind + 1); + + grad_output_temp = grad_output.select(2, ind + 1).unsqueeze(2); + output.scatter_add_(2, un_max_ind, grad_output_temp); + } + + return output; +} + +Tensor left_pool_forward(Tensor input) { + // Initialize output + Tensor output = at::zeros_like(input); + // Get width + int64_t width = input.size(3); + output.copy_(input); + + for (int64_t ind = 1; ind < width; ind <<= 1) { + Tensor max_temp = at::slice(output, 3, 0, width - ind); + Tensor cur_temp = at::slice(output, 3, 0, width - ind).clone(); + Tensor next_temp = at::slice(output, 3, ind, width).clone(); + at::max_out(max_temp, cur_temp, next_temp); + } + + return output; +} + +Tensor left_pool_backward(Tensor input, Tensor grad_output) { + auto output = at::zeros_like(input); + + int32_t batch = input.size(0); + int32_t channel = input.size(1); + int32_t height = input.size(2); + int32_t width = input.size(3); + + auto max_val = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kFloat)); + auto max_ind = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kLong)); + + auto input_temp = input.select(3, width - 1); + max_val.copy_(input_temp); + + max_ind.fill_(width - 1); + + auto output_temp = output.select(3, width - 1); + auto grad_output_temp = grad_output.select(3, width - 1); + output_temp.copy_(grad_output_temp); + + auto un_max_ind = max_ind.unsqueeze(3); + auto gt_mask = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kBool)); + auto max_temp = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kFloat)); + for (int32_t ind = 1; ind < width; ++ind) { + input_temp = input.select(3, width - ind - 1); + 
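+    // Mark columns where the new slice exceeds the running maximum, update the max value and argmax index, then scatter this slice's gradient to the current argmax positions.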
at::gt_out(gt_mask, input_temp, max_val); + + at::masked_select_out(max_temp, input_temp, gt_mask); + max_val.masked_scatter_(gt_mask, max_temp); + max_ind.masked_fill_(gt_mask, width - ind - 1); + + grad_output_temp = grad_output.select(3, width - ind - 1).unsqueeze(3); + output.scatter_add_(3, un_max_ind, grad_output_temp); + } + + return output; +} + +Tensor right_pool_forward(Tensor input) { + // Initialize output + Tensor output = at::zeros_like(input); + // Get width + int64_t width = input.size(3); + output.copy_(input); + + for (int64_t ind = 1; ind < width; ind <<= 1) { + Tensor max_temp = at::slice(output, 3, ind, width); + Tensor cur_temp = at::slice(output, 3, ind, width).clone(); + Tensor next_temp = at::slice(output, 3, 0, width - ind).clone(); + at::max_out(max_temp, cur_temp, next_temp); + } + + return output; +} + +Tensor right_pool_backward(Tensor input, Tensor grad_output) { + Tensor output = at::zeros_like(input); + + int32_t batch = input.size(0); + int32_t channel = input.size(1); + int32_t height = input.size(2); + int32_t width = input.size(3); + + auto max_val = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kFloat)); + auto max_ind = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kLong)); + + auto input_temp = input.select(3, 0); + max_val.copy_(input_temp); + + max_ind.fill_(0); + + auto output_temp = output.select(3, 0); + auto grad_output_temp = grad_output.select(3, 0); + output_temp.copy_(grad_output_temp); + + auto un_max_ind = max_ind.unsqueeze(3); + auto gt_mask = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kBool)); + auto max_temp = torch::zeros({batch, channel, height}, + at::device(at::kCUDA).dtype(at::kFloat)); + for (int32_t ind = 0; ind < width - 1; ++ind) { + input_temp = input.select(3, ind + 1); + at::gt_out(gt_mask, input_temp, max_val); + + at::masked_select_out(max_temp, input_temp, gt_mask); + max_val.masked_scatter_(gt_mask, max_temp); + max_ind.masked_fill_(gt_mask, ind + 1); + + grad_output_temp = grad_output.select(3, ind + 1).unsqueeze(3); + output.scatter_add_(3, un_max_ind, grad_output_temp); + } + + return output; +} + +Tensor top_pool_forward(Tensor input) { + // Initialize output + Tensor output = at::zeros_like(input); + // Get height + int64_t height = input.size(2); + output.copy_(input); + + for (int64_t ind = 1; ind < height; ind <<= 1) { + Tensor max_temp = at::slice(output, 2, 0, height - ind); + Tensor cur_temp = at::slice(output, 2, 0, height - ind).clone(); + Tensor next_temp = at::slice(output, 2, ind, height).clone(); + at::max_out(max_temp, cur_temp, next_temp); + } + + return output; +} + +Tensor top_pool_backward(Tensor input, Tensor grad_output) { + auto output = at::zeros_like(input); + + int32_t batch = input.size(0); + int32_t channel = input.size(1); + int32_t height = input.size(2); + int32_t width = input.size(3); + + auto max_val = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kFloat)); + auto max_ind = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kLong)); + + auto input_temp = input.select(2, height - 1); + max_val.copy_(input_temp); + + max_ind.fill_(height - 1); + + auto output_temp = output.select(2, height - 1); + auto grad_output_temp = grad_output.select(2, height - 1); + output_temp.copy_(grad_output_temp); + + auto un_max_ind = max_ind.unsqueeze(2); + auto gt_mask = torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kBool)); + auto max_temp = 
torch::zeros({batch, channel, width}, + at::device(at::kCUDA).dtype(at::kFloat)); + for (int32_t ind = 1; ind < height; ++ind) { + input_temp = input.select(2, height - ind - 1); + at::gt_out(gt_mask, input_temp, max_val); + + at::masked_select_out(max_temp, input_temp, gt_mask); + max_val.masked_scatter_(gt_mask, max_temp); + max_ind.masked_fill_(gt_mask, height - ind - 1); + + grad_output_temp = grad_output.select(2, height - ind - 1).unsqueeze(2); + output.scatter_add_(2, un_max_ind, grad_output_temp); + } + + return output; +} diff --git a/mmcv/ops/csrc/pytorch/correlation.cpp b/mmcv/ops/csrc/pytorch/correlation.cpp new file mode 100644 index 0000000..f4adba2 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/correlation.cpp @@ -0,0 +1,47 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#include + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output, + int kH, int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, + int dilation_patchH, int dilation_patchW, int dH, + int dW) { + DISPATCH_DEVICE_IMPL(correlation_forward_impl, input1, input2, output, kH, kW, + patchH, patchW, padH, padW, dilationH, dilationW, + dilation_patchH, dilation_patchW, dH, dW); +} + +void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2, + Tensor grad_input1, Tensor grad_input2, int kH, + int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, + int dilation_patchH, int dilation_patchW, int dH, + int dW) { + DISPATCH_DEVICE_IMPL(correlation_backward_impl, grad_output, input1, input2, + grad_input1, grad_input2, kH, kW, patchH, patchW, padH, + padW, dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW); +} + +void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH, + int kW, int patchH, int patchW, int padH, int padW, + int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW) { + correlation_forward_impl(input1, input2, output, kH, kW, patchH, patchW, padH, + padW, dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW); +} + +void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2, + Tensor grad_input1, Tensor grad_input2, int kH, + int kW, int patchH, int patchW, int padH, int padW, + int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW) { + correlation_backward_impl(grad_output, input1, input2, grad_input1, + grad_input2, kH, kW, patchH, patchW, padH, padW, + dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW); +} diff --git a/mmcv/ops/csrc/pytorch/cpu/box_iou_rotated.cpp b/mmcv/ops/csrc/pytorch/cpu/box_iou_rotated.cpp new file mode 100644 index 0000000..585d2c9 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/box_iou_rotated.cpp @@ -0,0 +1,38 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp +#include "box_iou_rotated_utils.hpp" +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +template +void box_iou_rotated_cpu_kernel(const Tensor boxes1, const Tensor boxes2, + Tensor ious, const int mode_flag, + const bool aligned) { + int output_size = ious.numel(); + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + + if (aligned) { + for (int i = 0; i < output_size; i++) { + ious[i] = single_box_iou_rotated(boxes1[i].data_ptr(), + boxes2[i].data_ptr(), mode_flag); + } + } else { + for (int i = 0; i < num_boxes1; i++) { + for (int j = 0; j < num_boxes2; j++) { + ious[i * num_boxes2 + j] = single_box_iou_rotated( + boxes1[i].data_ptr(), boxes2[j].data_ptr(), mode_flag); + } + } + } +} + +void box_iou_rotated_cpu(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned) { + box_iou_rotated_cpu_kernel(boxes1, boxes2, ious, mode_flag, aligned); +} + +void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned); +REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CPU, box_iou_rotated_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/deform_conv.cpp b/mmcv/ops/csrc/pytorch/cpu/deform_conv.cpp new file mode 100644 index 0000000..7ab67e7 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/deform_conv.cpp @@ -0,0 +1,408 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +template +T deformable_im2col_bilinear_cpu(const T *input, const int data_width, + const int height, const int width, T h, T w) { + if (h <= -1 || height <= h || w <= -1 || width <= w) { + return 0; + } + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +T get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w, + const int height, const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +T get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height, + const int width, const T *im_data, + const int data_width, const int bp_dir) { + 
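+  // Gradient of the bilinearly sampled value with respect to the sampling coordinate (bp_dir == 0: w.r.t. h, bp_dir == 1: w.r.t. w); out-of-bounds samples contribute zero.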
if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +void deformable_im2col_cpu_kernel( + const int n, const T *data_im, const T *data_offset, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + for (int index = 0; index < n; index++) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < 
height && w_im < width) + val = deformable_im2col_bilinear_cpu(data_im_ptr, width, height, + width, h_im, w_im); + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void deformable_col2im_cpu_kernel( + const int n, const T *data_col, const T *data_offset, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + for (int index = 0; index < n; index++) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + get_gradient_weight_cpu(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; + } + } + } + } +} + +template +void deformable_col2im_coord_cpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int offset_channels, const int deformable_group, const int height_col, + const int width_col, T *grad_offset) { + for (int index = 0; index < n; index++) { + T val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = 
data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + const T weight = get_coordinate_weight_cpu( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_im2col_cpu(Tensor data_im, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor data_col) { + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "deformable_im2col_cpu", [&] { + deformable_im2col_cpu_kernel( + num_kernels, data_im.data_ptr(), + data_offset.data_ptr(), height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, + deformable_group, height_col, width_col, + data_col.data_ptr()); + }); +} + +void deformable_col2im_cpu(Tensor data_col, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor grad_im) { + // todo: make sure parallel_imgs is passed in correctly + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = + channels * ksize_h * ksize_w * height_col * width_col * 
parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + scalar_t *grad_im_ = grad_im.data_ptr(); + + deformable_col2im_cpu_kernel( + num_kernels, data_col_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + dilation_w, channel_per_deformable_group, parallel_imgs, + deformable_group, height_col, width_col, grad_im_); + })); +} + +void deformable_col2im_coord_cpu( + Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, Tensor grad_offset) { + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * + deformable_group * parallel_imgs; + int channel_per_deformable_group = + channels * ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_coord_cpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr(); + const scalar_t *data_im_ = data_im.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + scalar_t *grad_offset_ = grad_offset.data_ptr(); + + deformable_col2im_coord_cpu_kernel( + num_kernels, data_col_, data_im_, data_offset_, channels, height, + width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, + 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +void deformable_im2col_impl(Tensor data_im, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor data_col); + +void deformable_col2im_impl(Tensor data_col, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor grad_im); + +void deformable_col2im_coord_impl( + Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, Tensor grad_offset); + +REGISTER_DEVICE_IMPL(deformable_im2col_impl, CPU, deformable_im2col_cpu); +REGISTER_DEVICE_IMPL(deformable_col2im_impl, CPU, deformable_col2im_cpu); +REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CPU, + deformable_col2im_coord_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/modulated_deform_conv.cpp b/mmcv/ops/csrc/pytorch/cpu/modulated_deform_conv.cpp new file mode 100644 index 
0000000..9539095 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/modulated_deform_conv.cpp @@ -0,0 +1,436 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +template +T dmcn_im2col_bilinear_cpu(const T *input, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floorf(h); + int w_low = floorf(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = input[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = input[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = input[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +T dmcn_get_gradient_weight_cpu(T argmax_h, T argmax_w, const int h, const int w, + const int height, const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +T dmcn_get_coordinate_weight_cpu(T argmax_h, T argmax_w, const int height, + const int width, const T *im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floorf(argmax_h); + int argmax_w_low = floorf(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += 
(argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +void modulated_deformable_im2col_cpu_kernel( + const int n, const T *data_im, const T *data_offset, const T *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T *data_col) { + for (int index = 0; index < n; index++) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const T *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, + h_im, w_im); + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +void modulated_deformable_col2im_cpu_kernel( + const int n, const T *data_col, const T *data_offset, const T *data_mask, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T *grad_im) { + for (int index = 0; index < n; index++) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = 
(index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = dmcn_get_gradient_weight_cpu(cur_inv_h_data, + cur_inv_w_data, cur_h + dy, + cur_w + dx, height, width); + *(grad_im + cur_bottom_grad_pos) += weight * cur_top_grad; + } + } + } + } +} + +template +void modulated_deformable_col2im_coord_cpu_kernel( + const int n, const T *data_col, const T *data_im, const T *data_offset, + const T *data_mask, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, T *grad_offset, T *grad_mask) { + for (int index = 0; index < n; index++) { + T val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T *data_col_ptr = data_col + deformable_group_index * + channel_per_deformable_group * + batch_size * width_col * height_col; + const T *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const T *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const T *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * 
height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + inv_h = inv_w = -2; + else + mval += data_col_ptr[col_pos] * + dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, + width, height, width, inv_h, inv_w); + const T weight = dmcn_get_coordinate_weight_cpu( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + +void modulated_deformable_im2col_cpu( + const Tensor data_im, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "modulated_deformable_im2col_cpu", ([&] { + const scalar_t *data_im_ = data_im.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + const scalar_t *data_mask_ = data_mask.data_ptr(); + scalar_t *data_col_ = data_col.data_ptr(); + + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im_, data_offset_, data_mask_, height_im, + width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, batch_size, + channels, deformable_group, height_col, width_col, data_col_); + })); +} + +void modulated_deformable_col2im_cpu( + const Tensor data_col, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int 
deformable_group, Tensor grad_im) { + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = + channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_cpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + const scalar_t *data_mask_ = data_mask.data_ptr(); + scalar_t *grad_im_ = grad_im.data_ptr(); + + modulated_deformable_col2im_cpu_kernel( + num_kernels, data_col_, data_offset_, data_mask_, channels, + height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); +} + +void modulated_deformable_col2im_coord_cpu( + const Tensor data_col, const Tensor data_im, const Tensor data_offset, + const Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + Tensor grad_offset, Tensor grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * + kernel_w * deformable_group; + const int channel_per_deformable_group = + channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_coord_cpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr(); + const scalar_t *data_im_ = data_im.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + const scalar_t *data_mask_ = data_mask.data_ptr(); + scalar_t *grad_offset_ = grad_offset.data_ptr(); + scalar_t *grad_mask_ = grad_mask.data_ptr(); + + modulated_deformable_col2im_coord_cpu_kernel( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, + channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, + stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, batch_size, + 2 * kernel_h * kernel_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_, grad_mask_); + })); +} + +void modulated_deformable_im2col_impl( + const Tensor data_im, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor data_col); + +void modulated_deformable_col2im_impl( + const Tensor data_col, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor grad_im); + +void modulated_deformable_col2im_coord_impl( + const Tensor data_col, const Tensor data_im, const Tensor data_offset, + const Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, 
const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + Tensor grad_offset, Tensor grad_mask); + +REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CPU, + modulated_deformable_im2col_cpu); +REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CPU, + modulated_deformable_col2im_cpu); +REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CPU, + modulated_deformable_col2im_coord_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/nms.cpp b/mmcv/ops/csrc/pytorch/cpu/nms.cpp new file mode 100644 index 0000000..53e9b9a --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/nms.cpp @@ -0,0 +1,230 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +Tensor nms_cpu(Tensor boxes, Tensor scores, float iou_threshold, int offset) { + if (boxes.numel() == 0) { + return at::empty({0}, boxes.options().dtype(at::kLong)); + } + auto x1_t = boxes.select(1, 0).contiguous(); + auto y1_t = boxes.select(1, 1).contiguous(); + auto x2_t = boxes.select(1, 2).contiguous(); + auto y2_t = boxes.select(1, 3).contiguous(); + + Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto nboxes = boxes.size(0); + Tensor select_t = at::ones({nboxes}, boxes.options().dtype(at::kBool)); + + auto select = select_t.data_ptr(); + auto order = order_t.data_ptr(); + auto x1 = x1_t.data_ptr(); + auto y1 = y1_t.data_ptr(); + auto x2 = x2_t.data_ptr(); + auto y2 = y2_t.data_ptr(); + auto areas = areas_t.data_ptr(); + + for (int64_t _i = 0; _i < nboxes; _i++) { + if (select[_i] == false) continue; + auto i = order[_i]; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < nboxes; _j++) { + if (select[_j] == false) continue; + auto j = order[_j]; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(0.f, xx2 - xx1 + offset); + auto h = std::max(0.f, yy2 - yy1 + offset); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr > iou_threshold) select[_j] = false; + } + } + return order_t.masked_select(select_t); +} + +Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset); +REGISTER_DEVICE_IMPL(nms_impl, CPU, nms_cpu); + +Tensor softnms_cpu(Tensor boxes, Tensor scores, Tensor dets, + float iou_threshold, float sigma, float min_score, + int method, int offset) { + if (boxes.numel() == 0) { + return at::empty({0}, boxes.options().dtype(at::kLong)); + } + + auto x1_t = boxes.select(1, 0).contiguous(); + auto y1_t = boxes.select(1, 1).contiguous(); + auto x2_t = boxes.select(1, 2).contiguous(); + auto y2_t = boxes.select(1, 3).contiguous(); + auto scores_t = scores.clone(); + + Tensor areas_t = (x2_t - x1_t + offset) * (y2_t - y1_t + offset); + + auto nboxes = boxes.size(0); + auto x1 = x1_t.data_ptr(); + auto y1 = y1_t.data_ptr(); + auto x2 = x2_t.data_ptr(); + auto y2 = y2_t.data_ptr(); + auto sc = scores_t.data_ptr(); + auto areas = areas_t.data_ptr(); + auto de = dets.data_ptr(); + + int64_t pos = 0; + Tensor inds_t = at::arange(nboxes, boxes.options().dtype(at::kLong)); + auto inds = inds_t.data_ptr(); + + for (int64_t i = 0; i < nboxes; i++) { + auto max_score = sc[i]; + auto max_pos = i; + + pos = i + 1; 
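+// The remainder of this iteration implements Soft-NMS: the highest scoring
+// box among positions [i, nboxes) is first swapped into slot i and written to
+// dets, then every remaining box has its score decayed according to its IoU
+// with that box. method == 0 reproduces hard NMS (weight 0 above the
+// threshold), method == 1 applies the linear decay weight = 1 - IoU, and
+// method == 2 applies the Gaussian decay weight = exp(-IoU^2 / sigma). Boxes
+// whose decayed score falls below min_score are swapped to the tail and
+// nboxes shrinks, so they are never revisited.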
+ // get max box + while (pos < nboxes) { + if (max_score < sc[pos]) { + max_score = sc[pos]; + max_pos = pos; + } + pos = pos + 1; + } + // swap + auto ix1 = de[i * 5 + 0] = x1[max_pos]; + auto iy1 = de[i * 5 + 1] = y1[max_pos]; + auto ix2 = de[i * 5 + 2] = x2[max_pos]; + auto iy2 = de[i * 5 + 3] = y2[max_pos]; + auto iscore = de[i * 5 + 4] = sc[max_pos]; + auto iarea = areas[max_pos]; + auto iind = inds[max_pos]; + x1[max_pos] = x1[i]; + y1[max_pos] = y1[i]; + x2[max_pos] = x2[i]; + y2[max_pos] = y2[i]; + sc[max_pos] = sc[i]; + areas[max_pos] = areas[i]; + inds[max_pos] = inds[i]; + x1[i] = ix1; + y1[i] = iy1; + x2[i] = ix2; + y2[i] = iy2; + sc[i] = iscore; + areas[i] = iarea; + inds[i] = iind; + + pos = i + 1; + while (pos < nboxes) { + auto xx1 = std::max(ix1, x1[pos]); + auto yy1 = std::max(iy1, y1[pos]); + auto xx2 = std::min(ix2, x2[pos]); + auto yy2 = std::min(iy2, y2[pos]); + + auto w = std::max(0.f, xx2 - xx1 + offset); + auto h = std::max(0.f, yy2 - yy1 + offset); + auto inter = w * h; + auto ovr = inter / (iarea + areas[pos] - inter); + + float weight = 1.; + if (method == 0) { + if (ovr >= iou_threshold) weight = 0; + } else if (method == 1) { + if (ovr >= iou_threshold) weight = 1 - ovr; + } else if (method == 2) { + weight = std::exp(-(ovr * ovr) / sigma); + } + sc[pos] *= weight; + // if box score falls below threshold, discard the box by + // swapping with last box update N + if (sc[pos] < min_score) { + x1[pos] = x1[nboxes - 1]; + y1[pos] = y1[nboxes - 1]; + x2[pos] = x2[nboxes - 1]; + y2[pos] = y2[nboxes - 1]; + sc[pos] = sc[nboxes - 1]; + areas[pos] = areas[nboxes - 1]; + inds[pos] = inds[nboxes - 1]; + nboxes = nboxes - 1; + pos = pos - 1; + } + pos = pos + 1; + } + } + return inds_t.slice(0, 0, nboxes); +} + +Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets, + float iou_threshold, float sigma, float min_score, + int method, int offset); +REGISTER_DEVICE_IMPL(softnms_impl, CPU, softnms_cpu); + +std::vector > nms_match_cpu(Tensor dets, float iou_threshold) { + auto x1_t = dets.select(1, 0).contiguous(); + auto y1_t = dets.select(1, 1).contiguous(); + auto x2_t = dets.select(1, 2).contiguous(); + auto y2_t = dets.select(1, 3).contiguous(); + auto scores = dets.select(1, 4).contiguous(); + + at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = + at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); + + auto suppressed = suppressed_t.data_ptr(); + auto order = order_t.data_ptr(); + auto x1 = x1_t.data_ptr(); + auto y1 = y1_t.data_ptr(); + auto x2 = x2_t.data_ptr(); + auto y2 = y2_t.data_ptr(); + auto areas = areas_t.data_ptr(); + + std::vector keep; + std::vector > matched; + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) continue; + keep.push_back(i); + std::vector v_i; + auto ix1 = x1[i]; + auto iy1 = y1[i]; + auto ix2 = x2[i]; + auto iy2 = y2[i]; + auto iarea = areas[i]; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) continue; + auto xx1 = std::max(ix1, x1[j]); + auto yy1 = std::max(iy1, y1[j]); + auto xx2 = std::min(ix2, x2[j]); + auto yy2 = std::min(iy2, y2[j]); + + auto w = std::max(static_cast(0), xx2 - xx1); + auto h = std::max(static_cast(0), yy2 - yy1); + auto inter = w * h; + auto ovr = inter / (iarea + areas[j] - inter); + if (ovr >= iou_threshold) { + suppressed[j] = 1; + v_i.push_back(j); + } + } + 
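+// v_i now holds the indices of every box suppressed by the current kept box;
+// after the main loop the kept index itself is prepended, so each entry of
+// `matched` is a group of the form {kept_box, suppressed_box_0, ...}.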
matched.push_back(v_i); + } + for (size_t i = 0; i < keep.size(); i++) + matched[i].insert(matched[i].begin(), keep[i]); + return matched; +} + +std::vector > nms_match_impl(Tensor dets, float iou_threshold); +REGISTER_DEVICE_IMPL(nms_match_impl, CPU, nms_match_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp b/mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp new file mode 100644 index 0000000..223ee1a --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/nms_rotated.cpp @@ -0,0 +1,66 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp +#include "box_iou_rotated_utils.hpp" +#include "pytorch_cpp_helper.hpp" + +template +Tensor nms_rotated_cpu_kernel(const Tensor dets, const Tensor scores, + const float iou_threshold) { + // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel, + // however, the code in this function is much shorter because + // we delegate the IoU computation for rotated boxes to + // the single_box_iou_rotated function in box_iou_rotated_utils.h + AT_ASSERTM(!dets.is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!scores.is_cuda(), "scores must be a CPU tensor"); + AT_ASSERTM(dets.scalar_type() == scores.scalar_type(), + "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong)); + } + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); + Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); + + auto suppressed = suppressed_t.data_ptr(); + auto keep = keep_t.data_ptr(); + auto order = order_t.data_ptr(); + + int64_t num_to_keep = 0; + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) { + continue; + } + + keep[num_to_keep++] = i; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) { + continue; + } + + auto ovr = single_box_iou_rotated( + dets[i].data_ptr(), dets[j].data_ptr(), 0); + if (ovr >= iou_threshold) { + suppressed[j] = 1; + } + } + } + return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +} + +Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores, + const float iou_threshold) { + auto result = at::empty({0}, dets.options()); + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms_rotated", [&] { + result = nms_rotated_cpu_kernel(dets, scores, iou_threshold); + }); + return result; +} diff --git a/mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp b/mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp new file mode 100755 index 0000000..9083281 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/pixel_group.cpp @@ -0,0 +1,124 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// It is modified from https://github.com/WenmuZhou/PAN.pytorch + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +std::vector> estimate_confidence(int32_t* label, + float* score, int label_num, + int height, int width) { + std::vector> point_vector; + for (int i = 0; i < label_num; i++) { + std::vector point; + point.push_back(0); + point.push_back(0); + point_vector.push_back(point); + } + for (int y = 0; y < height; y++) { + auto label_tmp = label + y * width; + auto score_tmp = score + y * width; + for (int x = 0; x < width; x++) { + auto l = label_tmp[x]; + if (l > 0) { + float confidence = score_tmp[x]; + point_vector[l].push_back(x); + point_vector[l].push_back(y); + point_vector[l][0] += confidence; + point_vector[l][1] += 1; + } + } + } + for (size_t l = 0; l < point_vector.size(); l++) + if (point_vector[l][1] > 0) { + point_vector[l][0] /= point_vector[l][1]; + } + return point_vector; +} +std::vector> pixel_group_cpu( + Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, + Tensor kernel_contour, int kernel_region_num, float dis_threshold) { + assert(score.dim() == 2); + assert(mask.dim() == 2); + assert(embedding_dim.dim() == 3); + int height = score.size(0); + int width = score.size(1); + assert(height == mask.size(0) == embedding.size(1) == kernel_label.size(1)); + assert(width == mask.size(1) == embedding.size(2) == kernel_label.size(2)); + + auto threshold_square = dis_threshold * dis_threshold; + auto ptr_score = score.data_ptr(); + auto ptr_mask = mask.data_ptr(); + auto ptr_kernel_contour = kernel_contour.data_ptr(); + auto ptr_embedding = embedding.data_ptr(); + auto ptr_kernel_label = kernel_label.data_ptr(); + std::queue> contour_pixels; + auto embedding_dim = embedding.size(2); + std::vector> kernel_vector( + kernel_region_num, std::vector(embedding_dim + 1, 0)); + + Tensor text_label; + text_label = kernel_label.clone(); + auto ptr_text_label = text_label.data_ptr(); + + for (int i = 0; i < height; i++) { + auto ptr_embedding_tmp = ptr_embedding + i * width * embedding_dim; + auto ptr_kernel_label_tmp = ptr_kernel_label + i * width; + auto ptr_kernel_contour_tmp = ptr_kernel_contour + i * width; + + for (int j = 0, k = 0; j < width && k < width * embedding_dim; + j++, k += embedding_dim) { + int32_t label = ptr_kernel_label_tmp[j]; + if (label > 0) { + for (int d = 0; d < embedding_dim; d++) + kernel_vector[label][d] += ptr_embedding_tmp[k + d]; + kernel_vector[label][embedding_dim] += 1; + // kernel pixel number + if (ptr_kernel_contour_tmp[j]) { + contour_pixels.push(std::make_tuple(i, j, label)); + } + } + } + } + for (int i = 0; i < kernel_region_num; i++) { + for (int j = 0; j < embedding_dim; j++) { + kernel_vector[i][j] /= kernel_vector[i][embedding_dim]; + } + } + int dx[4] = {-1, 1, 0, 0}; + int dy[4] = {0, 0, -1, 1}; + while (!contour_pixels.empty()) { + auto query_pixel = contour_pixels.front(); + contour_pixels.pop(); + int y = std::get<0>(query_pixel); + int x = std::get<1>(query_pixel); + int32_t l = std::get<2>(query_pixel); + auto kernel_cv = kernel_vector[l]; + for (int idx = 0; idx < 4; idx++) { + int tmpy = y + dy[idx]; + int tmpx = x + dx[idx]; + auto ptr_text_label_tmp = ptr_text_label + tmpy * width; + if (tmpy < 0 || tmpy >= height || tmpx < 0 || tmpx >= width) continue; + if (!ptr_mask[tmpy * width + tmpx] || ptr_text_label_tmp[tmpx] > 0) + continue; + + float dis = 0; + auto ptr_embedding_tmp = ptr_embedding + tmpy * width * embedding_dim; + for (size_t i = 0; i < embedding_dim; 
i++) { + dis += + pow(kernel_cv[i] - ptr_embedding_tmp[tmpx * embedding_dim + i], 2); + // ignore further computing if dis is big enough + if (dis >= threshold_square) break; + } + if (dis >= threshold_square) continue; + contour_pixels.push(std::make_tuple(tmpy, tmpx, l)); + ptr_text_label_tmp[tmpx] = l; + } + } + + return estimate_confidence(ptr_text_label, ptr_score, kernel_region_num, + height, width); +} +std::vector> pixel_group_impl( + Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, + Tensor kernel_contour, int kernel_region_num, float dis_threshold); +REGISTER_DEVICE_IMPL(pixel_group_impl, CPU, pixel_group_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/points_in_boxes.cpp b/mmcv/ops/csrc/pytorch/cpu/points_in_boxes.cpp new file mode 100644 index 0000000..c16baa4 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/points_in_boxes.cpp @@ -0,0 +1,53 @@ +#include "pytorch_cpp_helper.hpp" + +inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, + float &local_x, float &local_y) { + float cosa = cos(-rz), sina = sin(-rz); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, + float &local_x, float &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, + // cz in the bottom center + float x = pt[0], y = pt[1], z = pt[2]; + float cx = box3d[0], cy = box3d[1], cz = box3d[2]; + float x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; + cz += z_size / + 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > z_size / 2.0) return 0; + lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & + (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); + return in_flag; +} + +void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor, + Tensor pts_indices_tensor) { + // params boxes: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box DO NOT overlaps params pts: + // (npoints, 3) [x, y, z] in LiDAR coordinate params pts_indices: (N, npoints) + + CHECK_CONTIGUOUS(boxes_tensor); + CHECK_CONTIGUOUS(pts_tensor); + CHECK_CONTIGUOUS(pts_indices_tensor); + + int boxes_num = boxes_tensor.size(0); + int pts_num = pts_tensor.size(0); + + const float *boxes = boxes_tensor.data_ptr(); + const float *pts = pts_tensor.data_ptr(); + int *pts_indices = pts_indices_tensor.data_ptr(); + + float local_x = 0, local_y = 0; + for (int i = 0; i < boxes_num; i++) { + for (int j = 0; j < pts_num; j++) { + int cur_in_flag = + check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y); + pts_indices[i * pts_num + j] = cur_in_flag; + } + } +} diff --git a/mmcv/ops/csrc/pytorch/cpu/psamask.cpp b/mmcv/ops/csrc/pytorch/cpu/psamask.cpp new file mode 100644 index 0000000..aa7fdcb --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/psamask.cpp @@ -0,0 +1,199 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/hszhao/semseg/blob/master/lib/psa/src +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +#ifndef min +#define min(a, b) (((a) < (b)) ? (a) : (b)) +#endif +#ifndef max +#define max(a, b) (((a) > (b)) ? 
(a) : (b)) +#endif + +void psamask_collect_forward(const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask, const Tensor mask_data, + Tensor buffer_data) { + for (int n = 0; n < num_; n++) { + for (int h = 0; h < h_feature; h++) { + for (int w = 0; w < w_feature; w++) { + // effective mask region : [hstart, hend) x [wstart, wend) with + // mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with + // feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + buffer_data.view({-1})[(n * h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)) * + h_feature * w_feature + + h * w_feature + w] = + mask_data.view( + {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) * + h_feature + + h) * + w_feature + + w]; + } + } + } + } + } +} + +void psamask_distribute_forward(const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask, const Tensor mask_data, + Tensor buffer_data) { + for (int n = 0; n < num_; n++) { + for (int h = 0; h < h_feature; h++) { + for (int w = 0; w < w_feature; w++) { + // effective mask region : [hstart, hend) x [wstart, wend) with + // mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with + // feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + buffer_data.view( + {-1})[(n * h_feature * w_feature + h * w_feature + w) * + h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)] = + mask_data.view( + {-1})[((n * h_mask * w_mask + hidx * w_mask + widx) * + h_feature + + h) * + w_feature + + w]; + } + } + } + } + } +} + +void psamask_collect_backward(const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask, const Tensor buffer_diff, + Tensor mask_diff) { + for (int n = 0; n < num_; n++) { + for (int h = 0; h < h_feature; h++) { + for (int w = 0; w < w_feature; w++) { + // effective mask region : [hstart, hend) x [wstart, wend) with + // mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with + // feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) * + h_feature + + h) * + w_feature + + w] = + buffer_diff.view({-1})[(n * h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)) * + h_feature * w_feature + + h * w_feature + w]; + } + } + } + } + } +} + +void 
psamask_distribute_backward(const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask, + const Tensor buffer_diff, Tensor mask_diff) { + for (int n = 0; n < num_; n++) { + for (int h = 0; h < h_feature; h++) { + for (int w = 0; w < w_feature; w++) { + // effective mask region : [hstart, hend) x [wstart, wend) with + // mask-indexed + const int hstart = max(0, half_h_mask - h); + const int hend = min(h_mask, h_feature + half_h_mask - h); + const int wstart = max(0, half_w_mask - w); + const int wend = min(w_mask, w_feature + half_w_mask - w); + // (hidx, widx ) with mask-indexed + // (hidx + h - half_h_mask, widx + w - half_w_mask) with + // feature-indexed + for (int hidx = hstart; hidx < hend; hidx++) { + for (int widx = wstart; widx < wend; widx++) { + mask_diff.view({-1})[((n * h_mask * w_mask + hidx * w_mask + widx) * + h_feature + + h) * + w_feature + + w] = + buffer_diff.view( + {-1})[(n * h_feature * w_feature + h * w_feature + w) * + h_feature * w_feature + + (hidx + h - half_h_mask) * w_feature + + (widx + w - half_w_mask)]; + } + } + } + } + } +} + +void psamask_forward_cpu(const int psa_type, const Tensor input, Tensor output, + const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask) { + if (psa_type == 0) + psamask_collect_forward(num_, h_feature, w_feature, h_mask, w_mask, + half_h_mask, half_w_mask, input, output); + else + psamask_distribute_forward(num_, h_feature, w_feature, h_mask, w_mask, + half_h_mask, half_w_mask, input, output); +} + +void psamask_backward_cpu(const int psa_type, const Tensor grad_output, + Tensor grad_input, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask) { + if (psa_type == 0) + psamask_collect_backward(num_, h_feature, w_feature, h_mask, w_mask, + half_h_mask, half_w_mask, grad_output, grad_input); + else + psamask_distribute_backward(num_, h_feature, w_feature, h_mask, w_mask, + half_h_mask, half_w_mask, grad_output, + grad_input); +} + +void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output, + const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask); + +void psamask_backward_impl(const int psa_type, const Tensor grad_output, + Tensor grad_input, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask); +REGISTER_DEVICE_IMPL(psamask_forward_impl, CPU, psamask_forward_cpu); +REGISTER_DEVICE_IMPL(psamask_backward_impl, CPU, psamask_backward_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/roi_align.cpp b/mmcv/ops/csrc/pytorch/cpu/roi_align.cpp new file mode 100644 index 0000000..d545390 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/roi_align.cpp @@ -0,0 +1,466 @@ +// Modified from +// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#include +#include + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, const int width, const int pooled_height, + const int pooled_width, const int iy_upper, const int ix_upper, + T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward(const int nthreads, const T* input, const T* rois, + T* output, T* argmax_y, T* argmax_x, + const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, + const int width) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (aligned) { + AT_ASSERTM(roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlign cannot have non-negative size!"); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // When the grid is empty, output zeros == 0/1, instead of NaN. + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * + pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, width, pooled_height, pooled_width, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + T maxval = -10000; + T maxidx_y = -1.f, maxidx_x = -1.f; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + PreCalc pc = pre_calc[pre_calc_index]; + T val = pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + + pc.w4 * offset_input[pc.pos4]; + if (val > maxval) { + maxval = val; + maxidx_y = y; + maxidx_x = x; + } + output_val += val; + pre_calc_index += 1; + } + } + if (pool_mode == 0) { + // We do max pooling inside a bin + output[index] = maxval; + argmax_y[index] = maxidx_y; + argmax_x[index] = maxidx_x; + } else if (pool_mode == 1) { + // We do average (integral) pooling inside a bin + output[index] = output_val / count; + } // if + } // for pw + } // for ph + } // for c + } // for n +} + +template +void bilinear_interpolate_gradient(const int height, const int width, T y, T x, + T& w1, T& w2, T& w3, T& w4, int& x_low, + int& x_high, int& y_low, int& y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = 
y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +inline void add(T* address, const T& val) { + *address += val; +} + +template +void ROIAlignBackward(const int nthreads, const T* grad_output, const T* rois, + const T* argmax_y, const T* argmax_x, T* grad_input, + const int pooled_height, const int pooled_width, + const T spatial_scale, const int sampling_ratio, + const int pool_mode, // 0 - max pool, 1 - avg pool + const bool aligned, const int channels, const int height, + const int width, const int n_stride, const int c_stride, + const int h_stride, const int w_stride) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (aligned) { + AT_ASSERTM(roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlign do not have non-negative size!"); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T* offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + if (pool_mode == 0) { + // We do max pooling inside a bin + T y = argmax_y[index], x = argmax_x[index]; + if (y != -1.f) { + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + T g1 = grad_output_this_bin * w1; + T g2 = grad_output_this_bin * w2; + T g3 = grad_output_this_bin * w3; + T g4 = grad_output_this_bin * w4; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // mode + } else if (pool_mode == 1) { + // We do average 
(integral) pooling inside a bin + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = + (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_width / pooled_width); + + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, + static_cast(g4)); + } // if + } // ix + } // iy + } // mode + } // for +} // ROIAlignBackward + +void ROIAlignForwardCPULauncher(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + int output_size = output.numel(); + int channels = input.size(1); + int height = input.size(2); + int width = input.size(3); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "ROIAlign_forward", [&] { + ROIAlignForward( + output_size, input.data_ptr(), rois.data_ptr(), + output.data_ptr(), argmax_y.data_ptr(), + argmax_x.data_ptr(), aligned_height, aligned_width, + static_cast(spatial_scale), sampling_ratio, pool_mode, + aligned, channels, height, width); + }); +} + +void ROIAlignBackwardCPULauncher(Tensor grad_output, Tensor rois, + Tensor argmax_y, Tensor argmax_x, + Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, + bool aligned) { + int output_size = grad_output.numel(); + int channels = grad_input.size(1); + int height = grad_input.size(2); + int width = grad_input.size(3); + + // get stride values to ensure indexing into gradients is correct. 
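+  // Note: the incoming grad_output produced by autograd is not guaranteed to
+  // be contiguous -- it can be a permuted or sliced view -- so the backward
+  // kernel addresses it through its actual strides rather than assuming a
+  // packed NCHW layout. For a contiguous (N, C, H, W) tensor these reduce to
+  // stride(0) = C*H*W, stride(1) = H*W, stride(2) = W, stride(3) = 1.
+  // grad_input, by contrast, is indexed with plain row-major offsets, which
+  // presumes the caller allocates it as a contiguous tensor.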
+ int n_stride = grad_output.stride(0); + int c_stride = grad_output.stride(1); + int h_stride = grad_output.stride(2); + int w_stride = grad_output.stride(3); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "ROIAlign_backward", [&] { + ROIAlignBackward( + output_size, grad_output.data_ptr(), + rois.data_ptr(), argmax_y.data_ptr(), + argmax_x.data_ptr(), grad_input.data_ptr(), + aligned_height, aligned_width, static_cast(spatial_scale), + sampling_ratio, pool_mode, aligned, channels, height, width, + n_stride, c_stride, h_stride, w_stride); + }); +} + +void roi_align_forward_cpu(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, bool aligned) { + ROIAlignForwardCPULauncher(input, rois, output, argmax_y, argmax_x, + aligned_height, aligned_width, spatial_scale, + sampling_ratio, pool_mode, aligned); +} + +void roi_align_backward_cpu(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + ROIAlignBackwardCPULauncher(grad_output, rois, argmax_y, argmax_x, grad_input, + aligned_height, aligned_width, spatial_scale, + sampling_ratio, pool_mode, aligned); +} + +void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +REGISTER_DEVICE_IMPL(roi_align_forward_impl, CPU, roi_align_forward_cpu); +REGISTER_DEVICE_IMPL(roi_align_backward_impl, CPU, roi_align_backward_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp b/mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp new file mode 100644 index 0000000..0f7511b --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/roi_align_rotated.cpp @@ -0,0 +1,458 @@ +// Modified from +// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#include +#include + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, const int width, const int pooled_height, + const int pooled_width, const int iy_upper, const int ix_upper, + T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, T roi_center_h, T roi_center_w, + T cos_theta, T sin_theta, std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) { + y = 0; + } + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignRotatedForward(const int nthreads, const T* input, + const T& spatial_scale, const bool aligned, + const bool clockwise, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, const T* rois, + T* output) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + if (aligned) { + AT_ASSERTM(roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlignRotated do not have non-negative size!"); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * + pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate( + height, width, pooled_height, pooled_width, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, roi_center_h, roi_center_w, cos_theta, + sin_theta, pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template +void bilinear_interpolate_gradient(const int height, const int width, T y, T x, + T& w1, T& w2, T& w3, T& w4, int& x_low, + int& x_high, int& y_low, int& y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +inline void add(T* address, const T& val) { + *address += val; +} + +template +void ROIAlignRotatedBackward( + const int nthreads, + // may not be contiguous. should index using n_stride, etc + const T* grad_output, const T& spatial_scale, const bool aligned, + const bool clockwise, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + T* grad_input, const T* rois, const int n_stride, const int c_stride, + const int h_stride, const int w_stride) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5]; + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + if (aligned) { + AT_ASSERTM(roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlignRotated do not have non-negative size!"); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T* offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // for +} // ROIAlignRotatedBackward + +void ROIAlignRotatedForwardCPULauncher(Tensor input, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + bool aligned, bool clockwise) { + int output_size = output.numel(); + int channels = input.size(1); + int height = input.size(2); + int width = input.size(3); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "ROIAlignRotated_forward", [&] { + ROIAlignRotatedForward( + output_size, input.data_ptr(), + static_cast(spatial_scale), aligned, clockwise, channels, + height, width, aligned_height, aligned_width, sampling_ratio, + rois.data_ptr(), output.data_ptr()); + }); +} + +void ROIAlignRotatedBackwardCPULauncher(Tensor grad_output, Tensor rois, + Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, + bool clockwise) { + int output_size = grad_output.numel(); + int channels = grad_input.size(1); + int height = grad_input.size(2); + int width = grad_input.size(3); + + // get stride values to ensure indexing into gradients is correct. 
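+  // Note: as in the non-rotated ROIAlign backward above, grad_output may be a
+  // non-contiguous view, so its strides are captured here and handed to the
+  // kernel instead of assuming a packed NCHW layout.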
+ int n_stride = grad_output.stride(0); + int c_stride = grad_output.stride(1); + int h_stride = grad_output.stride(2); + int w_stride = grad_output.stride(3); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "ROIAlignRotated_backward", [&] { + ROIAlignRotatedBackward( + grad_output.numel(), grad_output.data_ptr(), + static_cast(spatial_scale), aligned, clockwise, channels, + height, width, aligned_height, aligned_width, sampling_ratio, + grad_input.data_ptr(), rois.data_ptr(), + n_stride, c_stride, h_stride, w_stride); + }); +} + +void roi_align_rotated_forward_cpu(Tensor input, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + bool aligned, bool clockwise) { + ROIAlignRotatedForwardCPULauncher(input, rois, output, aligned_height, + aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +} + +void roi_align_rotated_backward_cpu(Tensor top_grad, Tensor rois, + Tensor bottom_grad, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, + bool clockwise) { + // Number of ROIs + int num_rois = rois.size(0); + int size_rois = rois.size(1); + if (size_rois != 6) { + AT_ERROR("wrong roi size"); + } + ROIAlignRotatedBackwardCPULauncher( + top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale, + sampling_ratio, aligned, clockwise); +} + +void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sample_ratio, + bool aligned, bool clockwise); + +void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois, + Tensor bottom_grad, int aligned_height, + int aligned_width, float spatial_scale, + int sample_ratio, bool aligned, + bool clockwise); +REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CPU, + roi_align_rotated_forward_cpu); +REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CPU, + roi_align_rotated_backward_cpu); diff --git a/mmcv/ops/csrc/pytorch/cpu/voxelization.cpp b/mmcv/ops/csrc/pytorch/cpu/voxelization.cpp new file mode 100644 index 0000000..25cc2b5 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cpu/voxelization.cpp @@ -0,0 +1,170 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
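+// Overview of the kernels below: dynamic_voxelize_forward_cpu_kernel maps each
+// point to an integer voxel coordinate c_j = floor((p_j - coors_range_j) /
+// voxel_size_j), writing -1 for points that fall outside the grid; the
+// coordinate is stored in reversed (z, y, x) order via coor[NDim - 1 - j].
+// hard_voxelize_forward_cpu_kernel runs the same step first, then groups
+// points into at most max_voxels voxels with at most max_points points each,
+// using the dense coor_to_voxelidx map to deduplicate voxel coordinates.
+// As a rough example, with a voxel size of 0.1 along x and coors_range
+// starting at 0, a point at x = 0.27 lands in voxel floor(0.27 / 0.1) = 2
+// along that axis.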
+#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +template +void dynamic_voxelize_forward_cpu_kernel( + const torch::TensorAccessor points, + torch::TensorAccessor coors, const std::vector voxel_size, + const std::vector coors_range, const std::vector grid_size, + const int num_points, const int num_features, const int NDim) { + const int ndim_minus_1 = NDim - 1; + bool failed = false; + // int coor[NDim]; + int* coor = new int[NDim](); + int c; + + for (int i = 0; i < num_points; ++i) { + failed = false; + for (int j = 0; j < NDim; ++j) { + c = floor((points[i][j] - coors_range[j]) / voxel_size[j]); + // necessary to rm points out of range + if ((c < 0 || c >= grid_size[j])) { + failed = true; + break; + } + coor[ndim_minus_1 - j] = c; + } + + if (failed) + memset(&coors[i][0], -1, NDim * sizeof(T_int)); + else + memcpy(&coors[i][0], &coor[0], NDim * sizeof(T_int)); + } + + delete[] coor; +} + +template +void hard_voxelize_forward_cpu_kernel( + const torch::TensorAccessor points, + torch::TensorAccessor voxels, torch::TensorAccessor coors, + torch::TensorAccessor num_points_per_voxel, + torch::TensorAccessor coor_to_voxelidx, int& voxel_num, + const std::vector voxel_size, const std::vector coors_range, + const std::vector grid_size, const int max_points, + const int max_voxels, const int num_points, const int num_features, + const int NDim) { + // declare a temp coors + at::Tensor temp_coors = at::zeros( + {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU)); + + // First use dynamic voxelization to get coors, + // then check max points/voxels constraints + dynamic_voxelize_forward_cpu_kernel( + points, temp_coors.accessor(), voxel_size, coors_range, grid_size, + num_points, num_features, NDim); + + int voxelidx, num; + auto coor = temp_coors.accessor(); + + for (int i = 0; i < num_points; ++i) { + // T_int* coor = temp_coors.data_ptr() + i * NDim; + + if (coor[i][0] == -1) continue; + + voxelidx = coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]]; + + // record voxel + if (voxelidx == -1) { + voxelidx = voxel_num; + if (max_voxels != -1 && voxel_num >= max_voxels) continue; + voxel_num += 1; + + coor_to_voxelidx[coor[i][0]][coor[i][1]][coor[i][2]] = voxelidx; + memcpy(&coors[voxelidx][0], &coor[i][0], NDim * sizeof(T_int)); + } + + // put points into voxel + num = num_points_per_voxel[voxelidx]; + if (max_points == -1 || num < max_points) { + memcpy(&voxels[voxelidx][num][0], &points[i][0], + num_features * sizeof(T)); + num_points_per_voxel[voxelidx] += 1; + } + } + + return; +} + +void dynamic_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim = 3) { + // check device + AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); + + std::vector grid_size(NDim); + const int num_points = points.size(0); + const int num_features = points.size(1); + + for (int i = 0; i < NDim; ++i) { + grid_size[i] = + round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); + } + + // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + points.scalar_type(), "dynamic_voxelize_forward_cpu_kernel", [&] { + dynamic_voxelize_forward_cpu_kernel( + points.accessor(), coors.accessor(), + voxel_size, coors_range, grid_size, num_points, num_features, NDim); + }); +} + +int hard_voxelize_forward_cpu(const at::Tensor& points, at::Tensor& voxels, + at::Tensor& coors, + at::Tensor& num_points_per_voxel, + 
const std::vector voxel_size, + const std::vector coors_range, + const int max_points, const int max_voxels, + const int NDim = 3) { + // current version tooks about 0.02s_0.03s for one frame on cpu + // check device + AT_ASSERTM(points.device().is_cpu(), "points must be a CPU tensor"); + + std::vector grid_size(NDim); + const int num_points = points.size(0); + const int num_features = points.size(1); + + for (int i = 0; i < NDim; ++i) { + grid_size[i] = + round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]); + } + + // coors, num_points_per_voxel, coor_to_voxelidx are int Tensor + // printf("cpu coor_to_voxelidx size: [%d, %d, %d]\n", grid_size[2], + // grid_size[1], grid_size[0]); + at::Tensor coor_to_voxelidx = + -at::ones({grid_size[2], grid_size[1], grid_size[0]}, coors.options()); + + int voxel_num = 0; + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + points.scalar_type(), "hard_voxelize_forward_cpu_kernel", [&] { + hard_voxelize_forward_cpu_kernel( + points.accessor(), voxels.accessor(), + coors.accessor(), num_points_per_voxel.accessor(), + coor_to_voxelidx.accessor(), voxel_num, voxel_size, + coors_range, grid_size, max_points, max_voxels, num_points, + num_features, NDim); + }); + + return voxel_num; +} + +int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels, + at::Tensor& coors, + at::Tensor& num_points_per_voxel, + const std::vector voxel_size, + const std::vector coors_range, + const int max_points, const int max_voxels, + const int NDim); + +void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim); +REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CPU, + hard_voxelize_forward_cpu); +REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CPU, + dynamic_voxelize_forward_cpu); diff --git a/mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu new file mode 100644 index 0000000..c4e684b --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/assign_score_withk_cuda.cu @@ -0,0 +1,66 @@ +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/paconv_lib/src/gpu +#include +#include + +#include "assign_score_withk_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void AssignScoreWithKForwardCUDAKernelLauncher( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& points, const Tensor& centers, const Tensor& scores, + const Tensor& knn_idx, Tensor& output) { + at::cuda::CUDAGuard device_guard(points.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(DIVUP(B * O * N1 * K, THREADS_PER_BLOCK)); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + points.scalar_type(), "assign_score_withk_forward_cuda_kernel", [&] { + assign_score_withk_forward_cuda_kernel + <<>>( + B, N0, N1, M, K, O, aggregate, points.data_ptr(), + centers.data_ptr(), scores.data_ptr(), + knn_idx.data_ptr(), output.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void AssignScoreWithKBackwardCUDAKernelLauncher( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores) { + at::cuda::CUDAGuard device_guard(grad_out.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks1(DIVUP(B * M * O, THREADS_PER_BLOCK)); + 
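+  // Note: the backward pass launches two kernels with separate grids below.
+  // blocks1/threads1 cover the B * M * O elements of the gradients w.r.t.
+  // points and centers, while blocks2/threads2 cover the B * N1 * K * M
+  // elements of the gradients w.r.t. the per-neighbor scores.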
dim3 threads1(THREADS_PER_BLOCK); + dim3 blocks2(DIVUP(B * N1 * K * M, THREADS_PER_BLOCK)); + dim3 threads2(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_out.scalar_type(), "assign_score_withk_points_backward_cuda_kernel", + [&] { + assign_score_withk_points_backward_cuda_kernel + <<>>( + B, N0, N1, M, K, O, aggregate, grad_out.data_ptr(), + scores.data_ptr(), knn_idx.data_ptr(), + grad_points.data_ptr(), + grad_centers.data_ptr()); + }); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_out.scalar_type(), "assign_score_withk_scores_backward_cuda_kernel", + [&] { + assign_score_withk_scores_backward_cuda_kernel + <<>>( + B, N0, N1, M, K, O, aggregate, grad_out.data_ptr(), + points.data_ptr(), centers.data_ptr(), + knn_idx.data_ptr(), grad_scores.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu new file mode 100644 index 0000000..f5f5f39 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/ball_query_cuda.cu @@ -0,0 +1,38 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu + +#include +#include +#include + +#include "ball_query_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius, + float max_radius, int nsample, + const Tensor new_xyz, const Tensor xyz, + Tensor idx) { + // new_xyz: (B, M, 3) + // xyz: (B, N, 3) + // output: + // idx: (B, M, nsample) + + at::cuda::CUDAGuard device_guard(new_xyz.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + new_xyz.scalar_type(), "ball_query_forward_cuda_kernel", [&] { + ball_query_forward_cuda_kernel + <<>>( + b, n, m, min_radius, max_radius, nsample, + new_xyz.data_ptr(), xyz.data_ptr(), + idx.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu new file mode 100644 index 0000000..16679c7 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/bbox_overlaps_cuda.cu @@ -0,0 +1,23 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "bbox_overlaps_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, + Tensor ious, const int mode, + const bool aligned, const int offset) { + int output_size = ious.numel(); + int num_bbox1 = bboxes1.size(0); + int num_bbox2 = bboxes2.size(0); + + at::cuda::CUDAGuard device_guard(bboxes1.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + bboxes1.scalar_type(), "bbox_overlaps_cuda_kernel", ([&] { + bbox_overlaps_cuda_kernel + <<>>( + bboxes1.data_ptr(), bboxes2.data_ptr(), + ious.data_ptr(), num_bbox1, num_bbox2, mode, aligned, + offset); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu new file mode 100644 index 0000000..3aeefea --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/border_align_cuda.cu @@ -0,0 +1,68 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "border_align_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void BorderAlignForwardCUDAKernelLauncher(const Tensor &input, + const Tensor &boxes, Tensor output, + Tensor argmax_idx, + const int pool_size) { + // shape assertion + AT_ASSERTM(input.ndimension() == 4, + "non-empty 4D(batch mode) tensor expected for input feature"); + AT_ASSERTM(boxes.ndimension() == 3, + "boxes must be 3D tensor with size of [B, H*W, 4]"); + + int batch_size = input.size(0); + int feat_channels = input.size(1); + int channels = feat_channels / 4; + int height = input.size(2); + int width = input.size(3); + // shape [N, box_size, 4] for boxes. (x1, y1, x2, y2) format + int box_size = boxes.size(1); + // shape [N, channels, box_size, 4] for output + int nthreads = batch_size * channels * box_size; + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + dim3 block(128, 4); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "border_align_forward_cuda_kernel", [&] { + border_align_forward_cuda_kernel + <<>>( + nthreads, input.data_ptr(), + boxes.data_ptr(), output.data_ptr(), + argmax_idx.data_ptr(), channels, box_size, height, width, + pool_size); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void BorderAlignBackwardCUDAKernelLauncher(const Tensor &grad_output, + const Tensor &boxes, + const Tensor &argmax_idx, + Tensor grad_input, + const int pool_size) { + int batch_size = grad_input.size(0); + int feat_channels = grad_input.size(1); + int channels = feat_channels / 4; + int height = grad_input.size(2); + int width = grad_input.size(3); + int box_size = boxes.size(1); + int nthreads = batch_size * channels * box_size; + + at::cuda::CUDAGuard device_guard(grad_output.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + dim3 block(128, 4); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "border_align_backward_cuda_kernel", [&] { + border_align_backward_cuda_kernel + <<>>( + nthreads, grad_output.data_ptr(), + boxes.data_ptr(), argmax_idx.data_ptr(), + grad_input.data_ptr(), channels, box_size, height, + width, pool_size); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu new file mode 100644 index 0000000..3c13e06 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/box_iou_rotated_cuda.cu @@ -0,0 +1,25 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu +#include "box_iou_rotated_cuda.cuh" +#include "pytorch_cuda_helper.hpp" + +void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned) { + using scalar_t = float; + AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor"); + AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor"); + + int output_size = ious.numel(); + int num_boxes1 = boxes1.size(0); + int num_boxes2 = boxes2.size(0); + + at::cuda::CUDAGuard device_guard(boxes1.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + box_iou_rotated_cuda_kernel + <<>>( + num_boxes1, num_boxes2, boxes1.data_ptr(), + boxes2.data_ptr(), (scalar_t*)ious.data_ptr(), + mode_flag, aligned); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu new file mode 100644 index 0000000..984e734 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/carafe_cuda.cu @@ -0,0 +1,180 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "carafe_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks, + Tensor rfeatures, Tensor routput, + Tensor rmasks, Tensor output, + const int kernel_size, + const int group_size, + const int scale_factor) { + const int batch_size = output.size(0); + const int channels = output.size(1); + const int output_height = output.size(2); + const int output_width = output.size(3); + + const int input_height = features.size(2); + const int input_width = features.size(3); + + const int mask_channels = masks.size(1); + + rfeatures.resize_({batch_size, input_height, input_width, channels}); + routput.resize_({batch_size, output_height, output_width, channels}); + rmasks.resize_({batch_size, output_height, output_width, mask_channels}); + + // one warp per pixel + at::cuda::CUDAGuard device_guard(features.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + features.scalar_type(), "NCHW2NHWC_Feature", ([&] { + const scalar_t *bottom_data = features.data_ptr(); + scalar_t *top_data = rfeatures.data_ptr(); + const int dh = divideUP(channels, kTileDim); + const int dw = divideUP(input_height * input_width, kTileDim); + BatchTranspose2DCUDAKernel + <<>>( + batch_size, channels, input_height * input_width, dh, dw, + bottom_data, top_data); + })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + features.scalar_type(), "NCHW2NHWC_Masks", ([&] { + const scalar_t *bottom_data = masks.data_ptr(); + scalar_t *top_data = rmasks.data_ptr(); + const int dh = divideUP(mask_channels, kTileDim); + const int dw = divideUP(output_height * output_width, kTileDim); + BatchTranspose2DCUDAKernel + <<>>( + batch_size, mask_channels, output_height * output_width, dh, dw, + bottom_data, top_data); + })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + features.scalar_type(), "CARAFELaucherForward", ([&] { + const int num_kernels = + batch_size * output_height * output_width * THREADS_PER_PIXEL; + const scalar_t *bottom_data = rfeatures.data_ptr(); + const scalar_t *bottom_masks = rmasks.data_ptr(); + scalar_t *top_data = routput.data_ptr(); + + CARAFEForward<<>>( + num_kernels, bottom_data, bottom_masks, kernel_size, group_size, + scale_factor, channels, input_height, input_width, output_height, + output_width, 
mask_channels, top_data); + })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + features.scalar_type(), "NHWC2NCHW", ([&] { + const scalar_t *bottom_data = routput.data_ptr(); + scalar_t *top_data = output.data_ptr(); + const int dh = divideUP(output_height * output_width, kTileDim); + const int dw = divideUP(channels, kTileDim); + BatchTranspose2DCUDAKernel + <<>>( + batch_size, output_height * output_width, channels, dh, dw, + bottom_data, top_data); + })); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void CARAFEBackwardCUDAKernelLauncher( + const Tensor top_grad, const Tensor rfeatures, const Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad, + Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad, + const int kernel_size, const int group_size, const int scale_factor) { + const int batch_size = top_grad.size(0); + const int channels = top_grad.size(1); + const int output_height = top_grad.size(2); + const int output_width = top_grad.size(3); + + const int input_height = bottom_grad.size(2); + const int input_width = bottom_grad.size(3); + + const int mask_channels = masks.size(1); + + rtop_grad.resize_({batch_size, output_height, output_width, channels}); + rbottom_grad.resize_({batch_size, input_height, input_width, channels}); + rbottom_grad_hs.resize_({batch_size, output_height, output_width, channels}); + rmask_grad.resize_({batch_size, output_height, output_width, mask_channels}); + + at::cuda::CUDAGuard device_guard(top_grad.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] { + const scalar_t *bottom_data = top_grad.data_ptr(); + scalar_t *top_data = rtop_grad.data_ptr(); + const int dh = divideUP(channels, kTileDim); + const int dw = divideUP(output_height * output_width, kTileDim); + BatchTranspose2DCUDAKernel + <<>>( + batch_size, channels, output_height * output_width, dh, dw, + bottom_data, top_data); + })); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] { + const int num_kernels = + batch_size * output_height * output_width * THREADS_PER_PIXEL; + const scalar_t *top_diff = rtop_grad.data_ptr(); + const scalar_t *bottom_masks = masks.data_ptr(); + scalar_t *bottom_diff = rbottom_grad_hs.data_ptr(); + + CARAFEBackward_Feature + <<>>(num_kernels, top_diff, bottom_masks, kernel_size, + group_size, scale_factor, channels, input_height, + input_width, output_height, output_width, + mask_channels, bottom_diff); + })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "FeatureSum", ([&] { + const int num_kernels = + batch_size * input_height * input_width * THREADS_PER_PIXEL; + const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr(); + scalar_t *bottom_diff = rbottom_grad.data_ptr(); + + FeatureSum + <<>>(num_kernels, bottom_diff_hs, scale_factor, channels, + input_height, input_width, bottom_diff); + })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] { + const scalar_t *bottom_data = rbottom_grad.data_ptr(); + scalar_t *top_data = bottom_grad.data_ptr(); + const int dh = divideUP(input_height * input_width, kTileDim); + const int dw = divideUP(channels, kTileDim); + BatchTranspose2DCUDAKernel + <<>>( + batch_size, input_height * input_width, channels, dh, dw, + bottom_data, top_data); + })); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] { + const int num_kernels = 
batch_size * output_height * output_width * + mask_channels * WARP_SIZE; + const scalar_t *top_diff = rtop_grad.data_ptr(); + const scalar_t *bottom_data = rfeatures.data_ptr(); + scalar_t *mask_diff = rmask_grad.data_ptr(); + + CARAFEBackward_Mask + <<>>(num_kernels, top_diff, bottom_data, kernel_size, + group_size, scale_factor, channels, input_height, + input_width, output_height, output_width, + mask_channels, mask_diff); + })); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] { + const scalar_t *bottom_data = rmask_grad.data_ptr(); + scalar_t *top_data = mask_grad.data_ptr(); + const int dh = divideUP(output_height * output_width, kTileDim); + const int dw = divideUP(mask_channels, kTileDim); + BatchTranspose2DCUDAKernel + <<>>( + batch_size, output_height * output_width, mask_channels, dh, dw, + bottom_data, top_data); + })); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu new file mode 100644 index 0000000..2fc5667 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/carafe_naive_cuda.cu @@ -0,0 +1,52 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "carafe_naive_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features, + const Tensor masks, Tensor output, + const int kernel_size, + const int group_size, + const int scale_factor) { + int output_size = output.numel(); + int channels = output.size(1); + int height = output.size(2); + int width = output.size(3); + + at::cuda::CUDAGuard device_guard(features.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + features.scalar_type(), "CARAFENAIVEForward", ([&] { + carafe_naive_forward_cuda_kernel + <<>>( + output_size, features.data_ptr(), + masks.data_ptr(), output.data_ptr(), + kernel_size, group_size, scale_factor, channels, height, width); + })); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void CARAFENAIVEBackwardCUDAKernelLauncher( + const Tensor top_grad, const Tensor features, const Tensor masks, + Tensor bottom_grad, Tensor mask_grad, const int kernel_size, + const int group_size, const int scale_factor) { + int output_size = top_grad.numel(); + int channels = top_grad.size(1); + int height = top_grad.size(2); + int width = top_grad.size(3); + + at::cuda::CUDAGuard device_guard(top_grad.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "CARAFENAIVEBackward", ([&] { + carafe_naive_backward_cuda_kernel + <<>>( + output_size, top_grad.data_ptr(), + features.data_ptr(), masks.data_ptr(), + bottom_grad.data_ptr(), + mask_grad.data_ptr(), kernel_size, group_size, + scale_factor, channels, height, width); + })); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu new file mode 100644 index 0000000..56d2e64 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/correlation_cuda.cu @@ -0,0 +1,93 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
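+// Note on the launchers in this file: the forward pass permutes both inputs to
+// channel-last (NHWC) memory order before launching one thread block per
+// output pixel over a (batch, oH, oW) grid, where
+//   oH = (iH + 2 * padH - ((kH - 1) * dilationH + 1)) / dH + 1
+// and oW is computed analogously; the channel-last layout presumably lets
+// neighboring threads read neighboring channels. The backward pass keeps the
+// original NCHW layout and loops over the batch on the host, launching one
+// kernel per sample for grad_input1 and one for grad_input2.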
+// Modified from +// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu +// Original licence: Under MIT License + +#include "correlation_cuda.cuh" +#include "pytorch_cuda_helper.hpp" + +void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2, + Tensor output, int kH, int kW, + int patchH, int patchW, int padH, + int padW, int dilationH, + int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW) { + const int batch_size = input1.size(0); + const int iH = input1.size(2); + const int iW = input1.size(3); + const int dilatedKH = (kH - 1) * dilationH + 1; + const int dilatedKW = (kW - 1) * dilationW + 1; + + const auto oH = (iH + 2 * padH - dilatedKH) / dH + 1; + const auto oW = (iW + 2 * padW - dilatedKW) / dW + 1; + + auto trInput1 = input1.permute({0, 2, 3, 1}).contiguous(); + auto trInput2 = input2.permute({0, 2, 3, 1}).contiguous(); + + const int threads = THREADS_FORWARD; + const dim3 blocks(batch_size, oH, oW); + + at::cuda::CUDAGuard device_guard(input1.device()); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input1.scalar_type(), "correlation_forward_cuda", ([&] { + TensorAcc4R trInput1_acc = + trInput1.packed_accessor32(); + TensorAcc4R trInput2_acc = + trInput2.packed_accessor32(); + TensorAcc5R output_acc = + output.packed_accessor32(); + + correlation_forward_cuda_kernel + <<>>( + trInput1_acc, trInput2_acc, output_acc, kH, kW, patchH, patchW, + padH, padW, dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW); + })); +} + +void CorrelationBackwardCUDAKernelLauncher( + Tensor grad_output, Tensor input1, Tensor input2, Tensor grad_input1, + Tensor grad_input2, int kH, int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW) { + const int batch_size = input1.size(0); + const int iH = input1.size(2); + const int iW = input1.size(3); + const int C = input1.size(1); + + const dim3 blocks(C, iH, iW); + const dim3 threads(THREADS_BACKWARD, THREADS_BACKWARD); + + at::cuda::CUDAGuard device_guard(input1.device()); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input1.scalar_type(), "correlation_backward_cuda", ([&] { + TensorAcc4R input1_acc = + input1.packed_accessor32(); + TensorAcc4R input2_acc = + input2.packed_accessor32(); + TensorAcc4R grad_input1_acc = + grad_input1.packed_accessor32(); + TensorAcc4R grad_input2_acc = + grad_input2.packed_accessor32(); + TensorAcc5R grad_output_acc = + grad_output.packed_accessor32(); + + for (int n = 0; n < batch_size; ++n) { + correlation_backward_cuda_kernel_input1 + <<>>( + grad_output_acc, input2_acc, grad_input1_acc, kH, kW, patchH, + patchW, padH, padW, dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW, n); + } + + for (int n = 0; n < batch_size; ++n) { + correlation_backward_cuda_kernel_input2 + <<>>( + grad_output_acc, input1_acc, grad_input2_acc, kH, kW, patchH, + patchW, padH, padW, dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW, n); + } + })); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp b/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp new file mode 100644 index 0000000..2e7a3f5 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/cudabind.cpp @@ -0,0 +1,1364 @@ +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void AssignScoreWithKForwardCUDAKernelLauncher( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& points, const Tensor& centers, const Tensor& scores, + 
const Tensor& knn_idx, Tensor& output); + +void AssignScoreWithKBackwardCUDAKernelLauncher( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores); + +void assign_score_withk_forward_cuda(int B, int N0, int N1, int M, int K, int O, + int aggregate, const Tensor& points, + const Tensor& centers, + const Tensor& scores, + const Tensor& knn_idx, Tensor& output) { + AssignScoreWithKForwardCUDAKernelLauncher( + B, N0, N1, M, K, O, aggregate, points, centers, scores, knn_idx, output); +}; + +void assign_score_withk_backward_cuda( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores) { + AssignScoreWithKBackwardCUDAKernelLauncher( + B, N0, N1, M, K, O, aggregate, grad_out, points, centers, scores, knn_idx, + grad_points, grad_centers, grad_scores); +}; + +void assign_score_withk_forward_impl(int B, int N0, int N1, int M, int K, int O, + int aggregate, const Tensor& points, + const Tensor& centers, + const Tensor& scores, + const Tensor& knn_idx, Tensor& output); + +void assign_score_withk_backward_impl( + int B, int N0, int N1, int M, int K, int O, int aggregate, + const Tensor& grad_out, const Tensor& points, const Tensor& centers, + const Tensor& scores, const Tensor& knn_idx, Tensor& grad_points, + Tensor& grad_centers, Tensor& grad_scores); + +REGISTER_DEVICE_IMPL(assign_score_withk_forward_impl, CUDA, + assign_score_withk_forward_cuda); +REGISTER_DEVICE_IMPL(assign_score_withk_backward_impl, CUDA, + assign_score_withk_backward_cuda); + +void BallQueryForwardCUDAKernelLauncher(int b, int n, int m, float min_radius, + float max_radius, int nsample, + const Tensor new_xyz, const Tensor xyz, + Tensor idx); + +void ball_query_forward_cuda(int b, int n, int m, float min_radius, + float max_radius, int nsample, + const Tensor new_xyz, const Tensor xyz, + Tensor idx) { + BallQueryForwardCUDAKernelLauncher(b, n, m, min_radius, max_radius, nsample, + new_xyz, xyz, idx); +}; + +void ball_query_forward_impl(int b, int n, int m, float min_radius, + float max_radius, int nsample, + const Tensor new_xyz, const Tensor xyz, + Tensor idx); +REGISTER_DEVICE_IMPL(ball_query_forward_impl, CUDA, ball_query_forward_cuda); + +void BBoxOverlapsCUDAKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, + Tensor ious, const int mode, + const bool aligned, const int offset); + +void bbox_overlaps_cuda(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset) { + BBoxOverlapsCUDAKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset); +} + +void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset); +REGISTER_DEVICE_IMPL(bbox_overlaps_impl, CUDA, bbox_overlaps_cuda); + +void BorderAlignForwardCUDAKernelLauncher(const Tensor& input, + const Tensor& boxes, Tensor output, + Tensor argmax_idx, + const int pool_size); + +void BorderAlignBackwardCUDAKernelLauncher(const Tensor& grad_output, + const Tensor& boxes, + const Tensor& argmax_idx, + Tensor grad_input, + const int pool_size); + +void border_align_forward_cuda(const Tensor& input, const Tensor& boxes, + Tensor output, Tensor argmax_idx, + const int 
pool_size) { + BorderAlignForwardCUDAKernelLauncher(input, boxes, output, argmax_idx, + pool_size); +} + +void border_align_backward_cuda(const Tensor& grad_output, const Tensor& boxes, + const Tensor& argmax_idx, Tensor grad_input, + const int pool_size) { + BorderAlignBackwardCUDAKernelLauncher(grad_output, boxes, argmax_idx, + grad_input, pool_size); +} + +void border_align_forward_impl(const Tensor& input, const Tensor& boxes, + Tensor output, Tensor argmax_idx, + const int pool_size); + +void border_align_backward_impl(const Tensor& grad_output, const Tensor& boxes, + const Tensor& argmax_idx, Tensor grad_input, + const int pool_size); + +REGISTER_DEVICE_IMPL(border_align_forward_impl, CUDA, + border_align_forward_cuda); +REGISTER_DEVICE_IMPL(border_align_backward_impl, CUDA, + border_align_backward_cuda); + +void box_iou_rotated_cuda(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned); + +void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned); +REGISTER_DEVICE_IMPL(box_iou_rotated_impl, CUDA, box_iou_rotated_cuda); + +void CARAFEForwardCUDAKernelLauncher(const Tensor features, const Tensor masks, + Tensor rfeatures, Tensor routput, + Tensor rmasks, Tensor output, + const int kernel_size, + const int group_size, + const int scale_factor); + +void CARAFEBackwardCUDAKernelLauncher( + const Tensor top_grad, const Tensor rfeatures, const Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, Tensor rbottom_grad, + Tensor rmask_grad, Tensor bottom_grad, Tensor mask_grad, + const int kernel_size, const int group_size, const int scale_factor); + +void carafe_forward_cuda(Tensor features, Tensor masks, Tensor rfeatures, + Tensor routput, Tensor rmasks, Tensor output, + int kernel_size, int group_size, int scale_factor) { + CARAFEForwardCUDAKernelLauncher(features, masks, rfeatures, routput, rmasks, + output, kernel_size, group_size, + scale_factor); +} + +void carafe_backward_cuda(Tensor top_grad, Tensor rfeatures, Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, + Tensor rbottom_grad, Tensor rmask_grad, + Tensor bottom_grad, Tensor mask_grad, int kernel_size, + int group_size, int scale_factor) { + CARAFEBackwardCUDAKernelLauncher(top_grad, rfeatures, masks, rtop_grad, + rbottom_grad_hs, rbottom_grad, rmask_grad, + bottom_grad, mask_grad, kernel_size, + group_size, scale_factor); +} + +void carafe_forward_impl(Tensor features, Tensor masks, Tensor rfeatures, + Tensor routput, Tensor rmasks, Tensor output, + int kernel_size, int group_size, int scale_factor); + +void carafe_backward_impl(Tensor top_grad, Tensor rfeatures, Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, + Tensor rbottom_grad, Tensor rmask_grad, + Tensor bottom_grad, Tensor mask_grad, int kernel_size, + int group_size, int scale_factor); + +REGISTER_DEVICE_IMPL(carafe_forward_impl, CUDA, carafe_forward_cuda); +REGISTER_DEVICE_IMPL(carafe_backward_impl, CUDA, carafe_backward_cuda); + +void CARAFENAIVEForwardCUDAKernelLauncher(const Tensor features, + const Tensor masks, Tensor output, + const int kernel_size, + const int group_size, + const int scale_factor); + +void CARAFENAIVEBackwardCUDAKernelLauncher( + const Tensor top_grad, const Tensor features, const Tensor masks, + Tensor bottom_grad, Tensor mask_grad, const int kernel_size, + const int group_size, const int scale_factor); + +void carafe_naive_forward_cuda(Tensor features, Tensor masks, Tensor output, + int kernel_size, int group_size, 
+ int scale_factor) { + CARAFENAIVEForwardCUDAKernelLauncher(features, masks, output, kernel_size, + group_size, scale_factor); +} + +void carafe_naive_backward_cuda(Tensor top_grad, Tensor features, Tensor masks, + Tensor bottom_grad, Tensor mask_grad, + int kernel_size, int group_size, + int scale_factor) { + CARAFENAIVEBackwardCUDAKernelLauncher(top_grad, features, masks, bottom_grad, + mask_grad, kernel_size, group_size, + scale_factor); +} +void carafe_naive_forward_impl(Tensor features, Tensor masks, Tensor output, + int kernel_size, int group_size, + int scale_factor); + +void carafe_naive_backward_impl(Tensor top_grad, Tensor features, Tensor masks, + Tensor bottom_grad, Tensor mask_grad, + int kernel_size, int group_size, + int scale_factor); + +REGISTER_DEVICE_IMPL(carafe_naive_forward_impl, CUDA, + carafe_naive_forward_cuda); +REGISTER_DEVICE_IMPL(carafe_naive_backward_impl, CUDA, + carafe_naive_backward_cuda); + +void CorrelationForwardCUDAKernelLauncher(Tensor input1, Tensor input2, + Tensor output, int kH, int kW, + int patchH, int patchW, int padH, + int padW, int dilationH, + int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW); + +void CorrelationBackwardCUDAKernelLauncher(Tensor grad_output, Tensor input1, + Tensor input2, Tensor grad_input1, + Tensor grad_input2, int kH, int kW, + int patchH, int patchW, int padH, + int padW, int dilationH, + int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW); + +void correlation_forward_cuda(Tensor input1, Tensor input2, Tensor output, + int kH, int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, + int dilation_patchH, int dilation_patchW, int dH, + int dW) { + CorrelationForwardCUDAKernelLauncher( + input1, input2, output, kH, kW, patchH, patchW, padH, padW, dilationH, + dilationW, dilation_patchH, dilation_patchW, dH, dW); +} + +void correlation_backward_cuda(Tensor grad_output, Tensor input1, Tensor input2, + Tensor grad_input1, Tensor grad_input2, int kH, + int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, + int dilation_patchH, int dilation_patchW, int dH, + int dW) { + CorrelationBackwardCUDAKernelLauncher( + grad_output, input1, input2, grad_input1, grad_input2, kH, kW, patchH, + patchW, padH, padW, dilationH, dilationW, dilation_patchH, + dilation_patchW, dH, dW); +} + +void correlation_forward_impl(Tensor input1, Tensor input2, Tensor output, + int kH, int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, + int dilation_patchH, int dilation_patchW, int dH, + int dW); + +void correlation_backward_impl(Tensor grad_output, Tensor input1, Tensor input2, + Tensor grad_input1, Tensor grad_input2, int kH, + int kW, int patchH, int patchW, int padH, + int padW, int dilationH, int dilationW, + int dilation_patchH, int dilation_patchW, int dH, + int dW); + +REGISTER_DEVICE_IMPL(correlation_forward_impl, CUDA, correlation_forward_cuda); +REGISTER_DEVICE_IMPL(correlation_backward_impl, CUDA, + correlation_backward_cuda); + +void deformable_im2col_cuda(Tensor data_im, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor data_col); + +void deformable_col2im_cuda(Tensor data_col, Tensor data_offset, + const int channels, const int height, + const int 
width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor grad_im); + +void deformable_col2im_coord_cuda( + Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, Tensor grad_offset); + +void deformable_im2col_impl(Tensor data_im, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor data_col); + +void deformable_col2im_impl(Tensor data_col, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor grad_im); + +void deformable_col2im_coord_impl( + Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, Tensor grad_offset); + +REGISTER_DEVICE_IMPL(deformable_im2col_impl, CUDA, deformable_im2col_cuda); +REGISTER_DEVICE_IMPL(deformable_col2im_impl, CUDA, deformable_col2im_cuda); +REGISTER_DEVICE_IMPL(deformable_col2im_coord_impl, CUDA, + deformable_col2im_coord_cuda); + +void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, + Tensor offset, Tensor output, + int pooled_height, int pooled_width, + float spatial_scale, + int sampling_ratio, float gamma); + +void DeformRoIPoolBackwardCUDAKernelLauncher( + Tensor grad_output, Tensor input, Tensor rois, Tensor offset, + Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width, + float spatial_scale, int sampling_ratio, float gamma); + +void deform_roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor offset, + Tensor output, int pooled_height, + int pooled_width, float spatial_scale, + int sampling_ratio, float gamma) { + DeformRoIPoolForwardCUDAKernelLauncher(input, rois, offset, output, + pooled_height, pooled_width, + spatial_scale, sampling_ratio, gamma); +} + +void deform_roi_pool_backward_cuda(Tensor grad_output, Tensor input, + Tensor rois, Tensor offset, + Tensor grad_input, Tensor grad_offset, + int pooled_height, int pooled_width, + float spatial_scale, int sampling_ratio, + float gamma) { + DeformRoIPoolBackwardCUDAKernelLauncher( + grad_output, input, rois, offset, grad_input, grad_offset, pooled_height, + pooled_width, spatial_scale, sampling_ratio, gamma); +} + +void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset, + Tensor output, int pooled_height, + int pooled_width, float spatial_scale, + int sampling_ratio, float gamma); + +void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input, + Tensor rois, Tensor offset, + Tensor grad_input, Tensor grad_offset, + int pooled_height, 
int pooled_width, + float spatial_scale, int sampling_ratio, + float gamma); + +REGISTER_DEVICE_IMPL(deform_roi_pool_forward_impl, CUDA, + deform_roi_pool_forward_cuda); +REGISTER_DEVICE_IMPL(deform_roi_pool_backward_impl, CUDA, + deform_roi_pool_backward_cuda); + +void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target, + Tensor weight, Tensor output, + const float gamma, + const float alpha); + +void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target, + Tensor weight, + Tensor grad_input, + const float gamma, + const float alpha); + +void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target, + Tensor weight, Tensor output, + const float gamma, + const float alpha); + +void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target, + Tensor weight, Tensor buff, + Tensor grad_input, + const float gamma, + const float alpha); + +void sigmoid_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + SigmoidFocalLossForwardCUDAKernelLauncher(input, target, weight, output, + gamma, alpha); +} + +void sigmoid_focal_loss_backward_cuda(Tensor input, Tensor target, + Tensor weight, Tensor grad_input, + float gamma, float alpha) { + SigmoidFocalLossBackwardCUDAKernelLauncher(input, target, weight, grad_input, + gamma, alpha); +} + +void softmax_focal_loss_forward_cuda(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + SoftmaxFocalLossForwardCUDAKernelLauncher(input, target, weight, output, + gamma, alpha); +} + +void softmax_focal_loss_backward_cuda(Tensor input, Tensor target, + Tensor weight, Tensor buff, + Tensor grad_input, float gamma, + float alpha) { + SoftmaxFocalLossBackwardCUDAKernelLauncher(input, target, weight, buff, + grad_input, gamma, alpha); +} + +void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha); + +void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target, + Tensor weight, Tensor grad_input, + float gamma, float alpha); + +void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha); + +void softmax_focal_loss_backward_impl(Tensor input, Tensor target, + Tensor weight, Tensor buff, + Tensor grad_input, float gamma, + float alpha); + +REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, CUDA, + sigmoid_focal_loss_forward_cuda); +REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, CUDA, + sigmoid_focal_loss_backward_cuda); +REGISTER_DEVICE_IMPL(softmax_focal_loss_forward_impl, CUDA, + softmax_focal_loss_forward_cuda); +REGISTER_DEVICE_IMPL(softmax_focal_loss_backward_impl, CUDA, + softmax_focal_loss_backward_cuda); + +void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, + const float* dataset, + float* temp, int* idxs); + +void FurthestPointSamplingWithDistForwardCUDAKernelLauncher( + int b, int n, int m, const float* dataset, float* temp, int* idxs); + +void furthest_point_sampling_forward_cuda(Tensor points_tensor, + Tensor temp_tensor, Tensor idx_tensor, + int b, int n, int m) { + const float* dataset = points_tensor.data_ptr(); + float* temp = temp_tensor.data_ptr(); + int* idxs = idx_tensor.data_ptr(); + FurthestPointSamplingForwardCUDAKernelLauncher(b, n, m, dataset, temp, idxs); +} + +void furthest_point_sampling_with_dist_forward_cuda(Tensor points_tensor, + Tensor temp_tensor, + Tensor idx_tensor, int b, + int n, int m) { + const float* 
dataset = points_tensor.data_ptr(); + float* temp = temp_tensor.data_ptr(); + int* idxs = idx_tensor.data_ptr(); + FurthestPointSamplingWithDistForwardCUDAKernelLauncher(b, n, m, dataset, temp, + idxs); +} + +void furthest_point_sampling_forward_impl(Tensor points_tensor, + Tensor temp_tensor, Tensor idx_tensor, + int b, int n, int m); + +void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor, + Tensor temp_tensor, + Tensor idx_tensor, int b, + int n, int m); + +REGISTER_DEVICE_IMPL(furthest_point_sampling_forward_impl, CUDA, + furthest_point_sampling_forward_cuda); +REGISTER_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, CUDA, + furthest_point_sampling_with_dist_forward_cuda); + +torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input, + const torch::Tensor& bias, + const torch::Tensor& refer, int act, + int grad, float alpha, float scale); + +torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input, + const torch::Tensor& bias, + const torch::Tensor& refer, int act, + int grad, float alpha, float scale); +REGISTER_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, CUDA, + fused_bias_leakyrelu_op); + +void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints, + const Tensor points, + const Tensor idx, Tensor out); + +void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints, + const Tensor grad_out, + const Tensor idx, + Tensor grad_points); + +void gather_points_forward_cuda(int b, int c, int n, int npoints, + const Tensor points, const Tensor idx, + Tensor out) { + GatherPointsForwardCUDAKernelLauncher(b, c, n, npoints, points, idx, out); +}; + +void gather_points_backward_cuda(int b, int c, int n, int npoints, + const Tensor grad_out, const Tensor idx, + Tensor grad_points) { + GatherPointsBackwardCUDAKernelLauncher(b, c, n, npoints, grad_out, idx, + grad_points); +}; + +void gather_points_forward_impl(int b, int c, int n, int npoints, + const Tensor points, const Tensor idx, + Tensor out); + +void gather_points_backward_impl(int b, int c, int n, int npoints, + const Tensor grad_out, const Tensor idx, + Tensor grad_points); + +REGISTER_DEVICE_IMPL(gather_points_forward_impl, CUDA, + gather_points_forward_cuda); +REGISTER_DEVICE_IMPL(gather_points_backward_impl, CUDA, + gather_points_backward_cuda); + +void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints, + int nsample, const Tensor points, + const Tensor idx, Tensor out); + +void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints, + int nsample, const Tensor grad_out, + const Tensor idx, + Tensor grad_points); + +void group_points_forward_cuda(int b, int c, int n, int npoints, int nsample, + const Tensor points, const Tensor idx, + Tensor out) { + GroupPointsForwardCUDAKernelLauncher(b, c, n, npoints, nsample, points, idx, + out); +}; + +void group_points_backward_cuda(int b, int c, int n, int npoints, int nsample, + const Tensor grad_out, const Tensor idx, + Tensor grad_points) { + GroupPointsBackwardCUDAKernelLauncher(b, c, n, npoints, nsample, grad_out, + idx, grad_points); +}; + +void group_points_forward_impl(int b, int c, int n, int npoints, int nsample, + const Tensor points, const Tensor idx, + Tensor out); + +void group_points_backward_impl(int b, int c, int n, int npoints, int nsample, + const Tensor grad_out, const Tensor idx, + Tensor grad_points); + +REGISTER_DEVICE_IMPL(group_points_forward_impl, CUDA, + group_points_forward_cuda); +REGISTER_DEVICE_IMPL(group_points_backward_impl, CUDA, + 
group_points_backward_cuda); + +void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a, + const Tensor boxes_a, + const int num_b, + const Tensor boxes_b, + Tensor ans_overlap); + +void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a, + const Tensor boxes_a, + const int num_b, + const Tensor boxes_b, + Tensor ans_iou); + +void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes, + unsigned long long* mask, int boxes_num, + float nms_overlap_thresh); + +void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes, + unsigned long long* mask, + int boxes_num, + float nms_overlap_thresh); + +void iou3d_boxes_overlap_bev_forward_cuda(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_overlap) { + IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b, + ans_overlap); +}; + +void iou3d_boxes_iou_bev_forward_cuda(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_iou) { + IoU3DBoxesIoUBevForwardCUDAKernelLauncher(num_a, boxes_a, num_b, boxes_b, + ans_iou); +}; + +void iou3d_nms_forward_cuda(const Tensor boxes, unsigned long long* mask, + int boxes_num, float nms_overlap_thresh) { + IoU3DNMSForwardCUDAKernelLauncher(boxes, mask, boxes_num, nms_overlap_thresh); +}; + +void iou3d_nms_normal_forward_cuda(const Tensor boxes, unsigned long long* mask, + int boxes_num, float nms_overlap_thresh) { + IoU3DNMSNormalForwardCUDAKernelLauncher(boxes, mask, boxes_num, + nms_overlap_thresh); +}; + +void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_overlap); + +void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_iou); + +void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long* mask, + int boxes_num, float nms_overlap_thresh); + +void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long* mask, + int boxes_num, float nms_overlap_thresh); + +REGISTER_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, CUDA, + iou3d_boxes_overlap_bev_forward_cuda); +REGISTER_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, CUDA, + iou3d_boxes_iou_bev_forward_cuda); +REGISTER_DEVICE_IMPL(iou3d_nms_forward_impl, CUDA, iou3d_nms_forward_cuda); +REGISTER_DEVICE_IMPL(iou3d_nms_normal_forward_impl, CUDA, + iou3d_nms_normal_forward_cuda); + +void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample, + const Tensor xyz, const Tensor new_xyz, + Tensor idx, Tensor dist2); + +void knn_forward_cuda(int b, int n, int m, int nsample, const Tensor xyz, + const Tensor new_xyz, Tensor idx, Tensor dist2) { + KNNForwardCUDAKernelLauncher(b, n, m, nsample, xyz, new_xyz, idx, dist2); +} + +void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz, + const Tensor new_xyz, Tensor idx, Tensor dist2); +REGISTER_DEVICE_IMPL(knn_forward_impl, CUDA, knn_forward_cuda); + +void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data, + const Tensor mask_h_idx, + const Tensor mask_w_idx, + Tensor top_data, const int kernel_h, + const int kernel_w, const int pad_h, + const int pad_w); + +void MaskedCol2imForwardCUDAKernelLauncher(const Tensor bottom_data, + const Tensor mask_h_idx, + const Tensor mask_w_idx, + Tensor top_data, const int height, + const int width, const int channels); + +void masked_im2col_forward_cuda(const Tensor im, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor col, + const int kernel_h, 
const int kernel_w, + const int pad_h, const int pad_w) { + // im: (n, ic, h, w), kernel size (kh, kw) + // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh) + MaskedIm2colForwardCUDAKernelLauncher(im, mask_h_idx, mask_w_idx, col, + kernel_h, kernel_w, pad_h, pad_w); +} + +void masked_col2im_forward_cuda(const Tensor col, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor im, int height, + int width, int channels) { + // im: (n, ic, h, w), kernel size (kh, kw) + // kernel: (oc, ic * kh * kh), col: (kh * kw * ic, ow * oh) + MaskedCol2imForwardCUDAKernelLauncher(col, mask_h_idx, mask_w_idx, im, height, + width, channels); +} + +void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor col, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w); + +void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor im, int height, + int width, int channels); + +REGISTER_DEVICE_IMPL(masked_im2col_forward_impl, CUDA, + masked_im2col_forward_cuda); +REGISTER_DEVICE_IMPL(masked_col2im_forward_impl, CUDA, + masked_col2im_forward_cuda); + +void modulated_deformable_im2col_cuda( + const Tensor data_im, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor data_col); + +void modulated_deformable_col2im_cuda( + const Tensor data_col, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const Tensor data_col, const Tensor data_im, const Tensor data_offset, + const Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + Tensor grad_offset, Tensor grad_mask); + +void modulated_deformable_im2col_impl( + const Tensor data_im, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor data_col); + +void modulated_deformable_col2im_impl( + const Tensor data_col, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor grad_im); + +void modulated_deformable_col2im_coord_impl( + 
const Tensor data_col, const Tensor data_im, const Tensor data_offset, + const Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + Tensor grad_offset, Tensor grad_mask); + +REGISTER_DEVICE_IMPL(modulated_deformable_im2col_impl, CUDA, + modulated_deformable_im2col_cuda); +REGISTER_DEVICE_IMPL(modulated_deformable_col2im_impl, CUDA, + modulated_deformable_col2im_cuda); +REGISTER_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, CUDA, + modulated_deformable_col2im_coord_cuda); + +Tensor ms_deform_attn_cuda_forward(const Tensor& value, + const Tensor& spatial_shapes, + const Tensor& level_start_index, + const Tensor& sampling_loc, + const Tensor& attn_weight, + const int im2col_step); + +void ms_deform_attn_cuda_backward( + const Tensor& value, const Tensor& spatial_shapes, + const Tensor& level_start_index, const Tensor& sampling_loc, + const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value, + Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step); + +Tensor ms_deform_attn_impl_forward(const Tensor& value, + const Tensor& spatial_shapes, + const Tensor& level_start_index, + const Tensor& sampling_loc, + const Tensor& attn_weight, + const int im2col_step); + +void ms_deform_attn_impl_backward( + const Tensor& value, const Tensor& spatial_shapes, + const Tensor& level_start_index, const Tensor& sampling_loc, + const Tensor& attn_weight, const Tensor& grad_output, Tensor& grad_value, + Tensor& grad_sampling_loc, Tensor& grad_attn_weight, const int im2col_step); + +REGISTER_DEVICE_IMPL(ms_deform_attn_impl_forward, CUDA, + ms_deform_attn_cuda_forward); +REGISTER_DEVICE_IMPL(ms_deform_attn_impl_backward, CUDA, + ms_deform_attn_cuda_backward); + +Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, + int offset); + +Tensor nms_cuda(Tensor boxes, Tensor scores, float iou_threshold, int offset) { + return NMSCUDAKernelLauncher(boxes, scores, iou_threshold, offset); +} + +Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset); +REGISTER_DEVICE_IMPL(nms_impl, CUDA, nms_cuda); + +void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points); + +void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points); + +void points_in_boxes_part_forward_cuda(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + PointsInBoxesPartForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, + boxes, pts, box_idx_of_points); +}; + +void points_in_boxes_all_forward_cuda(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + PointsInBoxesAllForwardCUDAKernelLauncher(batch_size, boxes_num, pts_num, + boxes, pts, box_idx_of_points); +}; + +void points_in_boxes_part_forward_impl(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points); + +void points_in_boxes_all_forward_impl(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor 
box_idx_of_points); +REGISTER_DEVICE_IMPL(points_in_boxes_part_forward_impl, CUDA, + points_in_boxes_part_forward_cuda); +REGISTER_DEVICE_IMPL(points_in_boxes_all_forward_impl, CUDA, + points_in_boxes_all_forward_cuda); + +void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input, + Tensor output, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, + const int half_w_mask); + +void PSAMaskBackwardCUDAKernelLauncher( + const int psa_type, const Tensor grad_output, Tensor grad_input, + const int num_, const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, const int half_w_mask); + +void psamask_forward_cuda(const int psa_type, const Tensor input, Tensor output, + const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask) { + PSAMaskForwardCUDAKernelLauncher(psa_type, input, output, num_, h_feature, + w_feature, h_mask, w_mask, half_h_mask, + half_w_mask); +} + +void psamask_backward_cuda(const int psa_type, const Tensor grad_output, + Tensor grad_input, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask) { + PSAMaskBackwardCUDAKernelLauncher(psa_type, grad_output, grad_input, num_, + h_feature, w_feature, h_mask, w_mask, + half_h_mask, half_w_mask); +} + +void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output, + const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask); + +void psamask_backward_impl(const int psa_type, const Tensor grad_output, + Tensor grad_input, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask); +REGISTER_DEVICE_IMPL(psamask_forward_impl, CUDA, psamask_forward_cuda); +REGISTER_DEVICE_IMPL(psamask_backward_impl, CUDA, psamask_backward_cuda); + +void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois, + Tensor argmax_y, Tensor argmax_x, + Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, + bool aligned); + +void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + ROIAlignForwardCUDAKernelLauncher( + input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, + spatial_scale, sampling_ratio, pool_mode, aligned); +} + +void roi_align_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + ROIAlignBackwardCUDAKernelLauncher( + grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height, + aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); +} + +void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int 
aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned); + +REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda); +REGISTER_DEVICE_IMPL(roi_align_backward_impl, CUDA, roi_align_backward_cuda); + +void ROIAlignRotatedForwardCUDAKernelLauncher( + const at::Tensor features, const at::Tensor rois, const float spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, const int num_rois, + const int pooled_height, const int pooled_width, at::Tensor output); + +void ROIAlignRotatedBackwardCUDAKernelLauncher( + const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, const int num_rois, + const int pooled_height, const int pooled_width, at::Tensor bottom_grad); + +void roi_align_rotated_forward_cuda(Tensor features, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sample_ratio, + bool aligned, bool clockwise) { + // Number of ROIs + int num_rois = rois.size(0); + int size_rois = rois.size(1); + + if (size_rois != 6) { + AT_ERROR("wrong roi size"); + } + + int num_channels = features.size(1); + int data_height = features.size(2); + int data_width = features.size(3); + ROIAlignRotatedForwardCUDAKernelLauncher( + features, rois, spatial_scale, sample_ratio, aligned, clockwise, + num_channels, data_height, data_width, num_rois, aligned_height, + aligned_width, output); +} + +void roi_align_rotated_backward_cuda(Tensor top_grad, Tensor rois, + Tensor bottom_grad, int aligned_height, + int aligned_width, float spatial_scale, + int sample_ratio, bool aligned, + bool clockwise) { + // Number of ROIs + int num_rois = rois.size(0); + int size_rois = rois.size(1); + if (size_rois != 6) { + AT_ERROR("wrong roi size"); + } + + int num_channels = bottom_grad.size(1); + int data_height = bottom_grad.size(2); + int data_width = bottom_grad.size(3); + ROIAlignRotatedBackwardCUDAKernelLauncher( + top_grad, rois, spatial_scale, sample_ratio, aligned, clockwise, + num_channels, data_height, data_width, num_rois, aligned_height, + aligned_width, bottom_grad); +} + +void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sample_ratio, + bool aligned, bool clockwise); + +void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois, + Tensor bottom_grad, int aligned_height, + int aligned_width, float spatial_scale, + int sample_ratio, bool aligned, + bool clockwise); +REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, CUDA, + roi_align_rotated_forward_cuda); +REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, CUDA, + roi_align_rotated_backward_cuda); + +void RoiawarePool3dForwardCUDAKernelLauncher( + int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, + int out_y, int out_z, const Tensor rois, const Tensor pts, + const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method); + +void RoiawarePool3dBackwardCUDAKernelLauncher( + int boxes_num, int out_x, int out_y, int out_z, int channels, + int 
max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax, + const Tensor grad_out, Tensor grad_in, int pool_method); + +void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const Tensor rois, + const Tensor pts, const Tensor pts_feature, + Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method) { + RoiawarePool3dForwardCUDAKernelLauncher( + boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, + rois, pts, pts_feature, argmax, pts_idx_of_voxels, pooled_features, + pool_method); +}; + +void roiaware_pool3d_backward_cuda(int boxes_num, int out_x, int out_y, + int out_z, int channels, + int max_pts_each_voxel, + const Tensor pts_idx_of_voxels, + const Tensor argmax, const Tensor grad_out, + Tensor grad_in, int pool_method) { + RoiawarePool3dBackwardCUDAKernelLauncher( + boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel, + pts_idx_of_voxels, argmax, grad_out, grad_in, pool_method); +}; + +void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const Tensor rois, + const Tensor pts, const Tensor pts_feature, + Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method); + +void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y, + int out_z, int channels, + int max_pts_each_voxel, + const Tensor pts_idx_of_voxels, + const Tensor argmax, const Tensor grad_out, + Tensor grad_in, int pool_method); + +REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, CUDA, + roiaware_pool3d_forward_cuda); +REGISTER_DEVICE_IMPL(roiaware_pool3d_backward_impl, CUDA, + roiaware_pool3d_backward_cuda); + +void RoIPointPool3dForwardCUDAKernelLauncher( + int batch_size, int pts_num, int boxes_num, int feature_in_len, + int sampled_pts_num, const Tensor xyz, const Tensor boxes3d, + const Tensor pts_feature, Tensor pooled_features, Tensor pooled_empty_flag); + +void roipoint_pool3d_forward_cuda(int batch_size, int pts_num, int boxes_num, + int feature_in_len, int sampled_pts_num, + const Tensor xyz, const Tensor boxes3d, + const Tensor pts_feature, + Tensor pooled_features, + Tensor pooled_empty_flag) { + RoIPointPool3dForwardCUDAKernelLauncher( + batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, xyz, + boxes3d, pts_feature, pooled_features, pooled_empty_flag); +}; + +void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num, + int feature_in_len, int sampled_pts_num, + const Tensor xyz, const Tensor boxes3d, + const Tensor pts_feature, + Tensor pooled_features, + Tensor pooled_empty_flag); +REGISTER_DEVICE_IMPL(roipoint_pool3d_forward_impl, CUDA, + roipoint_pool3d_forward_cuda); + +void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, + Tensor argmax, int pooled_height, + int pooled_width, float spatial_scale); + +void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois, + Tensor argmax, Tensor grad_input, + int pooled_height, int pooled_width, + float spatial_scale); + +void roi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output, + Tensor argmax, int pooled_height, int pooled_width, + float spatial_scale) { + ROIPoolForwardCUDAKernelLauncher(input, rois, output, argmax, pooled_height, + pooled_width, spatial_scale); +} + +void roi_pool_backward_cuda(Tensor grad_output, Tensor rois, Tensor argmax, + Tensor grad_input, int pooled_height, + int pooled_width, float 
spatial_scale) { + ROIPoolBackwardCUDAKernelLauncher(grad_output, rois, argmax, grad_input, + pooled_height, pooled_width, spatial_scale); +} + +void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output, + Tensor argmax, int pooled_height, int pooled_width, + float spatial_scale); +void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax, + Tensor grad_input, int pooled_height, + int pooled_width, float spatial_scale); +REGISTER_DEVICE_IMPL(roi_pool_forward_impl, CUDA, roi_pool_forward_cuda); +REGISTER_DEVICE_IMPL(roi_pool_backward_impl, CUDA, roi_pool_backward_cuda); + +typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; + +std::vector DynamicPointToVoxelForwardCUDAKernelLauncher( + const at::Tensor& feats, const at::Tensor& coors, + const reduce_t reduce_type); + +void DynamicPointToVoxelBackwardCUDAKernelLauncher( + at::Tensor& grad_feats, const at::Tensor& grad_reduced_feats, + const at::Tensor& feats, const at::Tensor& reduced_feats, + const at::Tensor& coors_map, const at::Tensor& reduce_count, + const reduce_t reduce_type); + +std::vector dynamic_point_to_voxel_forward_cuda( + const torch::Tensor& feats, const torch::Tensor& coors, + const reduce_t reduce_type) { + return DynamicPointToVoxelForwardCUDAKernelLauncher(feats, coors, + reduce_type); +}; + +void dynamic_point_to_voxel_backward_cuda( + torch::Tensor& grad_feats, const torch::Tensor& grad_reduced_feats, + const torch::Tensor& feats, const torch::Tensor& reduced_feats, + const torch::Tensor& coors_idx, const torch::Tensor& reduce_count, + const reduce_t reduce_type) { + DynamicPointToVoxelBackwardCUDAKernelLauncher(grad_feats, grad_reduced_feats, + feats, reduced_feats, coors_idx, + reduce_count, reduce_type); +}; + +std::vector dynamic_point_to_voxel_forward_impl( + const torch::Tensor& feats, const torch::Tensor& coors, + const reduce_t reduce_type); + +void dynamic_point_to_voxel_backward_impl( + torch::Tensor& grad_feats, const torch::Tensor& grad_reduced_feats, + const torch::Tensor& feats, const torch::Tensor& reduced_feats, + const torch::Tensor& coors_idx, const torch::Tensor& reduce_count, + const reduce_t reduce_type); + +REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, CUDA, + dynamic_point_to_voxel_forward_cuda); +REGISTER_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, CUDA, + dynamic_point_to_voxel_backward_cuda); + +void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean); + +void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean, + Tensor var); + +void SyncBNForwardOutputCUDAKernelLauncher( + const Tensor input, const Tensor mean, const Tensor var, + Tensor running_mean, Tensor running_var, const Tensor weight, + const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps, + float momentum, int group_size); + +void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output, + const Tensor norm, + Tensor grad_weight, + Tensor grad_bias); + +void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output, + const Tensor weight, + const Tensor grad_weight, + const Tensor grad_bias, + const Tensor norm, const Tensor std, + Tensor grad_input); + +void sync_bn_forward_mean_cuda(const Tensor input, Tensor mean) { + SyncBNForwardMeanCUDAKernelLauncher(input, mean); +} + +void sync_bn_forward_var_cuda(const Tensor input, const Tensor mean, + Tensor var) { + SyncBNForwardVarCUDAKernelLauncher(input, mean, var); +} + +void sync_bn_forward_output_cuda(const Tensor input, const Tensor mean, + const Tensor var, Tensor 
running_mean, + Tensor running_var, const Tensor weight, + const Tensor bias, Tensor norm, Tensor std, + Tensor output, float eps, float momentum, + int group_size) { + SyncBNForwardOutputCUDAKernelLauncher(input, mean, var, running_mean, + running_var, weight, bias, norm, std, + output, eps, momentum, group_size); +} + +void sync_bn_backward_param_cuda(const Tensor grad_output, const Tensor norm, + Tensor grad_weight, Tensor grad_bias) { + SyncBNBackwardParamCUDAKernelLauncher(grad_output, norm, grad_weight, + grad_bias); +} + +void sync_bn_backward_data_cuda(const Tensor grad_output, const Tensor weight, + const Tensor grad_weight, + const Tensor grad_bias, const Tensor norm, + const Tensor std, Tensor grad_input) { + SyncBNBackwardDataCUDAKernelLauncher(grad_output, weight, grad_weight, + grad_bias, norm, std, grad_input); +} + +void sync_bn_forward_mean_impl(const Tensor input, Tensor mean); + +void sync_bn_forward_var_impl(const Tensor input, const Tensor mean, + Tensor var); + +void sync_bn_forward_output_impl(const Tensor input, const Tensor mean, + const Tensor var, Tensor running_mean, + Tensor running_var, const Tensor weight, + const Tensor bias, Tensor norm, Tensor std, + Tensor output, float eps, float momentum, + int group_size); + +void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm, + Tensor grad_weight, Tensor grad_bias); + +void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight, + const Tensor grad_weight, + const Tensor grad_bias, const Tensor norm, + const Tensor std, Tensor grad_input); + +REGISTER_DEVICE_IMPL(sync_bn_forward_mean_impl, CUDA, + sync_bn_forward_mean_cuda); +REGISTER_DEVICE_IMPL(sync_bn_forward_var_impl, CUDA, sync_bn_forward_var_cuda); +REGISTER_DEVICE_IMPL(sync_bn_forward_output_impl, CUDA, + sync_bn_forward_output_cuda); +REGISTER_DEVICE_IMPL(sync_bn_backward_param_impl, CUDA, + sync_bn_backward_param_cuda); +REGISTER_DEVICE_IMPL(sync_bn_backward_data_impl, CUDA, + sync_bn_backward_data_cuda); + +void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n, + const Tensor points, + const Tensor idx, + const Tensor weight, Tensor out); + +void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m, + const Tensor grad_out, + const Tensor idx, + const Tensor weight, + Tensor grad_points); + +void three_interpolate_forward_cuda(int b, int c, int m, int n, + const Tensor points, const Tensor idx, + const Tensor weight, Tensor out) { + ThreeInterpolateForwardCUDAKernelLauncher(b, c, m, n, points, idx, weight, + out); +}; + +void three_interpolate_backward_cuda(int b, int c, int n, int m, + const Tensor grad_out, const Tensor idx, + const Tensor weight, Tensor grad_points) { + ThreeInterpolateBackwardCUDAKernelLauncher(b, c, n, m, grad_out, idx, weight, + grad_points); +}; + +void three_interpolate_forward_impl(int b, int c, int m, int n, + const Tensor points, const Tensor idx, + const Tensor weight, Tensor out); + +void three_interpolate_backward_impl(int b, int c, int n, int m, + const Tensor grad_out, const Tensor idx, + const Tensor weight, Tensor grad_points); +REGISTER_DEVICE_IMPL(three_interpolate_forward_impl, CUDA, + three_interpolate_forward_cuda); +REGISTER_DEVICE_IMPL(three_interpolate_backward_impl, CUDA, + three_interpolate_backward_cuda); + +void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown, + const Tensor known, Tensor dist2, + Tensor idx); + +void three_nn_forward_cuda(int b, int n, int m, const Tensor unknown, + const Tensor known, 
Tensor dist2, Tensor idx) { + ThreeNNForwardCUDAKernelLauncher(b, n, m, unknown, known, dist2, idx); +}; + +void three_nn_forward_impl(int b, int n, int m, const Tensor unknown, + const Tensor known, Tensor dist2, Tensor idx); +REGISTER_DEVICE_IMPL(three_nn_forward_impl, CUDA, three_nn_forward_cuda); + +void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift, + Tensor output); + +void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift, + Tensor grad_input); + +void tin_shift_forward_cuda(Tensor input, Tensor shift, Tensor output) { + TINShiftForwardCUDAKernelLauncher(input, shift, output); +} + +void tin_shift_backward_cuda(Tensor grad_output, Tensor shift, + Tensor grad_input) { + TINShiftBackwardCUDAKernelLauncher(grad_output, shift, grad_input); +} + +void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output); +void tin_shift_backward_impl(Tensor grad_output, Tensor shift, + Tensor grad_input); +REGISTER_DEVICE_IMPL(tin_shift_forward_impl, CUDA, tin_shift_forward_cuda); +REGISTER_DEVICE_IMPL(tin_shift_backward_impl, CUDA, tin_shift_backward_cuda); + +torch::Tensor upfirdn2d_op(const torch::Tensor& input, + const torch::Tensor& kernel, int up_x, int up_y, + int down_x, int down_y, int pad_x0, int pad_x1, + int pad_y0, int pad_y1); + +torch::Tensor upfirdn2d_op_impl(const torch::Tensor& input, + const torch::Tensor& kernel, int up_x, int up_y, + int down_x, int down_y, int pad_x0, int pad_x1, + int pad_y0, int pad_y1); +REGISTER_DEVICE_IMPL(upfirdn2d_op_impl, CUDA, upfirdn2d_op); + +int HardVoxelizeForwardCUDAKernelLauncher( + const at::Tensor& points, at::Tensor& voxels, at::Tensor& coors, + at::Tensor& num_points_per_voxel, const std::vector voxel_size, + const std::vector coors_range, const int max_points, + const int max_voxels, const int NDim = 3); + +void DynamicVoxelizeForwardCUDAKernelLauncher( + const at::Tensor& points, at::Tensor& coors, + const std::vector voxel_size, const std::vector coors_range, + const int NDim = 3); + +int hard_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& voxels, + at::Tensor& coors, + at::Tensor& num_points_per_voxel, + const std::vector voxel_size, + const std::vector coors_range, + const int max_points, const int max_voxels, + const int NDim) { + return HardVoxelizeForwardCUDAKernelLauncher( + points, voxels, coors, num_points_per_voxel, voxel_size, coors_range, + max_points, max_voxels, NDim); +}; + +void dynamic_voxelize_forward_cuda(const at::Tensor& points, at::Tensor& coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim) { + DynamicVoxelizeForwardCUDAKernelLauncher(points, coors, voxel_size, + coors_range, NDim); +}; + +int hard_voxelize_forward_impl(const at::Tensor& points, at::Tensor& voxels, + at::Tensor& coors, + at::Tensor& num_points_per_voxel, + const std::vector voxel_size, + const std::vector coors_range, + const int max_points, const int max_voxels, + const int NDim); + +void dynamic_voxelize_forward_impl(const at::Tensor& points, at::Tensor& coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim); +REGISTER_DEVICE_IMPL(hard_voxelize_forward_impl, CUDA, + hard_voxelize_forward_cuda); +REGISTER_DEVICE_IMPL(dynamic_voxelize_forward_impl, CUDA, + dynamic_voxelize_forward_cuda); diff --git a/mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu new file mode 100644 index 0000000..05fc08b --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/deform_conv_cuda.cu @@ -0,0 +1,105 @@ +// 
Copyright (c) OpenMMLab. All rights reserved +#include "deform_conv_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void deformable_im2col_cuda(Tensor data_im, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor data_col) { + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + + deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, + deformable_group, height_col, width_col, data_col_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void deformable_col2im_cuda(Tensor data_col, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor grad_im) { + // todo: make sure parallel_imgs is passed in correctly + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = + channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>(); + + deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( + num_kernels, data_col_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + dilation_w, channel_per_deformable_group, parallel_imgs, + deformable_group, height_col, width_col, grad_im_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void deformable_col2im_coord_cuda( + Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, Tensor grad_offset) { + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * + deformable_group * parallel_imgs; + int channel_per_deformable_group = + channels
* ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>(); + + deformable_col2im_coord_gpu_kernel<<< + GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, + at::cuda::getCurrentCUDAStream()>>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, + width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, parallel_imgs, + 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu new file mode 100644 index 0000000..d443998 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/deform_roi_pool_cuda.cu @@ -0,0 +1,55 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "deform_roi_pool_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void DeformRoIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, + Tensor offset, Tensor output, + int pooled_height, int pooled_width, + float spatial_scale, + int sampling_ratio, float gamma) { + int output_size = output.numel(); + int channels = input.size(1); + int height = input.size(2); + int width = input.size(3); + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "deform_roi_pool_forward_cuda_kernel", [&] { + deform_roi_pool_forward_cuda_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, input.data_ptr<scalar_t>(), + rois.data_ptr<scalar_t>(), offset.data_ptr<scalar_t>(), + output.data_ptr<scalar_t>(), pooled_height, pooled_width, + static_cast<scalar_t>(spatial_scale), sampling_ratio, + static_cast<scalar_t>(gamma), channels, height, width); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void DeformRoIPoolBackwardCUDAKernelLauncher( + Tensor grad_output, Tensor input, Tensor rois, Tensor offset, + Tensor grad_input, Tensor grad_offset, int pooled_height, int pooled_width, + float spatial_scale, int sampling_ratio, float gamma) { + int output_size = grad_output.numel(); + int channels = grad_input.size(1); + int height = grad_input.size(2); + int width = grad_input.size(3); + + at::cuda::CUDAGuard device_guard(grad_output.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "deform_roi_pool_backward_cuda_kernel", [&] { + deform_roi_pool_backward_cuda_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, grad_output.data_ptr<scalar_t>(), + input.data_ptr<scalar_t>(), rois.data_ptr<scalar_t>(), + offset.data_ptr<scalar_t>(), grad_input.data_ptr<scalar_t>(), + grad_offset.data_ptr<scalar_t>(), pooled_height, pooled_width, + static_cast<scalar_t>(spatial_scale), sampling_ratio, + static_cast<scalar_t>(gamma), channels, height, width); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu new file mode 100644 index 0000000..cb899f9 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/focal_loss_cuda.cu @@ -0,0 +1,111 @@ +// Copyright (c) OpenMMLab.
All rights reserved +#include "pytorch_cuda_helper.hpp" +#include "sigmoid_focal_loss_cuda_kernel.cuh" +#include "softmax_focal_loss_cuda_kernel.cuh" + +void SigmoidFocalLossForwardCUDAKernelLauncher(Tensor input, Tensor target, + Tensor weight, Tensor output, + const float gamma, + const float alpha) { + int output_size = output.numel(); + int num_classes = input.size(1); + AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes, + "target label should smaller or equal than num classes"); + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "sigmoid_focal_loss_forward_cuda_kernel", [&] { + sigmoid_focal_loss_forward_cuda_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, input.data_ptr<scalar_t>(), + target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(), + output.data_ptr<scalar_t>(), gamma, alpha, num_classes); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SigmoidFocalLossBackwardCUDAKernelLauncher(Tensor input, Tensor target, + Tensor weight, + Tensor grad_input, + const float gamma, + const float alpha) { + int output_size = grad_input.numel(); + int num_classes = input.size(1); + + at::cuda::CUDAGuard device_guard(grad_input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "sigmoid_focal_loss_backward_cuda_kernel", [&] { + sigmoid_focal_loss_backward_cuda_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, input.data_ptr<scalar_t>(), + target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(), + grad_input.data_ptr<scalar_t>(), gamma, alpha, num_classes); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SoftmaxFocalLossForwardCUDAKernelLauncher(Tensor softmax, Tensor target, + Tensor weight, Tensor output, + const float gamma, + const float alpha) { + int output_size = output.numel(); + int num_classes = softmax.size(1); + + AT_ASSERTM(target.max().item<int64_t>() <= (int64_t)num_classes, + "target label should smaller or equal than num classes"); + at::cuda::CUDAGuard device_guard(softmax.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + softmax.scalar_type(), "softmax_focal_loss_forward_cuda_kernel", [&] { + softmax_focal_loss_forward_cuda_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, softmax.data_ptr<scalar_t>(), + target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(), + output.data_ptr<scalar_t>(), gamma, alpha, num_classes); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SoftmaxFocalLossBackwardCUDAKernelLauncher(Tensor softmax, Tensor target, + Tensor weight, Tensor buff, + Tensor grad_input, + const float gamma, + const float alpha) { + int num_classes = softmax.size(1); + + int output_size = buff.numel(); + at::cuda::CUDAGuard device_guard(grad_input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_input.scalar_type(), + "softmax_focal_loss_backward_cuda1_" + "kernel", + [&] { + softmax_focal_loss_backward_cuda1_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, softmax.data_ptr<scalar_t>(), + target.data_ptr<int64_t>(), weight.data_ptr<scalar_t>(), + buff.data_ptr<scalar_t>(), gamma, alpha, num_classes); + }); + + AT_CUDA_CHECK(cudaGetLastError()); + + output_size = grad_input.numel(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_input.scalar_type(), + "softmax_focal_loss_backward_cuda2_" + "kernel", + [&] { + softmax_focal_loss_backward_cuda2_kernel<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>( + output_size, softmax.data_ptr<scalar_t>(), + target.data_ptr<int64_t>(), buff.data_ptr<scalar_t>(), + grad_input.data_ptr<scalar_t>(), num_classes); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git
a/mmcv/ops/csrc/pytorch/cuda/furthest_point_sample_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/furthest_point_sample_cuda.cu new file mode 100644 index 0000000..cfb4cd3 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/furthest_point_sample_cuda.cu @@ -0,0 +1,143 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling_gpu.cu + +#include <stdio.h> +#include <stdlib.h> + +#include "furthest_point_sample_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0); + + return max(min(1 << pow_2, 1024), 1); +} + +void FurthestPointSamplingForwardCUDAKernelLauncher(int b, int n, int m, + const float* dataset, + float* temp, int* idxs) { + // dataset: (B, N, 3) + // tmp: (B, N) + // output: + // idx: (B, M) + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + unsigned int n_threads = opt_n_threads(n); + + switch (n_threads) { + case 1024: + furthest_point_sampling_forward_cuda_kernel<1024> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 512: + furthest_point_sampling_forward_cuda_kernel<512> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 256: + furthest_point_sampling_forward_cuda_kernel<256> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 128: + furthest_point_sampling_forward_cuda_kernel<128> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 64: + furthest_point_sampling_forward_cuda_kernel<64> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 32: + furthest_point_sampling_forward_cuda_kernel<32> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 16: + furthest_point_sampling_forward_cuda_kernel<16> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 8: + furthest_point_sampling_forward_cuda_kernel<8> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 4: + furthest_point_sampling_forward_cuda_kernel<4> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 2: + furthest_point_sampling_forward_cuda_kernel<2> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 1: + furthest_point_sampling_forward_cuda_kernel<1> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + default: + furthest_point_sampling_forward_cuda_kernel<512> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + } + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void FurthestPointSamplingWithDistForwardCUDAKernelLauncher( + int b, int n, int m, const float* dataset, float* temp, int* idxs) { + // dataset: (B, N, N) + // temp: (B, N) + // output: + // idx: (B, M) + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + unsigned int n_threads = opt_n_threads(n); + + switch (n_threads) { + case 1024: + furthest_point_sampling_with_dist_forward_cuda_kernel<1024> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 512: + furthest_point_sampling_with_dist_forward_cuda_kernel<512> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 256: + furthest_point_sampling_with_dist_forward_cuda_kernel<256> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 128: + furthest_point_sampling_with_dist_forward_cuda_kernel<128> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 64: + furthest_point_sampling_with_dist_forward_cuda_kernel<64> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 32: + furthest_point_sampling_with_dist_forward_cuda_kernel<32> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 16: + furthest_point_sampling_with_dist_forward_cuda_kernel<16> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 8: + furthest_point_sampling_with_dist_forward_cuda_kernel<8> +
<<>>(b, n, m, dataset, temp, idxs); + break; + case 4: + furthest_point_sampling_with_dist_forward_cuda_kernel<4> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 2: + furthest_point_sampling_with_dist_forward_cuda_kernel<2> + <<>>(b, n, m, dataset, temp, idxs); + break; + case 1: + furthest_point_sampling_with_dist_forward_cuda_kernel<1> + <<>>(b, n, m, dataset, temp, idxs); + break; + default: + furthest_point_sampling_with_dist_forward_cuda_kernel<512> + <<>>(b, n, m, dataset, temp, idxs); + } + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu new file mode 100644 index 0000000..911ea01 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu @@ -0,0 +1,109 @@ +// Modified from +// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act_kernel.cu +// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +// +// This work is made available under the Nvidia Source Code License-NC. +// To view a copy of this license, visit +// https://nvlabs.github.io/stylegan2/license.html + +#include +#include +#include +#include +#include +#include + +#include + +template +static __global__ void fused_bias_act_kernel( + scalar_t* out, const scalar_t* p_x, const scalar_t* p_b, + const scalar_t* p_ref, int act, int grad, scalar_t alpha, scalar_t scale, + int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) { + int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x; + + scalar_t zero = 0.0; + + for (int loop_idx = 0; loop_idx < loop_x && xi < size_x; + loop_idx++, xi += blockDim.x) { + scalar_t x = p_x[xi]; + + if (use_bias) { + x += p_b[(xi / step_b) % size_b]; + } + + scalar_t ref = use_ref ? p_ref[xi] : zero; + + scalar_t y; + + // act = 1: linear layer + // act = 3: leaky relu layer + // grad = 0: direct forward path + // grad = 1: first order deviation + // grad = 2: second order deviation + switch (act * 10 + grad) { + default: + case 10: + y = x; + break; + case 11: + y = x; + break; + case 12: + y = 0.0; + break; + + case 30: + y = (x > 0.0) ? x : x * alpha; + break; + case 31: + y = (ref > 0.0) ? x : x * alpha; + break; + case 32: + y = 0.0; + break; + } + + out[xi] = y * scale; + } +} + +torch::Tensor fused_bias_leakyrelu_op(const torch::Tensor& input, + const torch::Tensor& bias, + const torch::Tensor& refer, int act, + int grad, float alpha, float scale) { + int curDevice = -1; + cudaGetDevice(&curDevice); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); + + auto x = input.contiguous(); + auto b = bias.contiguous(); + auto ref = refer.contiguous(); + + int use_bias = b.numel() ? 1 : 0; + int use_ref = ref.numel() ? 
1 : 0; + + int size_x = x.numel(); + int size_b = b.numel(); + int step_b = 1; + + for (int i = 1 + 1; i < x.dim(); i++) { + step_b *= x.size(i); + } + + int loop_x = 4; + int block_size = 4 * 32; + int grid_size = (size_x - 1) / (loop_x * block_size) + 1; + + auto y = torch::empty_like(x); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + x.scalar_type(), "fused_bias_act_kernel", [&] { + fused_bias_act_kernel<<>>( + y.data_ptr(), x.data_ptr(), + b.data_ptr(), ref.data_ptr(), act, grad, alpha, + scale, loop_x, size_x, step_b, size_b, use_bias, use_ref); + }); + + return y; +} diff --git a/mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu new file mode 100644 index 0000000..672fec6 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/gather_points_cuda.cu @@ -0,0 +1,58 @@ +#include +#include + +#include "gather_points_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void GatherPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints, + const Tensor points, + const Tensor idx, Tensor out) { + // points: (B, C, N) + // idx: (B, npoints) + // output: + // out: (B, C, npoints) + + at::cuda::CUDAGuard device_guard(points.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + points.scalar_type(), "gather_points_forward_cuda_kernel", [&] { + gather_points_forward_cuda_kernel + <<>>( + b, c, n, npoints, points.data_ptr(), + idx.data_ptr(), out.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void GatherPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints, + const Tensor grad_out, + const Tensor idx, + Tensor grad_points) { + // grad_out: (B, C, npoints) + // idx: (B, npoints) + // output: + // grad_points: (B, C, N) + + at::cuda::CUDAGuard device_guard(grad_out.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c, b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_out.scalar_type(), "gather_points_backward_cuda_kernel", [&] { + gather_points_backward_cuda_kernel + <<>>( + b, c, n, npoints, grad_out.data_ptr(), + idx.data_ptr(), grad_points.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu new file mode 100644 index 0000000..e7c57b0 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/group_points_cuda.cu @@ -0,0 +1,61 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
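// ---------------------------------------------------------------------------
// [Editor's sketch — not part of this patch.] gather_points_cuda.cu above and
// group_points_cuda.cu below share one launch pattern: a 3-D grid where
// blockIdx.x tiles the sampled points (DIVUP(npoints, THREADS_PER_BLOCK)),
// blockIdx.y indexes the channel and blockIdx.z the batch element. A minimal
// standalone version of that pattern is sketched here; every name
// (toy_gather_kernel, run_toy_gather, TOY_*) is hypothetical.
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>

#define TOY_THREADS_PER_BLOCK 256
#define TOY_DIVUP(m, n) (((m) + (n) - 1) / (n))

// points: (b, c, n), idx: (b, m), out: (b, c, m); all row-major.
__global__ void toy_gather_kernel(int b, int c, int n, int m,
                                  const float *points, const int *idx,
                                  float *out) {
  int bs_idx = blockIdx.z;                             // batch element
  int c_idx = blockIdx.y;                              // channel
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;  // sampled point
  if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;

  int src = idx[bs_idx * m + pt_idx];  // which of the n input points to copy
  out[(bs_idx * c + c_idx) * m + pt_idx] =
      points[(bs_idx * c + c_idx) * n + src];
}

void run_toy_gather(int b, int c, int n, int m, const float *d_points,
                    const int *d_idx, float *d_out, cudaStream_t stream) {
  // Same grid shape as the launchers in this patch: (points, channels, batch).
  dim3 blocks(TOY_DIVUP(m, TOY_THREADS_PER_BLOCK), c, b);
  dim3 threads(TOY_THREADS_PER_BLOCK);
  toy_gather_kernel<<<blocks, threads, 0, stream>>>(b, c, n, m, d_points,
                                                    d_idx, d_out);
}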
+// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu +#include +#include + +#include "group_points_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void GroupPointsForwardCUDAKernelLauncher(int b, int c, int n, int npoints, + int nsample, const Tensor points, + const Tensor idx, Tensor out) { + // points: (B, C, N) + // idx: (B, npoints, nsample) + // output: + // out: (B, C, npoints, nsample) + + at::cuda::CUDAGuard device_guard(points.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + points.scalar_type(), "group_points_forward_cuda_kernel", [&] { + group_points_forward_cuda_kernel + <<>>( + b, c, n, npoints, nsample, points.data_ptr(), + idx.data_ptr(), out.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void GroupPointsBackwardCUDAKernelLauncher(int b, int c, int n, int npoints, + int nsample, const Tensor grad_out, + const Tensor idx, + Tensor grad_points) { + // grad_out: (B, C, npoints, nsample) + // idx: (B, npoints, nsample) + // output: + // grad_points: (B, C, N) + + at::cuda::CUDAGuard device_guard(grad_out.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c, b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_out.scalar_type(), "group_points_backward_cuda_kernel", [&] { + group_points_backward_cuda_kernel + <<>>( + b, c, n, npoints, nsample, grad_out.data_ptr(), + idx.data_ptr(), grad_points.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu new file mode 100644 index 0000000..0643c16 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/iou3d_cuda.cu @@ -0,0 +1,86 @@ +// Modified from +// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu + +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2020. 
+*/ + +#include + +#include "iou3d_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void IoU3DBoxesOverlapBevForwardCUDAKernelLauncher(const int num_a, + const Tensor boxes_a, + const int num_b, + const Tensor boxes_b, + Tensor ans_overlap) { + at::cuda::CUDAGuard device_guard(boxes_a.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D), + DIVUP(num_a, THREADS_PER_BLOCK_IOU3D)); + dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D); + + iou3d_boxes_overlap_bev_forward_cuda_kernel<<>>( + num_a, boxes_a.data_ptr(), num_b, boxes_b.data_ptr(), + ans_overlap.data_ptr()); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void IoU3DBoxesIoUBevForwardCUDAKernelLauncher(const int num_a, + const Tensor boxes_a, + const int num_b, + const Tensor boxes_b, + Tensor ans_iou) { + at::cuda::CUDAGuard device_guard(boxes_a.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK_IOU3D), + DIVUP(num_a, THREADS_PER_BLOCK_IOU3D)); + dim3 threads(THREADS_PER_BLOCK_IOU3D, THREADS_PER_BLOCK_IOU3D); + + iou3d_boxes_iou_bev_forward_cuda_kernel<<>>( + num_a, boxes_a.data_ptr(), num_b, boxes_b.data_ptr(), + ans_iou.data_ptr()); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void IoU3DNMSForwardCUDAKernelLauncher(const Tensor boxes, + unsigned long long *mask, int boxes_num, + float nms_overlap_thresh) { + at::cuda::CUDAGuard device_guard(boxes.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + + nms_forward_cuda_kernel<<>>( + boxes_num, nms_overlap_thresh, boxes.data_ptr(), mask); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void IoU3DNMSNormalForwardCUDAKernelLauncher(const Tensor boxes, + unsigned long long *mask, + int boxes_num, + float nms_overlap_thresh) { + at::cuda::CUDAGuard device_guard(boxes.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS), + DIVUP(boxes_num, THREADS_PER_BLOCK_NMS)); + dim3 threads(THREADS_PER_BLOCK_NMS); + + nms_normal_forward_cuda_kernel<<>>( + boxes_num, nms_overlap_thresh, boxes.data_ptr(), mask); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu new file mode 100644 index 0000000..4954fe4 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/knn_cuda.cu @@ -0,0 +1,34 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap + +#include +#include + +#include "knn_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void KNNForwardCUDAKernelLauncher(int b, int n, int m, int nsample, + const Tensor xyz, const Tensor new_xyz, + Tensor idx, Tensor dist2) { + // param new_xyz: (B, m, 3) + // param xyz: (B, n, 3) + // param idx: (B, m, nsample) + + at::cuda::CUDAGuard device_guard(new_xyz.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(m, THREADS_PER_BLOCK), b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + new_xyz.scalar_type(), "knn_forward_cuda_kernel", [&] { + knn_forward_cuda_kernel<<>>( + b, n, m, nsample, xyz.data_ptr(), + new_xyz.data_ptr(), idx.data_ptr(), + dist2.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu new file mode 100644 index 0000000..022e189 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/masked_conv2d_cuda.cu @@ -0,0 +1,54 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "masked_conv2d_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void MaskedIm2colForwardCUDAKernelLauncher(const Tensor bottom_data, + const Tensor mask_h_idx, + const Tensor mask_w_idx, + Tensor top_data, const int kernel_h, + const int kernel_w, const int pad_h, + const int pad_w) { + int channels = bottom_data.size(1); + int height = bottom_data.size(2); + int width = bottom_data.size(3); + int mask_cnt = mask_h_idx.size(0); + int output_size = mask_cnt * channels; + + at::cuda::CUDAGuard device_guard(bottom_data.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] { + const scalar_t *bottom_data_ = bottom_data.data_ptr(); + const int64_t *mask_h_idx_ = mask_h_idx.data_ptr(); + const int64_t *mask_w_idx_ = mask_w_idx.data_ptr(); + scalar_t *top_data_ = top_data.data_ptr(); + MaskedIm2colForward + <<>>( + output_size, bottom_data_, height, width, kernel_h, kernel_w, + pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void MaskedCol2imForwardCUDAKernelLauncher( + const Tensor bottom_data, const Tensor mask_h_idx, const Tensor mask_w_idx, + Tensor top_data, const int height, const int width, const int channels) { + int mask_cnt = mask_h_idx.size(0); + int output_size = mask_cnt * channels; + + at::cuda::CUDAGuard device_guard(bottom_data.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] { + const scalar_t *bottom_data_ = bottom_data.data_ptr(); + const int64_t *mask_h_idx_ = mask_h_idx.data_ptr(); + const int64_t *mask_w_idx_ = mask_w_idx.data_ptr(); + scalar_t *top_data_ = top_data.data_ptr(); + + MaskedCol2imForward + <<>>( + output_size, bottom_data_, height, width, channels, mask_h_idx_, + mask_w_idx_, mask_cnt, top_data_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu new file mode 100644 index 0000000..2b52796 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/modulated_deform_conv_cuda.cu @@ -0,0 +1,96 @@ +// 
Copyright (c) OpenMMLab. All rights reserved +#include "modulated_deform_conv_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void modulated_deformable_im2col_cuda( + const Tensor data_im, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + const scalar_t *data_mask_ = data_mask.data_ptr(); + scalar_t *data_col_ = data_col.data_ptr(); + + modulated_deformable_im2col_gpu_kernel<<< + GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, + at::cuda::getCurrentCUDAStream()>>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, + width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, batch_size, + channels, deformable_group, height_col, width_col, data_col_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void modulated_deformable_col2im_cuda( + const Tensor data_col, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor grad_im) { + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = + channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + const scalar_t *data_mask_ = data_mask.data_ptr(); + scalar_t *grad_im_ = grad_im.data_ptr(); + + modulated_deformable_col2im_gpu_kernel<<< + GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, + at::cuda::getCurrentCUDAStream()>>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, + height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void modulated_deformable_col2im_coord_cuda( + const Tensor data_col, const Tensor data_im, const Tensor data_offset, + const Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + Tensor grad_offset, Tensor grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * + kernel_w * deformable_group; + const int 
channel_per_deformable_group = + channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data_ptr(); + const scalar_t *data_im_ = data_im.data_ptr(); + const scalar_t *data_offset_ = data_offset.data_ptr(); + const scalar_t *data_mask_ = data_mask.data_ptr(); + scalar_t *grad_offset_ = grad_offset.data_ptr(); + scalar_t *grad_mask_ = grad_mask.data_ptr(); + + modulated_deformable_col2im_coord_gpu_kernel<<< + GET_BLOCKS(num_kernels), THREADS_PER_BLOCK, 0, + at::cuda::getCurrentCUDAStream()>>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, + channels, height_im, width_im, kernel_h, kernel_w, pad_h, pad_w, + stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, batch_size, + 2 * kernel_h * kernel_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_, grad_mask_); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu new file mode 100644 index 0000000..2fccaa2 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/ms_deform_attn_cuda.cu @@ -0,0 +1,361 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include +#include +#include +#include + +#include +#include + +#include "ms_deform_attn_cuda_kernel.cuh" + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, + const int num_heads, const int channels, + const int num_levels, const int num_query, + const int num_point, scalar_t *data_col) { + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, + data_sampling_loc, data_attn_weight, batch_size, spatial_size, + num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +template +void ms_deformable_col2im_cuda( + cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_value, + const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, + const int batch_size, const int spatial_size, const int num_heads, + const int channels, const int num_levels, const int num_query, + const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) { + const int num_threads = + (channels > CUDA_NUM_THREADS) ? 
CUDA_NUM_THREADS : channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) { + if ((channels & 1023) == 0) { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, + num_query, num_point, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + ms_deformable_col2im_gpu_kernel_gm + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + } + } else { + switch (channels) { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, 
spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>(num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, + data_attn_weight, batch_size, spatial_size, num_heads, + channels, num_levels, num_query, num_point, grad_value, + grad_sampling_loc, grad_attn_weight); + break; + default: + if (channels < 64) { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, + num_query, num_point, grad_value, grad_sampling_loc, + grad_attn_weight); + } else { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, grad_col, data_value, data_spatial_shapes, + data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, + num_query, num_point, grad_value, grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +at::Tensor ms_deform_attn_cuda_forward(const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) { + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), + "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), + "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), + "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), + "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.is_cuda(), + "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", + batch, im2col_step_); + + 
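// [Editor's note: descriptive comment added during review; not part of the
// original source.] The loop further down processes the batch in chunks of
// im2col_step_ samples, calling ms_deformable_im2col_cuda once per chunk.
// For chunk n, the raw data pointers are advanced by
//   value:        n * im2col_step_ * spatial_size * num_heads * channels
//   sampling_loc: n * im2col_step_ * num_query * num_heads * num_levels * num_point * 2
//   attn_weight:  n * im2col_step_ * num_query * num_heads * num_levels * num_point
// elements, i.e. exactly the per_value_size / per_sample_loc_size /
// per_attn_weight_size strides computed below, and each launch writes the
// matching batch_n-sized slice of the output view (output_n.select(0, n)).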
auto output = + at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view( + {batch / im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch / im2col_step_; ++n) { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES( + value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda( + at::cuda::getCurrentCUDAStream(), + value.data_ptr() + n * im2col_step_ * per_value_size, + spatial_shapes.data_ptr(), + level_start_index.data_ptr(), + sampling_loc.data_ptr() + + n * im2col_step_ * per_sample_loc_size, + attn_weight.data_ptr() + + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, + num_point, columns.data_ptr()); + })); + } + + output = output.view({batch, num_query, num_heads * channels}); + + return output; +} + +void ms_deform_attn_cuda_backward( + const at::Tensor &value, const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, const at::Tensor &grad_output, + at::Tensor &grad_value, at::Tensor &grad_sampling_loc, + at::Tensor &grad_attn_weight, const int im2col_step) { + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), + "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), + "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), + "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), + "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), + "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.is_cuda(), + "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", + batch, im2col_step_); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view( + {batch / im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch / im2col_step_; ++n) { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES( + value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda( + 
at::cuda::getCurrentCUDAStream(), + grad_output_g.data_ptr(), + value.data_ptr() + n * im2col_step_ * per_value_size, + spatial_shapes.data_ptr(), + level_start_index.data_ptr(), + sampling_loc.data_ptr() + + n * im2col_step_ * per_sample_loc_size, + attn_weight.data_ptr() + + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, + num_point, + grad_value.data_ptr() + + n * im2col_step_ * per_value_size, + grad_sampling_loc.data_ptr() + + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data_ptr() + + n * im2col_step_ * per_attn_weight_size); + })); + } +} diff --git a/mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu new file mode 100644 index 0000000..16cf646 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/nms_cuda.cu @@ -0,0 +1,53 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "nms_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +Tensor NMSCUDAKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, + int offset) { + at::cuda::CUDAGuard device_guard(boxes.device()); + + if (boxes.numel() == 0) { + return at::empty({0}, boxes.options().dtype(at::kLong)); + } + auto order_t = std::get<1>(scores.sort(0, /*descending=*/true)); + auto boxes_sorted = boxes.index_select(0, order_t); + + int boxes_num = boxes.size(0); + const int col_blocks = DIVUP(boxes_num, threadsPerBlock); + Tensor mask = + at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); + dim3 blocks(col_blocks, col_blocks); + dim3 threads(threadsPerBlock); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + nms_cuda<<>>( + boxes_num, iou_threshold, offset, boxes_sorted.data_ptr(), + (unsigned long long*)mask.data_ptr()); + + at::Tensor mask_cpu = mask.to(at::kCPU); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.data_ptr(); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep_t = + at::zeros({boxes_num}, boxes.options().dtype(at::kBool).device(at::kCPU)); + bool* keep = keep_t.data_ptr(); + + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep[i] = true; + // set every overlap box with bit 1 in remv + unsigned long long* p = mask_host + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + AT_CUDA_CHECK(cudaGetLastError()); + return order_t.masked_select(keep_t.to(at::kCUDA)); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu new file mode 100644 index 0000000..e1185f8 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/nms_rotated_cuda.cu @@ -0,0 +1,62 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu +#include "nms_rotated_cuda.cuh" +#include "pytorch_cuda_helper.hpp" + +Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, + const Tensor order_t, const Tensor dets_sorted, + float iou_threshold, const int multi_label) { + // using scalar_t = float; + AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor"); + AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(dets.device()); + + int dets_num = dets.size(0); + + const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock); + + Tensor mask = + at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong)); + + dim3 blocks(col_blocks, col_blocks); + dim3 threads(threadsPerBlock); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] { + nms_rotated_cuda_kernel<<>>( + dets_num, iou_threshold, dets_sorted.data_ptr(), + (unsigned long long*)mask.data_ptr(), multi_label); + }); + + Tensor mask_cpu = mask.to(at::kCPU); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.data_ptr(); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + Tensor keep = + at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data_ptr(); + + int num_to_keep = 0; + for (int i = 0; i < dets_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long* p = mask_host + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + AT_CUDA_CHECK(cudaGetLastError()); + return order_t.index( + {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) + .to(order_t.device(), keep.scalar_type())}); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu new file mode 100644 index 0000000..17e6441 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/points_in_boxes_cuda.cu @@ -0,0 +1,62 @@ +// Modified from +// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu +// Written by Shaoshuai Shi +// All Rights Reserved 2019. 
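// ---------------------------------------------------------------------------
// [Editor's sketch — not part of this patch.] The launchers in this file
// (PointsInBoxesPart / PointsInBoxesAll) wrap a per-point containment test
// against rotated LiDAR boxes (x, y, z, x_size, y_size, z_size, rz) with z at
// the bottom center, as stated in the parameter comments below. A simplified
// standalone version of that test is sketched here; the rotation sign follows
// the usual "rotate the point into the box frame" convention, and all names
// (toy_point_in_box, toy_points_in_boxes_kernel) are hypothetical.
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <math.h>

__device__ __forceinline__ bool toy_point_in_box(const float *pt,
                                                 const float *box) {
  // Shift into the box frame, then rotate by -rz around the z axis.
  float x = pt[0] - box[0], y = pt[1] - box[1], z = pt[2] - box[2];
  float cosa = cosf(-box[6]), sina = sinf(-box[6]);
  float local_x = x * cosa - y * sina;
  float local_y = x * sina + y * cosa;
  return fabsf(local_x) < box[3] * 0.5f && fabsf(local_y) < box[4] * 0.5f &&
         z > 0.f && z < box[5];  // z measured from the bottom face
}

// boxes: (boxes_num, 7), pts: (pts_num, 3), idx_out: (pts_num,), -1 if outside.
__global__ void toy_points_in_boxes_kernel(int boxes_num, int pts_num,
                                           const float *boxes,
                                           const float *pts, int *idx_out) {
  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (pt_idx >= pts_num) return;
  idx_out[pt_idx] = -1;
  for (int k = 0; k < boxes_num; ++k) {
    if (toy_point_in_box(pts + 3 * pt_idx, boxes + 7 * k)) {
      idx_out[pt_idx] = k;  // first match wins (boxes assumed non-overlapping)
      break;
    }
  }
}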
+ +#include + +#include "points_in_boxes_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void PointsInBoxesPartForwardCUDAKernelLauncher(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is + // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, + // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default + // -1 + + at::cuda::CUDAGuard device_guard(boxes.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + boxes.scalar_type(), "points_in_boxes_part_forward_cuda_kernel", [&] { + points_in_boxes_part_forward_cuda_kernel + <<>>( + batch_size, boxes_num, pts_num, boxes.data_ptr(), + pts.data_ptr(), box_idx_of_points.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void PointsInBoxesAllForwardCUDAKernelLauncher(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box params pts: (B, npoints, 3) + // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), + // default -1 + + at::cuda::CUDAGuard device_guard(boxes.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + boxes.scalar_type(), "points_in_boxes_all_forward_cuda_kernel", [&] { + points_in_boxes_all_forward_cuda_kernel + <<>>( + batch_size, boxes_num, pts_num, boxes.data_ptr(), + pts.data_ptr(), box_idx_of_points.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu new file mode 100644 index 0000000..274be83 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/psamask_cuda.cu @@ -0,0 +1,59 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/hszhao/semseg/blob/master/lib/psa/src + +#include +#include "psamask_cuda_kernel.cuh" +#include "pytorch_cuda_helper.hpp" + +void PSAMaskForwardCUDAKernelLauncher(const int psa_type, const Tensor input, + Tensor output, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, + const int half_w_mask) { + int nthreads = num_ * h_feature * w_feature; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (psa_type == 0) + AT_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "psamask_collect_forward_cuda", [&] { + psamask_collect_forward_cuda<<>>( + nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask, + half_w_mask, input.data_ptr(), + output.data_ptr()); + }); + else + AT_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "psamask_distribute_forward_cuda", [&] { + psamask_distribute_forward_cuda + <<>>( + nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask, + half_w_mask, input.data_ptr(), + output.data_ptr()); + }); +} + +void PSAMaskBackwardCUDAKernelLauncher( + const int psa_type, const Tensor grad_output, Tensor grad_input, + const int num_, const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, const int half_w_mask) { + int nthreads = num_ * h_feature * w_feature; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + if (psa_type == 0) + AT_DISPATCH_FLOATING_TYPES( + grad_input.scalar_type(), "psamask_collect_backward_cuda", [&] { + psamask_collect_backward_cuda<<>>( + nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask, + half_w_mask, grad_output.data_ptr(), + grad_input.data_ptr()); + }); + else + AT_DISPATCH_FLOATING_TYPES( + grad_input.scalar_type(), "psamask_distribute_backward_cuda", [&] { + psamask_distribute_backward_cuda + <<>>( + nthreads, h_feature, w_feature, h_mask, w_mask, half_h_mask, + half_w_mask, grad_output.data_ptr(), + grad_input.data_ptr()); + }); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu new file mode 100644 index 0000000..3d4f761 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/roi_align_cuda.cu @@ -0,0 +1,58 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cuda_helper.hpp" +#include "roi_align_cuda_kernel.cuh" + +void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + int output_size = output.numel(); + int channels = input.size(1); + int height = input.size(2); + int width = input.size(3); + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "roi_align_forward_cuda_kernel", [&] { + roi_align_forward_cuda_kernel + <<>>( + output_size, input.data_ptr(), + rois.data_ptr(), output.data_ptr(), + argmax_y.data_ptr(), argmax_x.data_ptr(), + aligned_height, aligned_width, + static_cast(spatial_scale), sampling_ratio, pool_mode, + aligned, channels, height, width); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void ROIAlignBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois, + Tensor argmax_y, Tensor argmax_x, + Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, + bool aligned) { + int output_size = grad_output.numel(); + int channels = grad_input.size(1); + int height = grad_input.size(2); + int width = grad_input.size(3); + + at::cuda::CUDAGuard device_guard(grad_output.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "roi_align_backward_cuda_kernel", [&] { + roi_align_backward_cuda_kernel + <<>>( + output_size, grad_output.data_ptr(), + rois.data_ptr(), argmax_y.data_ptr(), + argmax_x.data_ptr(), grad_input.data_ptr(), + aligned_height, aligned_width, + static_cast(spatial_scale), sampling_ratio, pool_mode, + aligned, channels, height, width); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu new file mode 100644 index 0000000..aa631bc --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/roi_align_rotated_cuda.cu @@ -0,0 +1,45 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cuda_helper.hpp" +#include "roi_align_rotated_cuda_kernel.cuh" + +void ROIAlignRotatedForwardCUDAKernelLauncher( + const at::Tensor features, const at::Tensor rois, const float spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, const int num_rois, + const int pooled_height, const int pooled_width, at::Tensor output) { + const int output_size = num_rois * pooled_height * pooled_width * channels; + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + features.scalar_type(), "ROIAlignRotatedLaucherForward", ([&] { + const scalar_t *bottom_data = features.data_ptr(); + const scalar_t *rois_data = rois.data_ptr(); + scalar_t *top_data = output.data_ptr(); + + roi_align_rotated_forward_cuda_kernel + <<>>( + output_size, bottom_data, rois_data, scalar_t(spatial_scale), + sample_num, aligned, clockwise, channels, height, width, + pooled_height, pooled_width, top_data); + })); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void ROIAlignRotatedBackwardCUDAKernelLauncher( + const at::Tensor top_grad, const at::Tensor rois, const float spatial_scale, + const int sample_num, const bool aligned, const bool clockwise, + const int channels, const int height, const int width, const int num_rois, + const int pooled_height, const int pooled_width, at::Tensor bottom_grad) { + const int output_size = num_rois * pooled_height * pooled_width * channels; + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] { + const scalar_t *top_diff = top_grad.data_ptr(); + const scalar_t *rois_data = rois.data_ptr(); + scalar_t *bottom_diff = bottom_grad.data_ptr(); + roi_align_rotated_backward_cuda_kernel + <<>>( + output_size, top_diff, rois_data, spatial_scale, sample_num, + aligned, clockwise, channels, height, width, pooled_height, + pooled_width, bottom_diff); + })); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu new file mode 100644 index 0000000..d9cdf30 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/roi_pool_cuda.cu @@ -0,0 +1,50 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cuda_helper.hpp" +#include "roi_pool_cuda_kernel.cuh" + +void ROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, + Tensor argmax, int pooled_height, + int pooled_width, float spatial_scale) { + int output_size = output.numel(); + int channels = input.size(1); + int height = input.size(2); + int width = input.size(3); + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "roi_pool_forward_cuda_kernel", [&] { + roi_pool_forward_cuda_kernel + <<>>( + output_size, input.data_ptr(), + rois.data_ptr(), output.data_ptr(), + argmax.data_ptr(), pooled_height, pooled_width, + static_cast(spatial_scale), channels, height, width); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void ROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois, + Tensor argmax, Tensor grad_input, + int pooled_height, int pooled_width, + float spatial_scale) { + int output_size = grad_output.numel(); + int channels = grad_input.size(1); + int height = grad_input.size(2); + int width = grad_input.size(3); + + at::cuda::CUDAGuard device_guard(grad_output.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "roi_pool_backward_cuda_kernel", [&] { + roi_pool_backward_cuda_kernel + <<>>( + output_size, grad_output.data_ptr(), + rois.data_ptr(), argmax.data_ptr(), + grad_input.data_ptr(), pooled_height, pooled_width, + channels, height, width); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu new file mode 100644 index 0000000..2bc7c3f --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/roiaware_pool3d_cuda.cu @@ -0,0 +1,118 @@ +// Modified from +// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu +// Written by Shaoshuai Shi +// All Rights Reserved 2019. 
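// ---------------------------------------------------------------------------
// [Editor's sketch — not part of this patch.] The RoIAlign launchers above
// dispatch kernels from roi_align_cuda_kernel.cuh / roi_align_rotated_cuda_kernel.cuh
// that pool bilinearly interpolated samples inside each output bin. The
// interpolation step is sketched standalone below with the usual RoIAlign
// border handling; names (toy_bilinear, toy_resample_kernel) are hypothetical.
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>

// Bilinear sample of a single-channel (height, width) map at (y, x).
__device__ __forceinline__ float toy_bilinear(const float *img, int height,
                                              int width, float y, float x) {
  if (y < -1.f || y > height || x < -1.f || x > width) return 0.f;
  y = fmaxf(y, 0.f);
  x = fmaxf(x, 0.f);
  int y_low = (int)y, x_low = (int)x;
  int y_high, x_high;
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = (float)y_low;
  } else {
    y_high = y_low + 1;
  }
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = (float)x_low;
  } else {
    x_high = x_low + 1;
  }
  float ly = y - y_low, lx = x - x_low;
  float hy = 1.f - ly, hx = 1.f - lx;
  float v1 = img[y_low * width + x_low];
  float v2 = img[y_low * width + x_high];
  float v3 = img[y_high * width + x_low];
  float v4 = img[y_high * width + x_high];
  return hy * hx * v1 + hy * lx * v2 + ly * hx * v3 + ly * lx * v4;
}

// Sample one channel at n arbitrary (y, x) locations.
__global__ void toy_resample_kernel(const float *img, int height, int width,
                                    const float *ys, const float *xs, int n,
                                    float *out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  out[i] = toy_bilinear(img, height, width, ys[i], xs[i]);
}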
+ +#include + +#include "pytorch_cuda_helper.hpp" +#include "roiaware_pool3d_cuda_kernel.cuh" + +void RoiawarePool3dForwardCUDAKernelLauncher( + int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, + int out_y, int out_z, const Tensor rois, const Tensor pts, + const Tensor pts_feature, Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method) { + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate params pts: (npoints, 3) [x, y, z] in LiDAR coordinate params + // pts_feature: (npoints, C) params argmax: (N, out_x, out_y, out_z, C) params + // pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) params + // pooled_features: (N, out_x, out_y, out_z, C) params pool_method: 0: + // max_pool 1: avg_pool + + at::cuda::CUDAGuard device_guard(pts_feature.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + Tensor pts_mask = + -at::ones({boxes_num, pts_num}, pts_feature.options().dtype(at::kInt)); + + dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + rois.scalar_type(), "generate_pts_mask_for_box3d", [&] { + generate_pts_mask_for_box3d + <<>>( + boxes_num, pts_num, out_x, out_y, out_z, + rois.data_ptr(), pts.data_ptr(), + pts_mask.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); + + // TODO: Merge the collect and pool functions, SS + + dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK)); + + AT_DISPATCH_INTEGRAL_TYPES( + pts_idx_of_voxels.scalar_type(), "collect_inside_pts_for_box3d", [&] { + collect_inside_pts_for_box3d + <<>>( + boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, + pts_mask.data_ptr(), + pts_idx_of_voxels.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); + + dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, + boxes_num); + if (pool_method == 0) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + pts_feature.scalar_type(), "roiaware_maxpool3d", [&] { + roiaware_maxpool3d<<>>( + boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, + out_z, pts_feature.data_ptr(), + pts_idx_of_voxels.data_ptr(), + pooled_features.data_ptr(), argmax.data_ptr()); + }); + } else if (pool_method == 1) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + pts_feature.scalar_type(), "roiaware_avgpool3d", [&] { + roiaware_avgpool3d<<>>( + boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, + out_z, pts_feature.data_ptr(), + pts_idx_of_voxels.data_ptr(), + pooled_features.data_ptr()); + }); + } + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void RoiawarePool3dBackwardCUDAKernelLauncher( + int boxes_num, int out_x, int out_y, int out_z, int channels, + int max_pts_each_voxel, const Tensor pts_idx_of_voxels, const Tensor argmax, + const Tensor grad_out, Tensor grad_in, int pool_method) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + // params pool_method: 0: max_pool, 1: avg_pool + + at::cuda::CUDAGuard device_guard(grad_out.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, + boxes_num); + dim3 threads(THREADS_PER_BLOCK); + + if (pool_method == 0) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_in.scalar_type(), "roiaware_maxpool3d_backward", [&] { + roiaware_maxpool3d_backward<<>>( 
+ boxes_num, channels, out_x, out_y, out_z, argmax.data_ptr(), + grad_out.data_ptr(), grad_in.data_ptr()); + }); + } else if (pool_method == 1) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_in.scalar_type(), "roiaware_avgpool3d_backward", [&] { + roiaware_avgpool3d_backward<<>>( + boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel, + pts_idx_of_voxels.data_ptr(), grad_out.data_ptr(), + grad_in.data_ptr()); + }); + } + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu new file mode 100644 index 0000000..49c003f --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/roipoint_pool3d_cuda.cu @@ -0,0 +1,60 @@ +/* +Modified from +https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d_kernel.cu +Point cloud feature pooling +Written by Shaoshuai Shi +All Rights Reserved 2018. +*/ + +#include +#include + +#include "pytorch_cuda_helper.hpp" +#include "roipoint_pool3d_cuda_kernel.cuh" + +void RoIPointPool3dForwardCUDAKernelLauncher( + int batch_size, int pts_num, int boxes_num, int feature_in_len, + int sampled_pts_num, const Tensor xyz, const Tensor boxes3d, + const Tensor pts_feature, Tensor pooled_features, + Tensor pooled_empty_flag) { + Tensor pts_assign = at::empty({batch_size, pts_num, boxes_num}, + boxes3d.options().dtype(at::kInt)); + + at::cuda::CUDAGuard device_guard(xyz.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num, batch_size); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + xyz.scalar_type(), "assign_pts_to_box3d", [&] { + assign_pts_to_box3d<<>>( + batch_size, pts_num, boxes_num, xyz.data_ptr(), + boxes3d.data_ptr(), pts_assign.data_ptr()); + }); + + Tensor pts_idx = at::empty({batch_size, boxes_num, sampled_pts_num}, + boxes3d.options().dtype(at::kInt)); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks2(DIVUP(boxes_num, THREADS_PER_BLOCK), batch_size); + + get_pooled_idx<<>>( + batch_size, pts_num, boxes_num, sampled_pts_num, + pts_assign.data_ptr(), pts_idx.data_ptr(), + pooled_empty_flag.data_ptr()); + + dim3 blocks_pool(DIVUP(sampled_pts_num, THREADS_PER_BLOCK), boxes_num, + batch_size); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + xyz.scalar_type(), "roipoint_pool3d_forward", [&] { + roipoint_pool3d_forward<<>>( + batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num, + xyz.data_ptr(), pts_idx.data_ptr(), + pts_feature.data_ptr(), + pooled_features.data_ptr(), + pooled_empty_flag.data_ptr()); + }); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu new file mode 100644 index 0000000..4939fe4 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/scatter_points_cuda.cu @@ -0,0 +1,127 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
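// ---------------------------------------------------------------------------
// [Editor's sketch — not part of this patch.] The DynamicPointToVoxelForward
// launcher in this file reduces all point features that map to the same voxel
// (via coors_map) into one row, using atomics inside feats_reduce_kernel. A
// minimal standalone mean-reduction with the same (num_points, num_feats) ->
// (num_voxels, num_feats) shape convention is sketched here; names are
// hypothetical and the MAX branch is omitted.
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>

__global__ void toy_scatter_add_kernel(const float *feats,
                                       const int *vox_of_pt, float *vox_feats,
                                       int *vox_count, int num_points,
                                       int num_feats) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= num_points) return;
  int v = vox_of_pt[i];
  if (v < 0) return;  // points with invalid coordinates are skipped
  for (int f = 0; f < num_feats; ++f)
    atomicAdd(vox_feats + v * num_feats + f, feats[i * num_feats + f]);
  atomicAdd(vox_count + v, 1);
}

__global__ void toy_scatter_mean_kernel(float *vox_feats, const int *vox_count,
                                        int num_voxels, int num_feats) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= num_voxels * num_feats) return;
  int c = vox_count[i / num_feats];
  if (c > 0) vox_feats[i] /= (float)c;
}

// vox_feats and vox_count must be zero-initialized by the caller.
void toy_dynamic_scatter_mean(const float *d_feats, const int *d_vox_of_pt,
                              float *d_vox_feats, int *d_vox_count,
                              int num_points, int num_voxels, int num_feats,
                              cudaStream_t stream) {
  const int threads = 256;
  toy_scatter_add_kernel<<<(num_points + threads - 1) / threads, threads, 0,
                           stream>>>(d_feats, d_vox_of_pt, d_vox_feats,
                                     d_vox_count, num_points, num_feats);
  toy_scatter_mean_kernel<<<(num_voxels * num_feats + threads - 1) / threads,
                            threads, 0, stream>>>(d_vox_feats, d_vox_count,
                                                  num_voxels, num_feats);
}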
+#include +#include +#include + +#include "pytorch_cuda_helper.hpp" +#include "scatter_points_cuda_kernel.cuh" + +std::vector DynamicPointToVoxelForwardCUDAKernelLauncher( + const at::Tensor &feats, const at::Tensor &coors, + const reduce_t reduce_type) { + const int num_input = feats.size(0); + const int num_feats = feats.size(1); + + if (num_input == 0) + return {feats.clone().detach(), coors.clone().detach(), + coors.new_empty({0}, torch::kInt32), + coors.new_empty({0}, torch::kInt32)}; + + at::Tensor out_coors; + at::Tensor coors_map; + at::Tensor reduce_count; + + auto coors_clean = coors.masked_fill(coors.lt(0).any(-1, true), -1); + + std::tie(out_coors, coors_map, reduce_count) = + at::unique_dim(coors_clean, 0, true, true, true); + + // the first element of out_coors is always (-1,-1,-1) and should be removed + out_coors = out_coors.slice(0, 1); + reduce_count = reduce_count.slice(0, 1).to(torch::kInt32); + coors_map = coors_map.to(torch::kInt32) - 1; + + auto reduced_feats = + at::empty({out_coors.size(0), num_feats}, feats.options()); + + at::cuda::CUDAGuard device_guard(feats.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES( + feats.scalar_type(), "feats_reduce_kernel", ([&] { + if (reduce_type == reduce_t::MAX) + reduced_feats.fill_(-std::numeric_limits::infinity()); + else + reduced_feats.fill_(static_cast(0)); + + dim3 blocks(std::min( + at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim)); + dim3 threads(THREADS_PER_BLOCK); + feats_reduce_kernel<<>>( + feats.data_ptr(), coors_map.data_ptr(), + reduced_feats.data_ptr(), num_input, num_feats, + reduce_type); + if (reduce_type == reduce_t::MEAN) + reduced_feats /= reduce_count.unsqueeze(-1).to(reduced_feats.dtype()); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + + return {reduced_feats, out_coors, coors_map, reduce_count}; +} + +void DynamicPointToVoxelBackwardCUDAKernelLauncher( + at::Tensor &grad_feats, const at::Tensor &grad_reduced_feats, + const at::Tensor &feats, const at::Tensor &reduced_feats, + const at::Tensor &coors_map, const at::Tensor &reduce_count, + const reduce_t reduce_type) { + const int num_input = feats.size(0); + const int num_reduced = reduced_feats.size(0); + const int num_feats = feats.size(1); + + grad_feats.fill_(0); + // copy voxel grad to points + + if (num_input == 0 || num_reduced == 0) return; + at::cuda::CUDAGuard device_guard(feats.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (reduce_type == reduce_t::MEAN || reduce_type == reduce_t::SUM) { + AT_DISPATCH_FLOATING_TYPES( + grad_reduced_feats.scalar_type(), "add_reduce_traceback_grad_kernel", + ([&] { + dim3 blocks(std::min( + at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim)); + dim3 threads(THREADS_PER_BLOCK); + add_reduce_traceback_grad_kernel<<>>( + grad_feats.data_ptr(), + grad_reduced_feats.data_ptr(), + coors_map.data_ptr(), reduce_count.data_ptr(), + num_input, num_feats, reduce_type); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + } else { + auto reduce_from = at::full({num_reduced, num_feats}, num_input, + coors_map.options().dtype(torch::kInt32)); + AT_DISPATCH_FLOATING_TYPES( + grad_reduced_feats.scalar_type(), + "max_reduce_traceback_scatter_idx_kernel", ([&] { + dim3 blocks(std::min( + at::cuda::ATenCeilDiv(num_input, THREADS_PER_BLOCK), maxGridDim)); + dim3 threads(THREADS_PER_BLOCK); + max_reduce_traceback_scatter_idx_kernel<<>>( + feats.data_ptr(), reduced_feats.data_ptr(), + reduce_from.data_ptr(), 
coors_map.data_ptr(), + num_input, num_feats); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + + AT_DISPATCH_FLOATING_TYPES( + grad_reduced_feats.scalar_type(), + "max_reduce_traceback_scatter_idx_kernel", ([&] { + dim3 blocks( + std::min(at::cuda::ATenCeilDiv(num_reduced, THREADS_PER_BLOCK), + maxGridDim)); + dim3 threads(THREADS_PER_BLOCK); + max_reduce_scatter_grad_kernel<<>>( + grad_feats.data_ptr(), + grad_reduced_feats.data_ptr(), + reduce_from.data_ptr(), num_reduced, num_feats); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + } +} diff --git a/mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu new file mode 100644 index 0000000..657c817 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/sync_bn_cuda.cu @@ -0,0 +1,110 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cuda_helper.hpp" +#include "sync_bn_cuda_kernel.cuh" + +void SyncBNForwardMeanCUDAKernelLauncher(const Tensor input, Tensor mean) { + int num = input.size(0); + int channels = input.size(1); + int spatial = input.size(2); + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] { + sync_bn_forward_mean_cuda_kernel + <<>>( + input.data_ptr(), mean.data_ptr(), num, + channels, spatial); + }); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SyncBNForwardVarCUDAKernelLauncher(const Tensor input, const Tensor mean, + Tensor var) { + int num = input.size(0); + int channels = input.size(1); + int spatial = input.size(2); + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] { + sync_bn_forward_var_cuda_kernel + <<>>( + input.data_ptr(), mean.data_ptr(), + var.data_ptr(), num, channels, spatial); + }); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SyncBNForwardOutputCUDAKernelLauncher( + const Tensor input, const Tensor mean, const Tensor var, + Tensor running_mean, Tensor running_var, const Tensor weight, + const Tensor bias, Tensor norm, Tensor std, Tensor output, float eps, + float momentum, int group_size) { + int num = input.size(0); + int channels = input.size(1); + int spatial = input.size(2); + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "sync_bn_forward_mean_cuda_kernel", [&] { + sync_bn_forward_output_cuda_kernel + <<>>( + input.data_ptr(), mean.data_ptr(), + var.data_ptr(), running_mean.data_ptr(), + running_var.data_ptr(), weight.data_ptr(), + bias.data_ptr(), norm.data_ptr(), + std.data_ptr(), output.data_ptr(), num, + channels, spatial, eps, momentum, group_size); + }); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SyncBNBackwardParamCUDAKernelLauncher(const Tensor grad_output, + const Tensor norm, + Tensor grad_weight, + Tensor grad_bias) { + int num = grad_output.size(0); + int channels = grad_output.size(1); + int spatial = grad_output.size(2); + + at::cuda::CUDAGuard device_guard(grad_output.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "sync_bn_backward_param_cuda_kernel", [&] { + sync_bn_backward_param_cuda_kernel + <<>>( + grad_output.data_ptr(), norm.data_ptr(), + grad_weight.data_ptr(), grad_bias.data_ptr(), num, + 
channels, spatial); + }); + AT_CUDA_CHECK(cudaGetLastError()); +} + +void SyncBNBackwardDataCUDAKernelLauncher(const Tensor grad_output, + const Tensor weight, + const Tensor grad_weight, + const Tensor grad_bias, + const Tensor norm, const Tensor std, + Tensor grad_input) { + int output_size = grad_input.numel(); + int num = grad_input.size(0); + int channels = grad_input.size(1); + int spatial = grad_input.size(2); + + at::cuda::CUDAGuard device_guard(grad_input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "sync_bn_backward_data_cuda_kernel", [&] { + sync_bn_backward_data_cuda_kernel + <<>>( + output_size, grad_output.data_ptr(), + weight.data_ptr(), grad_weight.data_ptr(), + grad_bias.data_ptr(), norm.data_ptr(), + std.data_ptr(), grad_input.data_ptr(), num, + channels, spatial); + }); + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu new file mode 100644 index 0000000..839d2d8 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/three_interpolate_cuda.cu @@ -0,0 +1,66 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu + +#include +#include +#include + +#include "pytorch_cuda_helper.hpp" +#include "three_interpolate_cuda_kernel.cuh" + +void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n, + const Tensor points, + const Tensor idx, + const Tensor weight, + Tensor out) { + // points: (B, C, M) + // idx: (B, N, 3) + // weight: (B, N, 3) + // output: + // out: (B, C, N) + + at::cuda::CUDAGuard device_guard(points.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + points.scalar_type(), "three_interpolate_forward_cuda_kernel", [&] { + three_interpolate_forward_cuda_kernel + <<>>( + b, c, m, n, points.data_ptr(), idx.data_ptr(), + weight.data_ptr(), out.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m, + const Tensor grad_out, + const Tensor idx, + const Tensor weight, + Tensor grad_points) { + // grad_out: (B, C, N) + // weight: (B, N, 3) + // output: + // grad_points: (B, C, M) + + at::cuda::CUDAGuard device_guard(grad_out.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_out.scalar_type(), "three_interpolate_backward_cuda_kernel", [&] { + three_interpolate_backward_cuda_kernel + <<>>( + b, c, n, m, grad_out.data_ptr(), idx.data_ptr(), + weight.data_ptr(), grad_points.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu new file mode 100644 index 0000000..9afde8f --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/three_nn_cuda.cu @@ -0,0 +1,35 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu + +#include +#include +#include + +#include "pytorch_cuda_helper.hpp" +#include "three_nn_cuda_kernel.cuh" + +void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown, + 
const Tensor known, Tensor dist2, + Tensor idx) { + // unknown: (B, N, 3) + // known: (B, M, 3) + // output: + // dist2: (B, N, 3) + // idx: (B, N, 3) + + at::cuda::CUDAGuard device_guard(unknown.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + // blockIdx.x(col), blockIdx.y(row) + dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + unknown.scalar_type(), "three_nn_forward_cuda_kernel", [&] { + three_nn_forward_cuda_kernel<<>>( + b, n, m, unknown.data_ptr(), known.data_ptr(), + dist2.data_ptr(), idx.data_ptr()); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu new file mode 100644 index 0000000..19c85c7 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/tin_shift_cuda.cu @@ -0,0 +1,55 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cuda_helper.hpp" +#include "pytorch_device_registry.hpp" +#include "tin_shift_cuda_kernel.cuh" + +void TINShiftForwardCUDAKernelLauncher(Tensor input, Tensor shift, + Tensor output) { + int output_size = output.numel(); + int batch_size = input.size(0); + int t_size = input.size(1); + int channels = input.size(2); + int hw_size = input.size(3); + int group_size = shift.size(1); + int group_channel = channels / group_size; + int num_kernels = batch_size * hw_size * channels; + + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "tin_shift_forward_cuda_kernel", [&] { + tin_shift_forward_cuda_kernel + <<>>( + output_size, input.data_ptr(), shift.data_ptr(), + output.data_ptr(), batch_size, channels, t_size, + hw_size, group_size, group_channel); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} + +void TINShiftBackwardCUDAKernelLauncher(Tensor grad_output, Tensor shift, + Tensor grad_input) { + int output_size = grad_output.numel(); + int batch_size = grad_output.size(0); + int t_size = grad_output.size(1); + int channels = grad_output.size(2); + int hw_size = grad_output.size(3); + int group_size = shift.size(1); + int group_channel = channels / group_size; + int num_kernels = batch_size * hw_size * channels; + + at::cuda::CUDAGuard device_guard(grad_output.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad_output.scalar_type(), "tin_shift_backward_cuda_kernel", [&] { + tin_shift_backward_cuda_kernel + <<>>( + output_size, grad_output.data_ptr(), + shift.data_ptr(), grad_input.data_ptr(), + batch_size, channels, t_size, hw_size, group_size, + group_channel); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu b/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu new file mode 100644 index 0000000..ea2f088 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu @@ -0,0 +1,370 @@ +// Modified from +// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d_kernel.cu +// Copyright (c) 2019, NVIDIA Corporation. All rights reserved. +// +// This work is made available under the Nvidia Source Code License-NC. 
+// To view a copy of this license, visit +// https://nvlabs.github.io/stylegan2/license.html + +#include +#include +#include +#include +#include +#include + +#include + +static __host__ __device__ __forceinline__ int floor_div(int a, int b) { + int c = a / b; + + if (c * b > a) { + c--; + } + + return c; +} + +struct UpFirDn2DKernelParams { + int up_x; + int up_y; + int down_x; + int down_y; + int pad_x0; + int pad_x1; + int pad_y0; + int pad_y1; + + int major_dim; + int in_h; + int in_w; + int minor_dim; + int kernel_h; + int kernel_w; + int out_h; + int out_w; + int loop_major; + int loop_x; +}; + +template +__global__ void upfirdn2d_kernel_large(scalar_t *out, const scalar_t *input, + const scalar_t *kernel, + const UpFirDn2DKernelParams p) { + int minor_idx = blockIdx.x * blockDim.x + threadIdx.x; + int out_y = minor_idx / p.minor_dim; + minor_idx -= out_y * p.minor_dim; + int out_x_base = blockIdx.y * p.loop_x * blockDim.y + threadIdx.y; + int major_idx_base = blockIdx.z * p.loop_major; + + if (out_x_base >= p.out_w || out_y >= p.out_h || + major_idx_base >= p.major_dim) { + return; + } + + int mid_y = out_y * p.down_y + p.up_y - 1 - p.pad_y0; + int in_y = min(max(floor_div(mid_y, p.up_y), 0), p.in_h); + int h = min(max(floor_div(mid_y + p.kernel_h, p.up_y), 0), p.in_h) - in_y; + int kernel_y = mid_y + p.kernel_h - (in_y + 1) * p.up_y; + + for (int loop_major = 0, major_idx = major_idx_base; + loop_major < p.loop_major && major_idx < p.major_dim; + loop_major++, major_idx++) { + for (int loop_x = 0, out_x = out_x_base; + loop_x < p.loop_x && out_x < p.out_w; loop_x++, out_x += blockDim.y) { + int mid_x = out_x * p.down_x + p.up_x - 1 - p.pad_x0; + int in_x = min(max(floor_div(mid_x, p.up_x), 0), p.in_w); + int w = min(max(floor_div(mid_x + p.kernel_w, p.up_x), 0), p.in_w) - in_x; + int kernel_x = mid_x + p.kernel_w - (in_x + 1) * p.up_x; + + const scalar_t *x_p = + &input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * p.minor_dim + + minor_idx]; + const scalar_t *k_p = &kernel[kernel_y * p.kernel_w + kernel_x]; + int x_px = p.minor_dim; + int k_px = -p.up_x; + int x_py = p.in_w * p.minor_dim; + int k_py = -p.up_y * p.kernel_w; + + scalar_t v = 0.0f; + + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + v += static_cast(*x_p) * static_cast(*k_p); + x_p += x_px; + k_p += k_px; + } + + x_p += x_py - w * x_px; + k_p += k_py - w * k_px; + } + + out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + + minor_idx] = v; + } + } +} + +template +__global__ void upfirdn2d_kernel(scalar_t *out, const scalar_t *input, + const scalar_t *kernel, + const UpFirDn2DKernelParams p) { + const int tile_in_h = ((tile_out_h - 1) * down_y + kernel_h - 1) / up_y + 1; + const int tile_in_w = ((tile_out_w - 1) * down_x + kernel_w - 1) / up_x + 1; + + __shared__ volatile float sk[kernel_h][kernel_w]; + __shared__ volatile float sx[tile_in_h][tile_in_w]; + + int minor_idx = blockIdx.x; + int tile_out_y = minor_idx / p.minor_dim; + minor_idx -= tile_out_y * p.minor_dim; + tile_out_y *= tile_out_h; + int tile_out_x_base = blockIdx.y * p.loop_x * tile_out_w; + int major_idx_base = blockIdx.z * p.loop_major; + + if (tile_out_x_base >= p.out_w | tile_out_y >= p.out_h | + major_idx_base >= p.major_dim) { + return; + } + + for (int tap_idx = threadIdx.x; tap_idx < kernel_h * kernel_w; + tap_idx += blockDim.x) { + int ky = tap_idx / kernel_w; + int kx = tap_idx - ky * kernel_w; + scalar_t v = 0.0; + + if (kx < p.kernel_w & ky < p.kernel_h) { + v = kernel[(p.kernel_h - 1 - ky) * p.kernel_w + 
(p.kernel_w - 1 - kx)]; + } + + sk[ky][kx] = v; + } + + for (int loop_major = 0, major_idx = major_idx_base; + loop_major < p.loop_major & major_idx < p.major_dim; + loop_major++, major_idx++) { + for (int loop_x = 0, tile_out_x = tile_out_x_base; + loop_x < p.loop_x & tile_out_x < p.out_w; + loop_x++, tile_out_x += tile_out_w) { + int tile_mid_x = tile_out_x * down_x + up_x - 1 - p.pad_x0; + int tile_mid_y = tile_out_y * down_y + up_y - 1 - p.pad_y0; + int tile_in_x = floor_div(tile_mid_x, up_x); + int tile_in_y = floor_div(tile_mid_y, up_y); + + __syncthreads(); + + for (int in_idx = threadIdx.x; in_idx < tile_in_h * tile_in_w; + in_idx += blockDim.x) { + int rel_in_y = in_idx / tile_in_w; + int rel_in_x = in_idx - rel_in_y * tile_in_w; + int in_x = rel_in_x + tile_in_x; + int in_y = rel_in_y + tile_in_y; + + scalar_t v = 0.0; + + if (in_x >= 0 & in_y >= 0 & in_x < p.in_w & in_y < p.in_h) { + v = input[((major_idx * p.in_h + in_y) * p.in_w + in_x) * + p.minor_dim + + minor_idx]; + } + + sx[rel_in_y][rel_in_x] = v; + } + + __syncthreads(); + for (int out_idx = threadIdx.x; out_idx < tile_out_h * tile_out_w; + out_idx += blockDim.x) { + int rel_out_y = out_idx / tile_out_w; + int rel_out_x = out_idx - rel_out_y * tile_out_w; + int out_x = rel_out_x + tile_out_x; + int out_y = rel_out_y + tile_out_y; + + int mid_x = tile_mid_x + rel_out_x * down_x; + int mid_y = tile_mid_y + rel_out_y * down_y; + int in_x = floor_div(mid_x, up_x); + int in_y = floor_div(mid_y, up_y); + int rel_in_x = in_x - tile_in_x; + int rel_in_y = in_y - tile_in_y; + int kernel_x = (in_x + 1) * up_x - mid_x - 1; + int kernel_y = (in_y + 1) * up_y - mid_y - 1; + + scalar_t v = 0.0; + +#pragma unroll + for (int y = 0; y < kernel_h / up_y; y++) +#pragma unroll + for (int x = 0; x < kernel_w / up_x; x++) + v += sx[rel_in_y + y][rel_in_x + x] * + sk[kernel_y + y * up_y][kernel_x + x * up_x]; + + if (out_x < p.out_w & out_y < p.out_h) { + out[((major_idx * p.out_h + out_y) * p.out_w + out_x) * p.minor_dim + + minor_idx] = v; + } + } + } + } +} + +torch::Tensor upfirdn2d_op(const torch::Tensor &input, + const torch::Tensor &kernel, int up_x, int up_y, + int down_x, int down_y, int pad_x0, int pad_x1, + int pad_y0, int pad_y1) { + int curDevice = -1; + cudaGetDevice(&curDevice); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice); + + UpFirDn2DKernelParams p; + + auto x = input.contiguous(); + auto k = kernel.contiguous(); + + p.major_dim = x.size(0); + p.in_h = x.size(1); + p.in_w = x.size(2); + p.minor_dim = x.size(3); + p.kernel_h = k.size(0); + p.kernel_w = k.size(1); + p.up_x = up_x; + p.up_y = up_y; + p.down_x = down_x; + p.down_y = down_y; + p.pad_x0 = pad_x0; + p.pad_x1 = pad_x1; + p.pad_y0 = pad_y0; + p.pad_y1 = pad_y1; + + p.out_h = (p.in_h * p.up_y + p.pad_y0 + p.pad_y1 - p.kernel_h + p.down_y) / + p.down_y; + p.out_w = (p.in_w * p.up_x + p.pad_x0 + p.pad_x1 - p.kernel_w + p.down_x) / + p.down_x; + + auto out = + at::empty({p.major_dim, p.out_h, p.out_w, p.minor_dim}, x.options()); + + int mode = -1; + + int tile_out_h = -1; + int tile_out_w = -1; + + if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 && + p.kernel_h <= 4 && p.kernel_w <= 4) { + mode = 1; + tile_out_h = 16; + tile_out_w = 64; + } + + if (p.up_x == 1 && p.up_y == 1 && p.down_x == 1 && p.down_y == 1 && + p.kernel_h <= 3 && p.kernel_w <= 3) { + mode = 2; + tile_out_h = 16; + tile_out_w = 64; + } + + if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 && + p.kernel_h <= 4 && p.kernel_w <= 4) { + mode = 3; + 
tile_out_h = 16; + tile_out_w = 64; + } + + if (p.up_x == 2 && p.up_y == 2 && p.down_x == 1 && p.down_y == 1 && + p.kernel_h <= 2 && p.kernel_w <= 2) { + mode = 4; + tile_out_h = 16; + tile_out_w = 64; + } + + if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 && + p.kernel_h <= 4 && p.kernel_w <= 4) { + mode = 5; + tile_out_h = 8; + tile_out_w = 32; + } + + if (p.up_x == 1 && p.up_y == 1 && p.down_x == 2 && p.down_y == 2 && + p.kernel_h <= 2 && p.kernel_w <= 2) { + mode = 6; + tile_out_h = 8; + tile_out_w = 32; + } + + dim3 block_size; + dim3 grid_size; + + if (tile_out_h > 0 && tile_out_w > 0) { + p.loop_major = (p.major_dim - 1) / 16384 + 1; + p.loop_x = 1; + block_size = dim3(32 * 8, 1, 1); + grid_size = dim3(((p.out_h - 1) / tile_out_h + 1) * p.minor_dim, + (p.out_w - 1) / (p.loop_x * tile_out_w) + 1, + (p.major_dim - 1) / p.loop_major + 1); + } else { + p.loop_major = (p.major_dim - 1) / 16384 + 1; + p.loop_x = 4; + block_size = dim3(4, 32, 1); + grid_size = dim3((p.out_h * p.minor_dim - 1) / block_size.x + 1, + (p.out_w - 1) / (p.loop_x * block_size.y) + 1, + (p.major_dim - 1) / p.loop_major + 1); + } + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "upfirdn2d_cuda", [&] { + switch (mode) { + case 1: + upfirdn2d_kernel + <<>>(out.data_ptr(), + x.data_ptr(), + k.data_ptr(), p); + + break; + + case 2: + upfirdn2d_kernel + <<>>(out.data_ptr(), + x.data_ptr(), + k.data_ptr(), p); + + break; + + case 3: + upfirdn2d_kernel + <<>>(out.data_ptr(), + x.data_ptr(), + k.data_ptr(), p); + + break; + + case 4: + upfirdn2d_kernel + <<>>(out.data_ptr(), + x.data_ptr(), + k.data_ptr(), p); + + break; + + case 5: + upfirdn2d_kernel + <<>>(out.data_ptr(), + x.data_ptr(), + k.data_ptr(), p); + + break; + + case 6: + upfirdn2d_kernel + <<>>(out.data_ptr(), + x.data_ptr(), + k.data_ptr(), p); + + break; + + default: + upfirdn2d_kernel_large<<>>( + out.data_ptr(), x.data_ptr(), + k.data_ptr(), p); + } + }); + + return out; +} diff --git a/mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu b/mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu new file mode 100644 index 0000000..bcb7da3 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/cuda/voxelization_cuda.cu @@ -0,0 +1,188 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
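For orientation: both voxelization launchers below begin by bucketing every point into an integer voxel coordinate derived from voxel_size and coors_range. The device kernel that does this (dynamic_voxelize_kernel, defined in voxelization_cuda_kernel.cuh and not part of this diff) performs, per point, roughly the computation sketched below; the (z, y, x) output order is an assumption based on the usual mmdet3d convention rather than something visible in this hunk.

// Sketch (assumption): per-point voxel bucketing mirroring the grid_x/grid_y/grid_z
// setup in the launchers below. Out-of-range points get coordinate -1 and are dropped.
#include <array>
#include <cmath>

std::array<int, 3> point_to_voxel(float x, float y, float z,
                                  const std::array<float, 3> &voxel_size,    // {vx, vy, vz}
                                  const std::array<float, 6> &coors_range) { // {xmin, ymin, zmin, xmax, ymax, zmax}
  const int grid_x = static_cast<int>(std::round((coors_range[3] - coors_range[0]) / voxel_size[0]));
  const int grid_y = static_cast<int>(std::round((coors_range[4] - coors_range[1]) / voxel_size[1]));
  const int grid_z = static_cast<int>(std::round((coors_range[5] - coors_range[2]) / voxel_size[2]));

  const int cx = static_cast<int>(std::floor((x - coors_range[0]) / voxel_size[0]));
  const int cy = static_cast<int>(std::floor((y - coors_range[1]) / voxel_size[1]));
  const int cz = static_cast<int>(std::floor((z - coors_range[2]) / voxel_size[2]));

  if (cx < 0 || cx >= grid_x || cy < 0 || cy >= grid_y || cz < 0 || cz >= grid_z)
    return {-1, -1, -1};  // outside the range: no voxel assigned
  return {cz, cy, cx};    // assumed (z, y, x) storage order
}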
+#include +#include + +#include "pytorch_cuda_helper.hpp" +#include "voxelization_cuda_kernel.cuh" + +int HardVoxelizeForwardCUDAKernelLauncher( + const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors, + at::Tensor &num_points_per_voxel, const std::vector voxel_size, + const std::vector coors_range, const int max_points, + const int max_voxels, const int NDim = 3) { + // current version tooks about 0.04s for one frame on cpu + // check device + + at::cuda::CUDAGuard device_guard(points.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + const int num_points = points.size(0); + const int num_features = points.size(1); + + const float voxel_x = voxel_size[0]; + const float voxel_y = voxel_size[1]; + const float voxel_z = voxel_size[2]; + const float coors_x_min = coors_range[0]; + const float coors_y_min = coors_range[1]; + const float coors_z_min = coors_range[2]; + const float coors_x_max = coors_range[3]; + const float coors_y_max = coors_range[4]; + const float coors_z_max = coors_range[5]; + + const int grid_x = round((coors_x_max - coors_x_min) / voxel_x); + const int grid_y = round((coors_y_max - coors_y_min) / voxel_y); + const int grid_z = round((coors_z_max - coors_z_min) / voxel_z); + + // map points to voxel coors + at::Tensor temp_coors = + at::zeros({num_points, NDim}, points.options().dtype(at::kInt)); + + dim3 grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096)); + dim3 block(512); + + // 1. link point to corresponding voxel coors + AT_DISPATCH_ALL_TYPES( + points.scalar_type(), "hard_voxelize_kernel", ([&] { + dynamic_voxelize_kernel<<>>( + points.contiguous().data_ptr(), + temp_coors.contiguous().data_ptr(), voxel_x, voxel_y, voxel_z, + coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max, + coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, + NDim); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + + // 2. map point to the idx of the corresponding voxel, find duplicate coor + // create some temporary variables + auto point_to_pointidx = -at::ones( + { + num_points, + }, + points.options().dtype(at::kInt)); + auto point_to_voxelidx = -at::ones( + { + num_points, + }, + points.options().dtype(at::kInt)); + + dim3 map_grid(std::min(at::cuda::ATenCeilDiv(num_points, 512), 4096)); + dim3 map_block(512); + + AT_DISPATCH_ALL_TYPES( + temp_coors.scalar_type(), "determin_duplicate", ([&] { + point_to_voxelidx_kernel<<>>( + temp_coors.contiguous().data_ptr(), + point_to_voxelidx.contiguous().data_ptr(), + point_to_pointidx.contiguous().data_ptr(), max_points, + max_voxels, num_points, NDim); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + + // 3. determine voxel num and voxel's coor index + // make the logic in the CUDA device could accelerate about 10 times + auto coor_to_voxelidx = -at::ones( + { + num_points, + }, + points.options().dtype(at::kInt)); + auto voxel_num = at::zeros( + { + 1, + }, + points.options().dtype(at::kInt)); // must be zero from the beginning + + AT_DISPATCH_ALL_TYPES(temp_coors.scalar_type(), "determin_duplicate", ([&] { + determin_voxel_num<<<1, 1, 0, stream>>>( + num_points_per_voxel.contiguous().data_ptr(), + point_to_voxelidx.contiguous().data_ptr(), + point_to_pointidx.contiguous().data_ptr(), + coor_to_voxelidx.contiguous().data_ptr(), + voxel_num.contiguous().data_ptr(), + max_points, max_voxels, num_points); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + + // 4. 
copy point features to voxels + // Step 4 & 5 could be parallel + auto pts_output_size = num_points * num_features; + dim3 cp_grid(std::min(at::cuda::ATenCeilDiv(pts_output_size, 512), 4096)); + dim3 cp_block(512); + AT_DISPATCH_ALL_TYPES( + points.scalar_type(), "assign_point_to_voxel", ([&] { + assign_point_to_voxel<<>>( + pts_output_size, points.contiguous().data_ptr(), + point_to_voxelidx.contiguous().data_ptr(), + coor_to_voxelidx.contiguous().data_ptr(), + voxels.contiguous().data_ptr(), max_points, num_features, + num_points, NDim); + })); + // cudaDeviceSynchronize(); + // AT_CUDA_CHECK(cudaGetLastError()); + + // 5. copy coors of each voxels + auto coors_output_size = num_points * NDim; + dim3 coors_cp_grid( + std::min(at::cuda::ATenCeilDiv(coors_output_size, 512), 4096)); + dim3 coors_cp_block(512); + AT_DISPATCH_ALL_TYPES( + points.scalar_type(), "assign_point_to_voxel", ([&] { + assign_voxel_coors + <<>>( + coors_output_size, temp_coors.contiguous().data_ptr(), + point_to_voxelidx.contiguous().data_ptr(), + coor_to_voxelidx.contiguous().data_ptr(), + coors.contiguous().data_ptr(), num_points, NDim); + })); + + AT_CUDA_CHECK(cudaGetLastError()); + + auto voxel_num_cpu = voxel_num.to(at::kCPU); + int voxel_num_int = voxel_num_cpu.data_ptr()[0]; + + return voxel_num_int; +} + +void DynamicVoxelizeForwardCUDAKernelLauncher( + const at::Tensor &points, at::Tensor &coors, + const std::vector voxel_size, const std::vector coors_range, + const int NDim = 3) { + // current version tooks about 0.04s for one frame on cpu + // check device + + at::cuda::CUDAGuard device_guard(points.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + const int num_points = points.size(0); + const int num_features = points.size(1); + + const float voxel_x = voxel_size[0]; + const float voxel_y = voxel_size[1]; + const float voxel_z = voxel_size[2]; + const float coors_x_min = coors_range[0]; + const float coors_y_min = coors_range[1]; + const float coors_z_min = coors_range[2]; + const float coors_x_max = coors_range[3]; + const float coors_y_max = coors_range[4]; + const float coors_z_max = coors_range[5]; + + const int grid_x = round((coors_x_max - coors_x_min) / voxel_x); + const int grid_y = round((coors_y_max - coors_y_min) / voxel_y); + const int grid_z = round((coors_z_max - coors_z_min) / voxel_z); + + const int col_blocks = at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK); + dim3 blocks(col_blocks); + dim3 threads(THREADS_PER_BLOCK); + + AT_DISPATCH_ALL_TYPES(points.scalar_type(), "dynamic_voxelize_kernel", [&] { + dynamic_voxelize_kernel<<>>( + points.contiguous().data_ptr(), + coors.contiguous().data_ptr(), voxel_x, voxel_y, voxel_z, + coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max, + coors_z_max, grid_x, grid_y, grid_z, num_points, num_features, NDim); + }); + + AT_CUDA_CHECK(cudaGetLastError()); +} diff --git a/mmcv/ops/csrc/pytorch/deform_conv.cpp b/mmcv/ops/csrc/pytorch/deform_conv.cpp new file mode 100644 index 0000000..86690b9 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/deform_conv.cpp @@ -0,0 +1,517 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void deformable_im2col_impl(Tensor data_im, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor data_col) { + DISPATCH_DEVICE_IMPL(deformable_im2col_impl, data_im, data_offset, channels, + height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, parallel_imgs, + deformable_group, data_col); +} + +void deformable_col2im_impl(Tensor data_col, Tensor data_offset, + const int channels, const int height, + const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + Tensor grad_im) { + DISPATCH_DEVICE_IMPL(deformable_col2im_impl, data_col, data_offset, channels, + height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, parallel_imgs, + deformable_group, grad_im); +} + +void deformable_col2im_coord_impl( + Tensor data_col, Tensor data_im, Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, Tensor grad_offset) { + DISPATCH_DEVICE_IMPL(deformable_col2im_coord_impl, data_col, data_im, + data_offset, channels, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + parallel_imgs, deformable_group, grad_offset); +} + +void deform_conv_shape_check(at::Tensor input, at::Tensor offset, + at::Tensor *gradOutput, at::Tensor weight, int kH, + int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW, int group, + int deformable_group) { + TORCH_CHECK( + weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, but got: %s", + weight.ndimension()); + + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + TORCH_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", + kH, kW); + + TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", + kH, kW, weight.size(2), weight.size(3)); + + TORCH_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, + dW); + + TORCH_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + TORCH_CHECK(ndim == 3 || ndim == 4, + "3D or 4D input tensor expected but got: %s", ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + TORCH_CHECK(nInputPlane % 
deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + TORCH_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + TORCH_CHECK( + (offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + TORCH_CHECK( + gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + TORCH_CHECK( + (gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +void deform_conv_forward(Tensor input, Tensor weight, Tensor offset, + Tensor output, Tensor columns, Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + if (input.device().is_cuda()) { +#ifdef MMCV_WITH_CUDA + CHECK_CUDA_INPUT(input); + CHECK_CUDA_INPUT(offset); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(output); + CHECK_CUDA_INPUT(columns); + CHECK_CUDA_INPUT(ones); +#else + AT_ERROR("DeformConv is not compiled with GPU support"); +#endif + } else { + CHECK_CPU_INPUT(input); + CHECK_CPU_INPUT(offset); + CHECK_CPU_INPUT(weight); + CHECK_CPU_INPUT(output); + CHECK_CPU_INPUT(columns); + CHECK_CPU_INPUT(ones); + } + + deform_conv_shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, 
outputWidth}); + + Tensor output_buffer = at::zeros({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } +} + +void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput, + Tensor gradInput, Tensor gradOffset, + Tensor weight, Tensor columns, int kW, int kH, + int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + if (input.device().is_cuda()) { +#ifdef MMCV_WITH_CUDA + CHECK_CUDA_INPUT(input); + CHECK_CUDA_INPUT(offset); + CHECK_CUDA_INPUT(gradOutput); + CHECK_CUDA_INPUT(gradInput); + CHECK_CUDA_INPUT(gradOffset); + CHECK_CUDA_INPUT(weight); + CHECK_CUDA_INPUT(columns); +#else + AT_ERROR("DeformConv is not compiled with GPU support"); +#endif + } else { + CHECK_CPU_INPUT(input); + CHECK_CPU_INPUT(offset); + CHECK_CPU_INPUT(gradOutput); + CHECK_CPU_INPUT(gradInput); + CHECK_CPU_INPUT(gradOffset); + CHECK_CPU_INPUT(weight); + CHECK_CPU_INPUT(columns); + } + deform_conv_shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, group, + deformable_group); + + at::DeviceGuard guard(input.device()); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW 
- (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord_impl(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, + dH, dW, dilationH, dilationW, im2col_step, + deformable_group, gradOffset[elt]); + + deformable_col2im_impl(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, + gradInput[elt]); + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } +} + +void deform_conv_backward_parameters(Tensor input, Tensor offset, + Tensor gradOutput, Tensor gradWeight, + Tensor columns, Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, float scale, 
+ int im2col_step) { + if (input.device().is_cuda()) { +#ifdef MMCV_WITH_CUDA + CHECK_CUDA_INPUT(input); + CHECK_CUDA_INPUT(offset); + CHECK_CUDA_INPUT(gradOutput); + CHECK_CUDA_INPUT(gradWeight); + CHECK_CUDA_INPUT(columns); + CHECK_CUDA_INPUT(ones); +#else + AT_ERROR("DeformConv is not compiled with GPU support"); +#endif + } else { + CHECK_CPU_INPUT(input); + CHECK_CPU_INPUT(offset); + CHECK_CPU_INPUT(gradOutput); + CHECK_CPU_INPUT(gradWeight); + CHECK_CPU_INPUT(columns); + CHECK_CPU_INPUT(ones); + } + + deform_conv_shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, + dW, padH, padW, dilationH, dilationW, group, + deformable_group); + at::DeviceGuard guard(input.device()); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + gradOutputBuffer = gradOutputBuffer.contiguous(); + gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col_impl(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), 
columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } +} diff --git a/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp b/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp new file mode 100644 index 0000000..4fb78a9 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/deform_roi_pool.cpp @@ -0,0 +1,42 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void deform_roi_pool_forward_impl(Tensor input, Tensor rois, Tensor offset, + Tensor output, int pooled_height, + int pooled_width, float spatial_scale, + int sampling_ratio, float gamma) { + DISPATCH_DEVICE_IMPL(deform_roi_pool_forward_impl, input, rois, offset, + output, pooled_height, pooled_width, spatial_scale, + sampling_ratio, gamma); +} + +void deform_roi_pool_backward_impl(Tensor grad_output, Tensor input, + Tensor rois, Tensor offset, + Tensor grad_input, Tensor grad_offset, + int pooled_height, int pooled_width, + float spatial_scale, int sampling_ratio, + float gamma) { + DISPATCH_DEVICE_IMPL(deform_roi_pool_backward_impl, grad_output, input, rois, + offset, grad_input, grad_offset, pooled_height, + pooled_width, spatial_scale, sampling_ratio, gamma); +} + +void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset, + Tensor output, int pooled_height, int pooled_width, + float spatial_scale, int sampling_ratio, + float gamma) { + deform_roi_pool_forward_impl(input, rois, offset, output, pooled_height, + pooled_width, spatial_scale, sampling_ratio, + gamma); +} + +void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois, + Tensor offset, Tensor grad_input, + Tensor grad_offset, int pooled_height, + int pooled_width, float spatial_scale, + int sampling_ratio, float gamma) { + deform_roi_pool_backward_impl(grad_output, input, rois, offset, grad_input, + grad_offset, pooled_height, pooled_width, + spatial_scale, sampling_ratio, gamma); +} diff --git a/mmcv/ops/csrc/pytorch/focal_loss.cpp b/mmcv/ops/csrc/pytorch/focal_loss.cpp new file mode 100644 index 0000000..ed0e218 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/focal_loss.cpp @@ -0,0 +1,53 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, input, target, weight, + output, gamma, alpha); +} + +void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target, + Tensor weight, Tensor grad_input, + float gamma, float alpha) { + DISPATCH_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, input, target, weight, + grad_input, gamma, alpha); +} + +void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + DISPATCH_DEVICE_IMPL(softmax_focal_loss_forward_impl, input, target, weight, + output, gamma, alpha); +} + +void softmax_focal_loss_backward_impl(Tensor input, Tensor target, + Tensor weight, Tensor buff, + Tensor grad_input, float gamma, + float alpha) { + DISPATCH_DEVICE_IMPL(softmax_focal_loss_backward_impl, input, target, weight, + buff, grad_input, gamma, alpha); +} + +void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + sigmoid_focal_loss_forward_impl(input, target, weight, output, gamma, alpha); +} + +void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight, + Tensor grad_input, float gamma, float alpha) { + sigmoid_focal_loss_backward_impl(input, target, weight, grad_input, gamma, + alpha); +} + +void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha) { + softmax_focal_loss_forward_impl(input, target, weight, output, gamma, alpha); +} + +void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight, + Tensor buff, Tensor grad_input, float gamma, + float alpha) { + softmax_focal_loss_backward_impl(input, target, weight, buff, grad_input, + gamma, alpha); +} diff --git a/mmcv/ops/csrc/pytorch/furthest_point_sample.cpp b/mmcv/ops/csrc/pytorch/furthest_point_sample.cpp new file mode 100644 index 0000000..9c7098a --- /dev/null +++ b/mmcv/ops/csrc/pytorch/furthest_point_sample.cpp @@ -0,0 +1,34 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void furthest_point_sampling_forward_impl(Tensor points_tensor, + Tensor temp_tensor, Tensor idx_tensor, + int b, int n, int m) { + DISPATCH_DEVICE_IMPL(furthest_point_sampling_forward_impl, points_tensor, + temp_tensor, idx_tensor, b, n, m); +} + +void furthest_point_sampling_with_dist_forward_impl(Tensor points_tensor, + Tensor temp_tensor, + Tensor idx_tensor, int b, + int n, int m) { + DISPATCH_DEVICE_IMPL(furthest_point_sampling_with_dist_forward_impl, + points_tensor, temp_tensor, idx_tensor, b, n, m); +} + +void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor, + Tensor idx_tensor, int b, int n, int m) { + furthest_point_sampling_forward_impl(points_tensor, temp_tensor, idx_tensor, + b, n, m); +} + +void furthest_point_sampling_with_dist_forward(Tensor points_tensor, + Tensor temp_tensor, + Tensor idx_tensor, int b, int n, + int m) { + furthest_point_sampling_with_dist_forward_impl(points_tensor, temp_tensor, + idx_tensor, b, n, m); +} diff --git a/mmcv/ops/csrc/pytorch/fused_bias_leakyrelu.cpp b/mmcv/ops/csrc/pytorch/fused_bias_leakyrelu.cpp new file mode 100644 index 0000000..8d411c9 --- /dev/null +++ 
b/mmcv/ops/csrc/pytorch/fused_bias_leakyrelu.cpp @@ -0,0 +1,119 @@ +// Modified from +// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_bias_act.cpp + +/* +Copyright (c) 2021, NVIDIA Corporation. All rights reserved. + +NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator +Augmentation (ADA) +======================================================================= + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. + +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. Notwithstanding + the foregoing, NVIDIA and its affiliates may use the Work and any + derivative works commercially. As used herein, "non-commercially" + means for research or evaluation purposes only. + + 3.4 Patent Claims. If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grant in Section 2.1) will terminate + immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor’s or its affiliates’ names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grant in Section 2.1) will + terminate immediately. + +4. 
Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. + +======================================================================= +*/ + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +torch::Tensor fused_bias_leakyrelu_op_impl(const torch::Tensor& input, + const torch::Tensor& bias, + const torch::Tensor& refer, int act, + int grad, float alpha, float scale) { + return DISPATCH_DEVICE_IMPL(fused_bias_leakyrelu_op_impl, input, bias, refer, + act, grad, alpha, scale); +} + +torch::Tensor fused_bias_leakyrelu(const torch::Tensor& input, + const torch::Tensor& bias, + const torch::Tensor& refer, int act, + int grad, float alpha, float scale) { + return fused_bias_leakyrelu_op_impl(input, bias, refer, act, grad, alpha, + scale); +} diff --git a/mmcv/ops/csrc/pytorch/gather_points.cpp b/mmcv/ops/csrc/pytorch/gather_points.cpp new file mode 100644 index 0000000..b8fb020 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/gather_points.cpp @@ -0,0 +1,30 @@ +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void gather_points_forward_impl(int b, int c, int n, int npoints, + const Tensor points, const Tensor idx, + Tensor out) { + DISPATCH_DEVICE_IMPL(gather_points_forward_impl, b, c, n, npoints, points, + idx, out); +} + +void gather_points_backward_impl(int b, int c, int n, int npoints, + const Tensor grad_out, const Tensor idx, + Tensor grad_points) { + DISPATCH_DEVICE_IMPL(gather_points_backward_impl, b, c, n, npoints, grad_out, + idx, grad_points); +} + +void gather_points_forward(Tensor points_tensor, Tensor idx_tensor, + Tensor out_tensor, int b, int c, int n, + int npoints) { + gather_points_forward_impl(b, c, n, npoints, points_tensor, idx_tensor, + out_tensor); +} + +void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, + Tensor grad_points_tensor, int b, int c, int n, + int npoints) { + gather_points_backward_impl(b, c, n, npoints, grad_out_tensor, idx_tensor, + grad_points_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/group_points.cpp b/mmcv/ops/csrc/pytorch/group_points.cpp new file mode 100644 index 0000000..cdd190d --- /dev/null +++ b/mmcv/ops/csrc/pytorch/group_points.cpp @@ -0,0 +1,34 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
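Like the focal_loss, furthest_point_sample, and gather_points bindings above, group_points.cpp below is a thin two-layer wrapper: the public function forwards to a *_impl function, and DISPATCH_DEVICE_IMPL (from pytorch_device_registry.hpp) resolves that call against whichever backend registered an implementation for the tensors' device. A hedged sketch of adding one more op in this style follows; the op name is hypothetical, and the registration comment reflects how mmcv's CUDA bindings usually hook in, not code present in this diff.

// Sketch only: the wrapper pattern shared by the .cpp bindings in this diff.
// "example_op" is a hypothetical op, not one added by this commit.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"

void example_op_forward_impl(const Tensor input, Tensor output) {
  // Looked up at runtime in the device registry, keyed by the tensors' device type.
  DISPATCH_DEVICE_IMPL(example_op_forward_impl, input, output);
}

void example_op_forward(const Tensor input, Tensor output) {
  // Public entry point (typically exposed to Python via the pybind file); just forwards.
  example_op_forward_impl(input, output);
}

// Elsewhere (e.g. in the CUDA binding file), the device-specific implementation
// would usually be registered along the lines of:
//   void example_op_forward_cuda(const Tensor input, Tensor output);
//   REGISTER_DEVICE_IMPL(example_op_forward_impl, CUDA, example_op_forward_cuda);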
+// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void group_points_forward_impl(int b, int c, int n, int npoints, int nsample, + const Tensor points, const Tensor idx, + Tensor out) { + DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample, + points, idx, out); +} + +void group_points_backward_impl(int b, int c, int n, int npoints, int nsample, + const Tensor grad_out, const Tensor idx, + Tensor grad_points) { + DISPATCH_DEVICE_IMPL(group_points_backward_impl, b, c, n, npoints, nsample, + grad_out, idx, grad_points); +} + +void group_points_forward(Tensor points_tensor, Tensor idx_tensor, + Tensor out_tensor, int b, int c, int n, int npoints, + int nsample) { + DISPATCH_DEVICE_IMPL(group_points_forward_impl, b, c, n, npoints, nsample, + points_tensor, idx_tensor, out_tensor); +} + +void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, + Tensor grad_points_tensor, int b, int c, int n, + int npoints, int nsample) { + group_points_backward_impl(b, c, n, npoints, nsample, grad_out_tensor, + idx_tensor, grad_points_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/info.cpp b/mmcv/ops/csrc/pytorch/info.cpp new file mode 100644 index 0000000..a08d227 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/info.cpp @@ -0,0 +1,56 @@ +// Copyright (c) OpenMMLab. All rights reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp +#include "pytorch_cpp_helper.hpp" + +#ifdef MMCV_WITH_CUDA +#ifndef HIP_DIFF +#include +int get_cudart_version() { return CUDART_VERSION; } +#endif +#endif + +std::string get_compiling_cuda_version() { +#ifdef MMCV_WITH_CUDA +#ifndef HIP_DIFF + std::ostringstream oss; + // copied from + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 + auto printCudaStyleVersion = [&](int v) { + oss << (v / 1000) << "." << (v / 10 % 100); + if (v % 10 != 0) { + oss << "." << (v % 10); + } + }; + printCudaStyleVersion(get_cudart_version()); + return oss.str(); +#else + return std::string("rocm not available"); +#endif +#else + return std::string("not available"); +#endif +} + +// similar to +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp +std::string get_compiler_version() { + std::ostringstream ss; +#if defined(__GNUC__) +#ifndef __clang__ + { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } +#endif +#endif + +#if defined(__clang_major__) + { + ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." + << __clang_patchlevel__; + } +#endif + +#if defined(_MSC_VER) + { ss << "MSVC " << _MSC_FULL_VER; } +#endif + return ss.str(); +} diff --git a/mmcv/ops/csrc/pytorch/iou3d.cpp b/mmcv/ops/csrc/pytorch/iou3d.cpp new file mode 100644 index 0000000..71f5030 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/iou3d.cpp @@ -0,0 +1,151 @@ +// Modified from +// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp + +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2020. 
+*/ + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; + +void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_overlap) { + DISPATCH_DEVICE_IMPL(iou3d_boxes_overlap_bev_forward_impl, num_a, boxes_a, + num_b, boxes_b, ans_overlap); +} + +void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a, + const int num_b, const Tensor boxes_b, + Tensor ans_iou) { + DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b, + boxes_b, ans_iou); +} + +void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask, + int boxes_num, float nms_overlap_thresh) { + DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num, + nms_overlap_thresh); +} + +void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask, + int boxes_num, float nms_overlap_thresh) { + DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num, + nms_overlap_thresh); +} + +void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b, + Tensor ans_overlap) { + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + iou3d_boxes_overlap_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, + ans_overlap); +} + +void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b, + Tensor ans_iou) { + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou); +} + +void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num, + float nms_overlap_thresh) { + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + CHECK_CONTIGUOUS(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + int64_t *keep_data = keep.data_ptr<int64_t>(); + int64_t *keep_num_data = keep_num.data_ptr<int64_t>(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + Tensor mask = + at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); + unsigned long long *mask_data = + (unsigned long long *)mask.data_ptr<int64_t>(); + iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh); + + at::Tensor mask_cpu = mask.to(at::kCPU); + unsigned long long *mask_host = + (unsigned long long *)mask_cpu.data_ptr<int64_t>(); + + std::vector<unsigned long long> remv_cpu(col_blocks); + memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++) { + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))) { + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv_cpu[j] |= p[j]; + } + } + *keep_num_data = num_to_keep; + } +} + +void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num, + float nms_overlap_thresh) { + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_CONTIGUOUS(boxes); + CHECK_CONTIGUOUS(keep); + + int boxes_num = boxes.size(0); + int64_t *keep_data = keep.data_ptr<int64_t>(); + int64_t *keep_num_data = keep_num.data_ptr<int64_t>(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + Tensor mask = +
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong)); + unsigned long long *mask_data = + (unsigned long long *)mask.data_ptr<int64_t>(); + iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num, + nms_overlap_thresh); + + at::Tensor mask_cpu = mask.to(at::kCPU); + unsigned long long *mask_host = + (unsigned long long *)mask_cpu.data_ptr<int64_t>(); + + std::vector<unsigned long long> remv_cpu(col_blocks); + memset(&remv_cpu[0], 0, sizeof(unsigned long long) * col_blocks); + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++) { + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))) { + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_host[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv_cpu[j] |= p[j]; + } + } + } + + *keep_num_data = num_to_keep; +} diff --git a/mmcv/ops/csrc/pytorch/knn.cpp b/mmcv/ops/csrc/pytorch/knn.cpp new file mode 100644 index 0000000..b4be942 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/knn.cpp @@ -0,0 +1,17 @@ +// Modified from +// https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz, + const Tensor new_xyz, Tensor idx, Tensor dist2) { + DISPATCH_DEVICE_IMPL(knn_forward_impl, b, n, m, nsample, xyz, new_xyz, idx, + dist2); +} + +void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor, + Tensor dist2_tensor, int b, int n, int m, int nsample) { + knn_forward_impl(b, n, m, nsample, xyz_tensor, new_xyz_tensor, idx_tensor, + dist2_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/masked_conv2d.cpp b/mmcv/ops/csrc/pytorch/masked_conv2d.cpp new file mode 100644 index 0000000..5903925 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/masked_conv2d.cpp @@ -0,0 +1,33 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void masked_im2col_forward_impl(const Tensor im, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor col, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w) { + DISPATCH_DEVICE_IMPL(masked_im2col_forward_impl, im, mask_h_idx, mask_w_idx, + col, kernel_h, kernel_w, pad_h, pad_w); +} + +void masked_col2im_forward_impl(const Tensor col, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor im, int height, + int width, int channels) { + DISPATCH_DEVICE_IMPL(masked_col2im_forward_impl, col, mask_h_idx, mask_w_idx, + im, height, width, channels); +} + +void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor col, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w) { + masked_im2col_forward_impl(im, mask_h_idx, mask_w_idx, col, kernel_h, + kernel_w, pad_h, pad_w); +} + +void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor im, int height, + int width, int channels) { + masked_col2im_forward_impl(col, mask_h_idx, mask_w_idx, im, height, width, + channels); +} diff --git a/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp new file mode 100644 index 0000000..12b538a --- /dev/null +++ b/mmcv/ops/csrc/pytorch/modulated_deform_conv.cpp @@ -0,0 +1,237 @@ +// Copyright (c) OpenMMLab.
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void modulated_deformable_im2col_impl( + const Tensor data_im, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor data_col) { + DISPATCH_DEVICE_IMPL(modulated_deformable_im2col_impl, data_im, data_offset, + data_mask, batch_size, channels, height_im, width_im, + height_col, width_col, kernel_h, kernel_w, pad_h, pad_w, + stride_h, stride_w, dilation_h, dilation_w, + deformable_group, data_col); +} + +void modulated_deformable_col2im_impl( + const Tensor data_col, const Tensor data_offset, const Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, Tensor grad_im) { + DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_impl, data_col, data_offset, + data_mask, batch_size, channels, height_im, width_im, + height_col, width_col, kernel_h, kernel_w, pad_h, pad_w, + stride_h, stride_w, dilation_h, dilation_w, + deformable_group, grad_im); +} + +void modulated_deformable_col2im_coord_impl( + const Tensor data_col, const Tensor data_im, const Tensor data_offset, + const Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + Tensor grad_offset, Tensor grad_mask) { + DISPATCH_DEVICE_IMPL(modulated_deformable_col2im_coord_impl, data_col, + data_im, data_offset, data_mask, batch_size, channels, + height_im, width_im, height_col, width_col, kernel_h, + kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + dilation_w, deformable_group, grad_offset, grad_mask); +} + +void modulated_deform_conv_forward( + Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset, + Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, const int group, + const int deformable_group, const bool with_bias) { + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int 
width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_impl( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_backward( + Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset, + Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight, + Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) { + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... 
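+    // Note: `ones` serves as an all-ones vector of length height_out * width_out; +    // the bias gradient below is then a single addmm_ of grad_output against +    // ones.view({-1, 1}), i.e. a sum over all output locations.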
+ ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_impl( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_impl( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_impl( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} diff --git a/mmcv/ops/csrc/pytorch/ms_deform_attn.cpp b/mmcv/ops/csrc/pytorch/ms_deform_attn.cpp new file mode 100644 index 0000000..25c8f62 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/ms_deform_attn.cpp @@ -0,0 +1,60 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
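+* Shape convention (following Deformable DETR): value is +* (bs, num_keys, num_heads, embed_dims / num_heads), spatial_shapes is +* (num_levels, 2), level_start_index is (num_levels, ), sampling_loc is +* (bs, num_queries, num_heads, num_levels, num_points, 2) and attn_weight is +* (bs, num_queries, num_heads, num_levels, num_points); im2col_step splits the +* batch to bound temporary memory.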
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from +*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +Tensor ms_deform_attn_impl_forward(const Tensor &value, + const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const int im2col_step) { + return DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_forward, value, + spatial_shapes, level_start_index, sampling_loc, + attn_weight, im2col_step); +} + +void ms_deform_attn_impl_backward( + const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, const Tensor &sampling_loc, + const Tensor &attn_weight, const Tensor &grad_output, Tensor &grad_value, + Tensor &grad_sampling_loc, Tensor &grad_attn_weight, + const int im2col_step) { + DISPATCH_DEVICE_IMPL(ms_deform_attn_impl_backward, value, spatial_shapes, + level_start_index, sampling_loc, attn_weight, + grad_output, grad_value, grad_sampling_loc, + grad_attn_weight, im2col_step); +} + +Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const int im2col_step) { + at::DeviceGuard guard(value.device()); + return ms_deform_attn_impl_forward(value, spatial_shapes, level_start_index, + sampling_loc, attn_weight, im2col_step); +} + +void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const Tensor &grad_output, Tensor &grad_value, + Tensor &grad_sampling_loc, + Tensor &grad_attn_weight, const int im2col_step) { + at::DeviceGuard guard(value.device()); + ms_deform_attn_impl_backward(value, spatial_shapes, level_start_index, + sampling_loc, attn_weight, grad_output, + grad_value, grad_sampling_loc, grad_attn_weight, + im2col_step); +} diff --git a/mmcv/ops/csrc/pytorch/nms.cpp b/mmcv/ops/csrc/pytorch/nms.cpp new file mode 100644 index 0000000..199d8af --- /dev/null +++ b/mmcv/ops/csrc/pytorch/nms.cpp @@ -0,0 +1,33 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset) { + return DISPATCH_DEVICE_IMPL(nms_impl, boxes, scores, iou_threshold, offset); +} + +Tensor softnms_impl(Tensor boxes, Tensor scores, Tensor dets, + float iou_threshold, float sigma, float min_score, + int method, int offset) { + return DISPATCH_DEVICE_IMPL(softnms_impl, boxes, scores, dets, iou_threshold, + sigma, min_score, method, offset); +} + +std::vector > nms_match_impl(Tensor dets, + float iou_threshold) { + return DISPATCH_DEVICE_IMPL(nms_match_impl, dets, iou_threshold); +} + +Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset) { + return nms_impl(boxes, scores, iou_threshold, offset); +} + +Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold, + float sigma, float min_score, int method, int offset) { + return softnms_impl(boxes, scores, dets, iou_threshold, sigma, min_score, + method, offset); +} + +std::vector > nms_match(Tensor dets, float iou_threshold) { + return nms_match_impl(dets, iou_threshold); +} diff --git a/mmcv/ops/csrc/pytorch/nms_rotated.cpp b/mmcv/ops/csrc/pytorch/nms_rotated.cpp new file mode 100644 index 0000000..e4ef676 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/nms_rotated.cpp @@ -0,0 +1,32 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +// modified from +// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated.h +#include "pytorch_cpp_helper.hpp" + +Tensor nms_rotated_cpu(const Tensor dets, const Tensor scores, + const float iou_threshold); + +#ifdef MMCV_WITH_CUDA +Tensor nms_rotated_cuda(const Tensor dets, const Tensor scores, + const Tensor order, const Tensor dets_sorted, + const float iou_threshold, const int multi_label); +#endif + +// Interface for Python +// inline is needed to prevent multiple function definitions when this header is +// included by different cpps +Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order, + const Tensor dets_sorted, const float iou_threshold, + const int multi_label) { + assert(dets.device().is_cuda() == scores.device().is_cuda()); + if (dets.device().is_cuda()) { +#ifdef MMCV_WITH_CUDA + return nms_rotated_cuda(dets, scores, order, dets_sorted, iou_threshold, + multi_label); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + return nms_rotated_cpu(dets, scores, iou_threshold); +} diff --git a/mmcv/ops/csrc/pytorch/pixel_group.cpp b/mmcv/ops/csrc/pytorch/pixel_group.cpp new file mode 100755 index 0000000..2bf8c8b --- /dev/null +++ b/mmcv/ops/csrc/pytorch/pixel_group.cpp @@ -0,0 +1,26 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// It is modified from https://github.com/WenmuZhou/PAN.pytorch + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +std::vector> pixel_group_impl( + Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, + Tensor kernel_contour, int kernel_region_num, float dis_threshold) { + return DISPATCH_DEVICE_IMPL(pixel_group_impl, score, mask, embedding, + kernel_label, kernel_contour, kernel_region_num, + dis_threshold); +} + +std::vector> pixel_group( + Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, + Tensor kernel_contour, int kernel_region_num, float distance_threshold) { + score = score.contiguous(); + mask = mask.contiguous(); + embedding = embedding.contiguous(); + kernel_label = kernel_label.contiguous(); + kernel_contour = kernel_contour.contiguous(); + + return pixel_group_impl(score, mask, embedding, kernel_label, kernel_contour, + kernel_region_num, distance_threshold); +} diff --git a/mmcv/ops/csrc/pytorch/points_in_boxes.cpp b/mmcv/ops/csrc/pytorch/points_in_boxes.cpp new file mode 100644 index 0000000..540da94 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/points_in_boxes.cpp @@ -0,0 +1,44 @@ +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void points_in_boxes_part_forward_impl(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + DISPATCH_DEVICE_IMPL(points_in_boxes_part_forward_impl, batch_size, boxes_num, + pts_num, boxes, pts, box_idx_of_points); +} + +void points_in_boxes_all_forward_impl(int batch_size, int boxes_num, + int pts_num, const Tensor boxes, + const Tensor pts, + Tensor box_idx_of_points) { + DISPATCH_DEVICE_IMPL(points_in_boxes_all_forward_impl, batch_size, boxes_num, + pts_num, boxes, pts, box_idx_of_points); +} + +void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor, + Tensor box_idx_of_points_tensor) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center, each box params pts: (B, npoints, 3) + // [x, y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), + // default -1 + int batch_size = boxes_tensor.size(0); + int boxes_num = boxes_tensor.size(1); + int pts_num = pts_tensor.size(1); + points_in_boxes_part_forward_impl(batch_size, boxes_num, pts_num, + boxes_tensor, pts_tensor, + box_idx_of_points_tensor); +} + +void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor, + Tensor box_idx_of_points_tensor) { + // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR + // coordinate, z is the bottom center. params pts: (B, npoints, 3) [x, y, z] + // in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1 + int batch_size = boxes_tensor.size(0); + int boxes_num = boxes_tensor.size(1); + int pts_num = pts_tensor.size(1); + points_in_boxes_all_forward_impl(batch_size, boxes_num, pts_num, boxes_tensor, + pts_tensor, box_idx_of_points_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/psamask.cpp b/mmcv/ops/csrc/pytorch/psamask.cpp new file mode 100644 index 0000000..6064c9b --- /dev/null +++ b/mmcv/ops/csrc/pytorch/psamask.cpp @@ -0,0 +1,41 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Modified from +// https://github.com/hszhao/semseg/blob/master/lib/psa/src +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output, + const int num_, const int h_feature, + const int w_feature, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask) { + DISPATCH_DEVICE_IMPL(psamask_forward_impl, psa_type, input, output, num_, + h_feature, w_feature, h_mask, w_mask, half_h_mask, + half_w_mask); +} + +void psamask_backward_impl(const int psa_type, const Tensor grad_output, + Tensor grad_input, const int num_, + const int h_feature, const int w_feature, + const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask) { + DISPATCH_DEVICE_IMPL(psamask_backward_impl, psa_type, grad_output, grad_input, + num_, h_feature, w_feature, h_mask, w_mask, half_h_mask, + half_w_mask); +} + +void psamask_forward(const Tensor input, Tensor output, const int psa_type, + const int num_, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + psamask_forward_impl(psa_type, input, output, num_, h_feature, w_feature, + h_mask, w_mask, half_h_mask, half_w_mask); +} + +void psamask_backward(Tensor grad_output, const Tensor grad_input, + const int psa_type, const int num_, const int h_feature, + const int w_feature, const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask) { + psamask_backward_impl(psa_type, grad_output, grad_input, num_, h_feature, + w_feature, h_mask, w_mask, half_h_mask, half_w_mask); +} diff --git a/mmcv/ops/csrc/pytorch/pybind.cpp b/mmcv/ops/csrc/pytorch/pybind.cpp new file mode 100644 index 0000000..09d62d3 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/pybind.cpp @@ -0,0 +1,689 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" + +std::string get_compiler_version(); +std::string get_compiling_cuda_version(); + +void assign_score_withk_forward(const Tensor &points, const Tensor ¢ers, + const Tensor &scores, const Tensor &knn_idx, + Tensor &output, int B, int N0, int N1, int M, + int K, int O, int aggregate); + +void assign_score_withk_backward(const Tensor &grad_out, const Tensor &points, + const Tensor ¢ers, const Tensor &scores, + const Tensor &knn_idx, Tensor &grad_points, + Tensor &grad_centers, Tensor &grad_scores, + int B, int N0, int N1, int M, int K, int O, + int aggregate); + +void carafe_naive_forward(Tensor features, Tensor masks, Tensor output, + int kernel_size, int group_size, int scale_factor); + +void carafe_naive_backward(Tensor top_grad, Tensor features, Tensor masks, + Tensor bottom_grad, Tensor mask_grad, + int kernel_size, int group_size, int scale_factor); + +void carafe_forward(Tensor features, Tensor masks, Tensor rfeatures, + Tensor routput, Tensor rmasks, Tensor output, + int kernel_size, int group_size, int scale_factor); + +void carafe_backward(Tensor top_grad, Tensor rfeatures, Tensor masks, + Tensor rtop_grad, Tensor rbottom_grad_hs, + Tensor rbottom_grad, Tensor rmask_grad, Tensor bottom_grad, + Tensor mask_grad, int kernel_size, int group_size, + int scale_factor); + +void deform_conv_forward(Tensor input, Tensor weight, Tensor offset, + Tensor output, Tensor columns, Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step); + +void deform_conv_backward_input(Tensor input, Tensor offset, Tensor gradOutput, + Tensor gradInput, Tensor gradOffset, + Tensor weight, Tensor columns, int kW, int kH, + int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step); + +void deform_conv_backward_parameters(Tensor input, Tensor offset, + Tensor gradOutput, Tensor gradWeight, + Tensor columns, Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, float scale, + int im2col_step); + +void deform_roi_pool_forward(Tensor input, Tensor rois, Tensor offset, + Tensor output, int pooled_height, int pooled_width, + float spatial_scale, int sampling_ratio, + float gamma); + +void deform_roi_pool_backward(Tensor grad_output, Tensor input, Tensor rois, + Tensor offset, Tensor grad_input, + Tensor grad_offset, int pooled_height, + int pooled_width, float spatial_scale, + int sampling_ratio, float gamma); + +void group_points_forward(Tensor points_tensor, Tensor idx_tensor, + Tensor out_tensor, int b, int c, int n, int npoints, + int nsample); + +void group_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, + Tensor grad_points_tensor, int b, int c, int n, + int npoints, int nsample); + +void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature, + Tensor pooled_features, Tensor pooled_empty_flag); + +void gather_points_forward(Tensor points_tensor, Tensor idx_tensor, + Tensor out_tensor, int b, int c, int n, int npoints); + +void gather_points_backward(Tensor grad_out_tensor, Tensor idx_tensor, + Tensor grad_points_tensor, int b, int c, int n, + int npoints); + +void sigmoid_focal_loss_forward(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha); + +void sigmoid_focal_loss_backward(Tensor input, Tensor target, Tensor weight, + Tensor grad_input, float gamma, float 
alpha); + +void softmax_focal_loss_forward(Tensor input, Tensor target, Tensor weight, + Tensor output, float gamma, float alpha); + +void softmax_focal_loss_backward(Tensor input, Tensor target, Tensor weight, + Tensor buff, Tensor grad_input, float gamma, + float alpha); + +void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor, + Tensor weight_tensor, Tensor out_tensor, int b, + int c, int m, int n); + +void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor, + Tensor weight_tensor, Tensor grad_points_tensor, + int b, int c, int n, int m); + +void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor, + Tensor dist2_tensor, Tensor idx_tensor, int b, int n, + int m); + +void bbox_overlaps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, + const int mode, const bool aligned, const int offset); + +void knn_forward(Tensor xyz_tensor, Tensor new_xyz_tensor, Tensor idx_tensor, + Tensor dist2_tensor, int b, int n, int m, int nsample); +void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b, + Tensor ans_overlap); + +void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b, + Tensor ans_iou); + +void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num, + float nms_overlap_thresh); + +void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num, + float nms_overlap_thresh); + +void furthest_point_sampling_forward(Tensor points_tensor, Tensor temp_tensor, + Tensor idx_tensor, int b, int n, int m); + +void furthest_point_sampling_with_dist_forward(Tensor points_tensor, + Tensor temp_tensor, + Tensor idx_tensor, int b, int n, + int m); + +void masked_im2col_forward(const Tensor im, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor col, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w); + +void masked_col2im_forward(const Tensor col, const Tensor mask_h_idx, + const Tensor mask_w_idx, Tensor im, int height, + int width, int channels); + +void modulated_deform_conv_forward( + Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset, + Tensor mask, Tensor output, Tensor columns, int kernel_h, int kernel_w, + const int stride_h, const int stride_w, const int pad_h, const int pad_w, + const int dilation_h, const int dilation_w, const int group, + const int deformable_group, const bool with_bias); + +void modulated_deform_conv_backward( + Tensor input, Tensor weight, Tensor bias, Tensor ones, Tensor offset, + Tensor mask, Tensor columns, Tensor grad_input, Tensor grad_weight, + Tensor grad_bias, Tensor grad_offset, Tensor grad_mask, Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias); + +Tensor ms_deform_attn_forward(const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, const int im2col_step); + +void ms_deform_attn_backward(const Tensor &value, const Tensor &spatial_shapes, + const Tensor &level_start_index, + const Tensor &sampling_loc, + const Tensor &attn_weight, + const Tensor &grad_output, Tensor &grad_value, + Tensor &grad_sampling_loc, + Tensor &grad_attn_weight, const int im2col_step); + +Tensor nms(Tensor boxes, Tensor scores, float iou_threshold, int offset); + +Tensor softnms(Tensor boxes, Tensor scores, Tensor dets, float iou_threshold, + float sigma, float min_score, int method, int offset); + +std::vector> nms_match(Tensor dets, 
float iou_threshold); + +std::vector> pixel_group( + Tensor score, Tensor mask, Tensor embedding, Tensor kernel_label, + Tensor kernel_contour, int kernel_region_num, float distance_threshold); + +std::vector> contour_expand(Tensor kernel_mask, + Tensor internal_kernel_label, + int min_kernel_area, + int kernel_num); + +void roi_align_forward(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, bool aligned); + +void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, bool aligned); + +void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax, + int pooled_height, int pooled_width, float spatial_scale); + +void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax, + Tensor grad_input, int pooled_height, int pooled_width, + float spatial_scale); + +void sync_bn_forward_mean(const Tensor input, Tensor mean); + +void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var); + +void sync_bn_forward_output(const Tensor input, const Tensor mean, + const Tensor var, const Tensor weight, + const Tensor bias, Tensor running_mean, + Tensor running_var, Tensor norm, Tensor std, + Tensor output, float eps, float momentum, + int group_size); + +void sync_bn_backward_param(const Tensor grad_output, const Tensor norm, + Tensor grad_weight, Tensor grad_bias); + +void sync_bn_backward_data(const Tensor grad_output, const Tensor weight, + const Tensor grad_weight, const Tensor grad_bias, + const Tensor norm, const Tensor std, + Tensor grad_input); + +void psamask_forward(const Tensor input, Tensor output, const int psa_type, + const int num_, const int h_feature, const int w_feature, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask); + +void psamask_backward(Tensor grad_output, const Tensor grad_input, + const int psa_type, const int num_, const int h_feature, + const int w_feature, const int h_mask, const int w_mask, + const int half_h_mask, const int half_w_mask); + +void tin_shift_forward(Tensor input, Tensor shift, Tensor output); + +void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input); + +void ball_query_forward(Tensor new_xyz_tensor, Tensor xyz_tensor, + Tensor idx_tensor, int b, int n, int m, + float min_radius, float max_radius, int nsample); + +Tensor bottom_pool_forward(Tensor input); + +Tensor bottom_pool_backward(Tensor input, Tensor grad_output); + +Tensor left_pool_forward(Tensor input); + +Tensor left_pool_backward(Tensor input, Tensor grad_output); + +Tensor right_pool_forward(Tensor input); + +Tensor right_pool_backward(Tensor input, Tensor grad_output); + +Tensor top_pool_forward(Tensor input); + +Tensor top_pool_backward(Tensor input, Tensor grad_output); + +void box_iou_rotated(const Tensor boxes1, const Tensor boxes2, Tensor ious, + const int mode_flag, const bool aligned); + +Tensor nms_rotated(const Tensor dets, const Tensor scores, const Tensor order, + const Tensor dets_sorted, const float iou_threshold, + const int multi_label); + +Tensor upfirdn2d(const Tensor &input, const Tensor &kernel, int up_x, int up_y, + int down_x, int down_y, int pad_x0, int pad_x1, int pad_y0, + int pad_y1); + +Tensor fused_bias_leakyrelu(const Tensor &input, const Tensor &bias, + const Tensor &refer, int act, int grad, float 
alpha, + float scale); + +void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output, + int pooled_height, int pooled_width, + float spatial_scale, int sample_num, + bool aligned, bool clockwise); + +void roi_align_rotated_backward(Tensor grad_output, Tensor rois, + Tensor grad_input, int pooled_height, + int pooled_width, float spatial_scale, + int sample_num, bool aligned, bool clockwise); + +std::vector dynamic_point_to_voxel_forward( + const torch::Tensor &feats, const torch::Tensor &coors, + const std::string &reduce_type); + +void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats, + const torch::Tensor &grad_reduced_feats, + const torch::Tensor &feats, + const torch::Tensor &reduced_feats, + const torch::Tensor &coors_idx, + const torch::Tensor &reduce_count, + const std::string &reduce_type); + +void hard_voxelize_forward(const at::Tensor &points, + const at::Tensor &voxel_size, + const at::Tensor &coors_range, at::Tensor &voxels, + at::Tensor &coors, at::Tensor &num_points_per_voxel, + at::Tensor &voxel_num, const int max_points, + const int max_voxels, const int NDim); + +void dynamic_voxelize_forward(const at::Tensor &points, + const at::Tensor &voxel_size, + const at::Tensor &coors_range, at::Tensor &coors, + const int NDim); + +void border_align_forward(const Tensor &input, const Tensor &boxes, + Tensor output, Tensor argmax_idx, + const int pool_size); + +void border_align_backward(const Tensor &grad_output, const Tensor &boxes, + const Tensor &argmax_idx, Tensor grad_input, + const int pool_size); + +void points_in_boxes_cpu_forward(Tensor boxes_tensor, Tensor pts_tensor, + Tensor pts_indices_tensor); + +void points_in_boxes_part_forward(Tensor boxes_tensor, Tensor pts_tensor, + Tensor box_idx_of_points_tensor); + +void points_in_boxes_all_forward(Tensor boxes_tensor, Tensor pts_tensor, + Tensor box_idx_of_points_tensor); + +void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature, + Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method); + +void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax, + Tensor grad_out, Tensor grad_in, int pool_method); + +void correlation_forward(Tensor input1, Tensor input2, Tensor output, int kH, + int kW, int patchH, int patchW, int padH, int padW, + int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW); + +void correlation_backward(Tensor grad_output, Tensor input1, Tensor input2, + Tensor grad_input1, Tensor grad_input2, int kH, + int kW, int patchH, int patchW, int padH, int padW, + int dilationH, int dilationW, int dilation_patchH, + int dilation_patchW, int dH, int dW); + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("upfirdn2d", &upfirdn2d, "upfirdn2d (CUDA)", py::arg("input"), + py::arg("kernel"), py::arg("up_x"), py::arg("up_y"), py::arg("down_x"), + py::arg("down_y"), py::arg("pad_x0"), py::arg("pad_x1"), + py::arg("pad_y0"), py::arg("pad_y1")); + m.def("fused_bias_leakyrelu", &fused_bias_leakyrelu, + "fused_bias_leakyrelu (CUDA)", py::arg("input"), py::arg("bias"), + py::arg("empty"), py::arg("act"), py::arg("grad"), py::arg("alpha"), + py::arg("scale")); + m.def("gather_points_forward", &gather_points_forward, + "gather_points_forward", py::arg("points_tensor"), + py::arg("idx_tensor"), py::arg("out_tensor"), py::arg("b"), + py::arg("c"), py::arg("n"), py::arg("npoints")); + m.def("gather_points_backward", &gather_points_backward, + "gather_points_backward", py::arg("grad_out_tensor"), + py::arg("idx_tensor"), 
py::arg("grad_points_tensor"), py::arg("b"), + py::arg("c"), py::arg("n"), py::arg("npoints")); + m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); + m.def("get_compiling_cuda_version", &get_compiling_cuda_version, + "get_compiling_cuda_version"); + m.def("assign_score_withk_forward", &assign_score_withk_forward, + "assign_score_withk_forward", py::arg("points"), py::arg("centers"), + py::arg("scores"), py::arg("knn_idx"), py::arg("output"), py::arg("B"), + py::arg("N0"), py::arg("N1"), py::arg("M"), py::arg("K"), py::arg("O"), + py::arg("aggregate")); + m.def("assign_score_withk_backward", &assign_score_withk_backward, + "assign_score_withk_backward", py::arg("grad_out"), py::arg("points"), + py::arg("centers"), py::arg("scores"), py::arg("knn_idx"), + py::arg("grad_points"), py::arg("grad_centers"), py::arg("grad_scores"), + py::arg("B"), py::arg("N0"), py::arg("N1"), py::arg("M"), py::arg("K"), + py::arg("O"), py::arg("aggregate")); + m.def("knn_forward", &knn_forward, "knn_forward", py::arg("xyz_tensor"), + py::arg("new_xyz_tensor"), py::arg("idx_tensor"), + py::arg("dist2_tensor"), py::arg("b"), py::arg("n"), py::arg("m"), + py::arg("nsample")); + m.def("carafe_naive_forward", &carafe_naive_forward, "carafe_naive_forward", + py::arg("features"), py::arg("masks"), py::arg("output"), + py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor")); + m.def("carafe_naive_backward", &carafe_naive_backward, + "carafe_naive_backward", py::arg("top_grad"), py::arg("features"), + py::arg("masks"), py::arg("bottom_grad"), py::arg("mask_grad"), + py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor")); + m.def("carafe_forward", &carafe_forward, "carafe_forward", + py::arg("features"), py::arg("masks"), py::arg("rfeatures"), + py::arg("routput"), py::arg("rmasks"), py::arg("output"), + py::arg("kernel_size"), py::arg("group_size"), py::arg("scale_factor")); + m.def("carafe_backward", &carafe_backward, "carafe_backward", + py::arg("top_grad"), py::arg("rfeatures"), py::arg("masks"), + py::arg("rtop_grad"), py::arg("rbottom_grad_hs"), + py::arg("rbottom_grad"), py::arg("rmask_grad"), py::arg("bottom_grad"), + py::arg("mask_grad"), py::arg("kernel_size"), py::arg("group_size"), + py::arg("scale_factor")); + m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward", + py::arg("input"), py::arg("weight"), py::arg("offset"), + py::arg("output"), py::arg("columns"), py::arg("ones"), py::arg("kW"), + py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padH"), + py::arg("padW"), py::arg("dilationW"), py::arg("dilationH"), + py::arg("group"), py::arg("deformable_group"), py::arg("im2col_step")); + m.def("deform_conv_backward_input", &deform_conv_backward_input, + "deform_conv_backward_input", py::arg("input"), py::arg("offset"), + py::arg("gradOutput"), py::arg("gradInput"), py::arg("gradOffset"), + py::arg("weight"), py::arg("columns"), py::arg("kW"), py::arg("kH"), + py::arg("dW"), py::arg("dH"), py::arg("padH"), py::arg("padW"), + py::arg("dilationW"), py::arg("dilationH"), py::arg("group"), + py::arg("deformable_group"), py::arg("im2col_step")); + m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, + "deform_conv_backward_parameters", py::arg("input"), py::arg("offset"), + py::arg("gradOutput"), py::arg("gradWeight"), py::arg("columns"), + py::arg("ones"), py::arg("kW"), py::arg("kH"), py::arg("dW"), + py::arg("dH"), py::arg("padH"), py::arg("padW"), py::arg("dilationW"), + py::arg("dilationH"), 
py::arg("group"), py::arg("deformable_group"), + py::arg("scale"), py::arg("im2col_step")); + m.def("deform_roi_pool_forward", &deform_roi_pool_forward, + "deform roi pool forward", py::arg("input"), py::arg("rois"), + py::arg("offset"), py::arg("output"), py::arg("pooled_height"), + py::arg("pooled_width"), py::arg("spatial_scale"), + py::arg("sampling_ratio"), py::arg("gamma")); + m.def("deform_roi_pool_backward", &deform_roi_pool_backward, + "deform roi pool backward", py::arg("grad_output"), py::arg("input"), + py::arg("rois"), py::arg("offset"), py::arg("grad_input"), + py::arg("grad_offset"), py::arg("pooled_height"), + py::arg("pooled_width"), py::arg("spatial_scale"), + py::arg("sampling_ratio"), py::arg("gamma")); + m.def("roipoint_pool3d_forward", &roipoint_pool3d_forward, + "roipoint_pool3d_forward", py::arg("xyz"), py::arg("boxes3d"), + py::arg("pts_feature"), py::arg("pooled_features"), + py::arg("pooled_empty_flag")); + m.def("sigmoid_focal_loss_forward", &sigmoid_focal_loss_forward, + "sigmoid_focal_loss_forward ", py::arg("input"), py::arg("target"), + py::arg("weight"), py::arg("output"), py::arg("gamma"), + py::arg("alpha")); + m.def("sigmoid_focal_loss_backward", &sigmoid_focal_loss_backward, + "sigmoid_focal_loss_backward", py::arg("input"), py::arg("target"), + py::arg("weight"), py::arg("grad_input"), py::arg("gamma"), + py::arg("alpha")); + m.def("softmax_focal_loss_forward", &softmax_focal_loss_forward, + "softmax_focal_loss_forward", py::arg("input"), py::arg("target"), + py::arg("weight"), py::arg("output"), py::arg("gamma"), + py::arg("alpha")); + m.def("softmax_focal_loss_backward", &softmax_focal_loss_backward, + "softmax_focal_loss_backward", py::arg("input"), py::arg("target"), + py::arg("weight"), py::arg("buff"), py::arg("grad_input"), + py::arg("gamma"), py::arg("alpha")); + m.def("three_interpolate_forward", &three_interpolate_forward, + "three_interpolate_forward", py::arg("points_tensor"), + py::arg("idx_tensor"), py::arg("weight_tensor"), py::arg("out_tensor"), + py::arg("b"), py::arg("c"), py::arg("m"), py::arg("n")); + m.def("three_interpolate_backward", &three_interpolate_backward, + "three_interpolate_backward", py::arg("grad_out_tensor"), + py::arg("idx_tensor"), py::arg("weight_tensor"), + py::arg("grad_points_tensor"), py::arg("b"), py::arg("c"), py::arg("n"), + py::arg("m")); + m.def("three_nn_forward", &three_nn_forward, "three_nn_forward", + py::arg("unknown_tensor"), py::arg("known_tensor"), + py::arg("dist2_tensor"), py::arg("idx_tensor"), py::arg("b"), + py::arg("n"), py::arg("m")); + m.def("bbox_overlaps", &bbox_overlaps, "bbox_overlaps", py::arg("bboxes1"), + py::arg("bboxes2"), py::arg("ious"), py::arg("mode"), + py::arg("aligned"), py::arg("offset")); + m.def("group_points_forward", &group_points_forward, "group_points_forward", + py::arg("points_tensor"), py::arg("idx_tensor"), py::arg("out_tensor"), + py::arg("b"), py::arg("c"), py::arg("n"), py::arg("npoints"), + py::arg("nsample")); + m.def("group_points_backward", &group_points_backward, + "group_points_backward", py::arg("grad_out_tensor"), + py::arg("idx_tensor"), py::arg("grad_points_tensor"), py::arg("b"), + py::arg("c"), py::arg("n"), py::arg("npoints"), py::arg("nsample")); + m.def("knn_forward", &knn_forward, "knn_forward", py::arg("b"), py::arg("n"), + py::arg("m"), py::arg("nsample"), py::arg("xyz_tensor"), + py::arg("new_xyz_tensor"), py::arg("idx_tensor"), + py::arg("dist2_tensor")); + m.def("iou3d_boxes_overlap_bev_forward", &iou3d_boxes_overlap_bev_forward, + 
"iou3d_boxes_overlap_bev_forward", py::arg("boxes_a"), + py::arg("boxes_b"), py::arg("ans_overlap")); + m.def("iou3d_boxes_iou_bev_forward", &iou3d_boxes_iou_bev_forward, + "iou3d_boxes_iou_bev_forward", py::arg("boxes_a"), py::arg("boxes_b"), + py::arg("ans_iou")); + m.def("iou3d_nms_forward", &iou3d_nms_forward, "iou3d_nms_forward", + py::arg("boxes"), py::arg("keep"), py::arg("num_out"), + py::arg("nms_overlap_thresh")); + m.def("iou3d_nms_normal_forward", &iou3d_nms_normal_forward, + "iou3d_nms_normal_forward", py::arg("boxes"), py::arg("keep"), + py::arg("num_out"), py::arg("nms_overlap_thresh")); + m.def("furthest_point_sampling_forward", &furthest_point_sampling_forward, + "furthest_point_sampling_forward", py::arg("points_tensor"), + py::arg("temp_tensor"), py::arg("idx_tensor"), py::arg("b"), + py::arg("n"), py::arg("m")); + m.def("furthest_point_sampling_with_dist_forward", + &furthest_point_sampling_with_dist_forward, + "furthest_point_sampling_with_dist_forward", py::arg("points_tensor"), + py::arg("temp_tensor"), py::arg("idx_tensor"), py::arg("b"), + py::arg("n"), py::arg("m")); + m.def("masked_im2col_forward", &masked_im2col_forward, + "masked_im2col_forward", py::arg("im"), py::arg("mask_h_idx"), + py::arg("mask_w_idx"), py::arg("col"), py::arg("kernel_h"), + py::arg("kernel_w"), py::arg("pad_h"), py::arg("pad_w")); + m.def("masked_col2im_forward", &masked_col2im_forward, + "masked_col2im_forward", py::arg("col"), py::arg("mask_h_idx"), + py::arg("mask_w_idx"), py::arg("im"), py::arg("height"), + py::arg("width"), py::arg("channels")); + m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, + "modulated deform conv forward", py::arg("input"), py::arg("weight"), + py::arg("bias"), py::arg("ones"), py::arg("offset"), py::arg("mask"), + py::arg("output"), py::arg("columns"), py::arg("kernel_h"), + py::arg("kernel_w"), py::arg("stride_h"), py::arg("stride_w"), + py::arg("pad_h"), py::arg("pad_w"), py::arg("dilation_h"), + py::arg("dilation_w"), py::arg("group"), py::arg("deformable_group"), + py::arg("with_bias")); + m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, + "modulated deform conv backward", py::arg("input"), py::arg("weight"), + py::arg("bias"), py::arg("ones"), py::arg("offset"), py::arg("mask"), + py::arg("columns"), py::arg("grad_input"), py::arg("grad_weight"), + py::arg("grad_bias"), py::arg("grad_offset"), py::arg("grad_mask"), + py::arg("grad_output"), py::arg("kernel_h"), py::arg("kernel_w"), + py::arg("stride_h"), py::arg("stride_w"), py::arg("pad_h"), + py::arg("pad_w"), py::arg("dilation_h"), py::arg("dilation_w"), + py::arg("group"), py::arg("deformable_group"), py::arg("with_bias")); + m.def("nms", &nms, "nms (CPU/CUDA) ", py::arg("boxes"), py::arg("scores"), + py::arg("iou_threshold"), py::arg("offset")); + m.def("softnms", &softnms, "softnms (CPU) ", py::arg("boxes"), + py::arg("scores"), py::arg("dets"), py::arg("iou_threshold"), + py::arg("sigma"), py::arg("min_score"), py::arg("method"), + py::arg("offset")); + m.def("nms_match", &nms_match, "nms_match (CPU) ", py::arg("dets"), + py::arg("iou_threshold")); + m.def("pixel_group", &pixel_group, "pixel group (CPU) ", py::arg("score"), + py::arg("mask"), py::arg("embedding"), py::arg("kernel_label"), + py::arg("kernel_contour"), py::arg("kernel_region_label"), + py::arg("distance_threshold")); + m.def("contour_expand", &contour_expand, "contour exapnd (CPU) ", + py::arg("kernel_mask"), py::arg("internal_kernel_label"), + py::arg("min_kernel_area"), 
py::arg("kernel_num")); + m.def("roi_align_forward", &roi_align_forward, "roi_align forward", + py::arg("input"), py::arg("rois"), py::arg("output"), + py::arg("argmax_y"), py::arg("argmax_x"), py::arg("aligned_height"), + py::arg("aligned_width"), py::arg("spatial_scale"), + py::arg("sampling_ratio"), py::arg("pool_mode"), py::arg("aligned")); + m.def("roi_align_backward", &roi_align_backward, "roi_align backward", + py::arg("grad_output"), py::arg("rois"), py::arg("argmax_y"), + py::arg("argmax_x"), py::arg("grad_input"), py::arg("aligned_height"), + py::arg("aligned_width"), py::arg("spatial_scale"), + py::arg("sampling_ratio"), py::arg("pool_mode"), py::arg("aligned")); + m.def("roi_pool_forward", &roi_pool_forward, "roi_pool forward", + py::arg("input"), py::arg("rois"), py::arg("output"), py::arg("argmax"), + py::arg("pooled_height"), py::arg("pooled_width"), + py::arg("spatial_scale")); + m.def("roi_pool_backward", &roi_pool_backward, "roi_pool backward", + py::arg("grad_output"), py::arg("rois"), py::arg("argmax"), + py::arg("grad_input"), py::arg("pooled_height"), + py::arg("pooled_width"), py::arg("spatial_scale")); + m.def("sync_bn_forward_mean", &sync_bn_forward_mean, "sync_bn forward_mean", + py::arg("input"), py::arg("mean")); + m.def("sync_bn_forward_var", &sync_bn_forward_var, "sync_bn forward_var", + py::arg("input"), py::arg("mean"), py::arg("var")); + m.def("sync_bn_forward_output", &sync_bn_forward_output, + "sync_bn forward_output", py::arg("input"), py::arg("mean"), + py::arg("var"), py::arg("weight"), py::arg("bias"), + py::arg("running_mean"), py::arg("running_var"), py::arg("norm"), + py::arg("std"), py::arg("output"), py::arg("eps"), py::arg("momentum"), + py::arg("group_size")); + m.def("sync_bn_backward_param", &sync_bn_backward_param, + "sync_bn backward_param", py::arg("grad_output"), py::arg("norm"), + py::arg("grad_weight"), py::arg("grad_bias")); + m.def("sync_bn_backward_data", &sync_bn_backward_data, + "sync_bn backward_data", py::arg("grad_output"), py::arg("weight"), + py::arg("grad_weight"), py::arg("grad_bias"), py::arg("norm"), + py::arg("std"), py::arg("grad_input")); + m.def("psamask_forward", &psamask_forward, "PSAMASK forward (CPU/CUDA)", + py::arg("input"), py::arg("output"), py::arg("psa_type"), + py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"), + py::arg("h_mask"), py::arg("w_mask"), py::arg("half_h_mask"), + py::arg("half_w_mask")); + m.def("psamask_backward", &psamask_backward, "PSAMASK backward (CPU/CUDA)", + py::arg("grad_output"), py::arg("grad_input"), py::arg("psa_type"), + py::arg("num_"), py::arg("h_feature"), py::arg("w_feature"), + py::arg("h_mask"), py::arg("w_mask"), py::arg("half_h_mask"), + py::arg("half_w_mask")); + m.def("tin_shift_forward", &tin_shift_forward, "tin_shift forward", + py::arg("input"), py::arg("shift"), py::arg("output")); + m.def("tin_shift_backward", &tin_shift_backward, "tin_shift backward", + py::arg("grad_output"), py::arg("shift"), py::arg("grad_input")); + m.def("bottom_pool_forward", &bottom_pool_forward, "Bottom Pool Forward", + py::arg("input"), py::call_guard()); + m.def("bottom_pool_backward", &bottom_pool_backward, "Bottom Pool Backward", + py::arg("input"), py::arg("grad_output"), + py::call_guard()); + m.def("left_pool_forward", &left_pool_forward, "Left Pool Forward", + py::arg("input"), py::call_guard()); + m.def("left_pool_backward", &left_pool_backward, "Left Pool Backward", + py::arg("input"), py::arg("grad_output"), + py::call_guard()); + m.def("right_pool_forward", 
&right_pool_forward, "Right Pool Forward", + py::arg("input"), py::call_guard()); + m.def("right_pool_backward", &right_pool_backward, "Right Pool Backward", + py::arg("input"), py::arg("grad_output"), + py::call_guard()); + m.def("top_pool_forward", &top_pool_forward, "Top Pool Forward", + py::arg("input"), py::call_guard()); + m.def("top_pool_backward", &top_pool_backward, "Top Pool Backward", + py::arg("input"), py::arg("grad_output"), + py::call_guard()); + m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes", + py::arg("boxes1"), py::arg("boxes2"), py::arg("ious"), + py::arg("mode_flag"), py::arg("aligned")); + m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes", py::arg("dets"), + py::arg("scores"), py::arg("order"), py::arg("dets_sorted"), + py::arg("iou_threshold"), py::arg("multi_label")); + m.def("ball_query_forward", &ball_query_forward, "ball_query_forward", + py::arg("new_xyz_tensor"), py::arg("xyz_tensor"), py::arg("idx_tensor"), + py::arg("b"), py::arg("n"), py::arg("m"), py::arg("min_radius"), + py::arg("max_radius"), py::arg("nsample")); + m.def("roi_align_rotated_forward", &roi_align_rotated_forward, + "roi_align_rotated forward", py::arg("input"), py::arg("rois"), + py::arg("output"), py::arg("pooled_height"), py::arg("pooled_width"), + py::arg("spatial_scale"), py::arg("sample_num"), py::arg("aligned"), + py::arg("clockwise")); + m.def("roi_align_rotated_backward", &roi_align_rotated_backward, + "roi_align_rotated backward", py::arg("rois"), py::arg("grad_input"), + py::arg("grad_output"), py::arg("pooled_height"), + py::arg("pooled_width"), py::arg("spatial_scale"), + py::arg("sample_num"), py::arg("aligned"), py::arg("clockwise")); + m.def("dynamic_point_to_voxel_forward", &dynamic_point_to_voxel_forward, + "dynamic_point_to_voxel_forward", py::arg("feats"), py::arg("coors"), + py::arg("reduce_type")); + m.def("dynamic_point_to_voxel_backward", &dynamic_point_to_voxel_backward, + "dynamic_point_to_voxel_backward", py::arg("grad_feats"), + py::arg("grad_reduced_feats"), py::arg("feats"), + py::arg("reduced_feats"), py::arg("coors_idx"), py::arg("reduce_count"), + py::arg("reduce_type")); + m.def("hard_voxelize_forward", &hard_voxelize_forward, + "hard_voxelize_forward", py::arg("points"), py::arg("voxel_size"), + py::arg("coors_range"), py::arg("voxels"), py::arg("coors"), + py::arg("num_points_per_voxel"), py::arg("voxel_num"), + py::arg("max_points"), py::arg("max_voxels"), py::arg("NDim")); + m.def("dynamic_voxelize_forward", &dynamic_voxelize_forward, + "dynamic_voxelize_forward", py::arg("points"), py::arg("voxel_size"), + py::arg("coors_range"), py::arg("coors"), py::arg("NDim")); + m.def("ms_deform_attn_forward", &ms_deform_attn_forward, + "forward function of multi-scale deformable attention", + py::arg("value"), py::arg("value_spatial_shapes"), + py::arg("value_level_start_index"), py::arg("sampling_locations"), + py::arg("attention_weights"), py::arg("im2col_step")); + m.def("ms_deform_attn_backward", &ms_deform_attn_backward, + "backward function of multi-scale deformable attention", + py::arg("value"), py::arg("value_spatial_shapes"), + py::arg("value_level_start_index"), py::arg("sampling_locations"), + py::arg("attention_weights"), py::arg("grad_output"), + py::arg("grad_value"), py::arg("grad_sampling_loc"), + py::arg("grad_attn_weight"), py::arg("im2col_step")); + m.def("border_align_forward", &border_align_forward, + "forward function of border_align", py::arg("input"), py::arg("boxes"), + py::arg("output"), py::arg("argmax_idx"), 
py::arg("pool_size")); + m.def("border_align_backward", &border_align_backward, + "backward function of border_align", py::arg("grad_output"), + py::arg("boxes"), py::arg("argmax_idx"), py::arg("grad_input"), + py::arg("pool_size")); + m.def("correlation_forward", &correlation_forward, "Correlation forward", + py::arg("input1"), py::arg("input2"), py::arg("output"), py::arg("kH"), + py::arg("kW"), py::arg("patchH"), py::arg("patchW"), py::arg("padH"), + py::arg("padW"), py::arg("dilationH"), py::arg("dilationW"), + py::arg("dilation_patchH"), py::arg("dilation_patchW"), py::arg("dH"), + py::arg("dW")); + m.def("correlation_backward", &correlation_backward, "Correlation backward", + py::arg("grad_output"), py::arg("input1"), py::arg("input2"), + py::arg("grad_input1"), py::arg("grad_input2"), py::arg("kH"), + py::arg("kW"), py::arg("patchH"), py::arg("patchW"), py::arg("padH"), + py::arg("padW"), py::arg("dilationH"), py::arg("dilationW"), + py::arg("dilation_patchH"), py::arg("dilation_patchW"), py::arg("dH"), + py::arg("dW")); + m.def("points_in_boxes_cpu_forward", &points_in_boxes_cpu_forward, + "points_in_boxes_cpu_forward", py::arg("boxes_tensor"), + py::arg("pts_tensor"), py::arg("pts_indices_tensor")); + m.def("points_in_boxes_part_forward", &points_in_boxes_part_forward, + "points_in_boxes_part_forward", py::arg("boxes_tensor"), + py::arg("pts_tensor"), py::arg("box_idx_of_points_tensor")); + m.def("points_in_boxes_all_forward", &points_in_boxes_all_forward, + "points_in_boxes_all_forward", py::arg("boxes_tensor"), + py::arg("pts_tensor"), py::arg("box_idx_of_points_tensor")); + m.def("roiaware_pool3d_forward", &roiaware_pool3d_forward, + "roiaware_pool3d_forward", py::arg("rois"), py::arg("pts"), + py::arg("pts_feature"), py::arg("argmax"), py::arg("pts_idx_of_voxels"), + py::arg("pooled_features"), py::arg("pool_method")); + m.def("roiaware_pool3d_backward", &roiaware_pool3d_backward, + "roiaware_pool3d_backward", py::arg("pts_idx_of_voxels"), + py::arg("argmax"), py::arg("grad_out"), py::arg("grad_in"), + py::arg("pool_method")); +} diff --git a/mmcv/ops/csrc/pytorch/roi_align.cpp b/mmcv/ops/csrc/pytorch/roi_align.cpp new file mode 100644 index 0000000..6e70773 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/roi_align.cpp @@ -0,0 +1,41 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y, + argmax_x, aligned_height, aligned_width, spatial_scale, + sampling_ratio, pool_mode, aligned); +} + +void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + int pool_mode, bool aligned) { + DISPATCH_DEVICE_IMPL(roi_align_backward_impl, grad_output, rois, argmax_y, + argmax_x, grad_input, aligned_height, aligned_width, + spatial_scale, sampling_ratio, pool_mode, aligned); +} + +void roi_align_forward(Tensor input, Tensor rois, Tensor output, + Tensor argmax_y, Tensor argmax_x, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, bool aligned) { + roi_align_forward_impl(input, rois, output, argmax_y, argmax_x, + aligned_height, aligned_width, spatial_scale, + sampling_ratio, pool_mode, aligned); +} + +void roi_align_backward(Tensor grad_output, Tensor rois, Tensor argmax_y, + Tensor argmax_x, Tensor grad_input, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, int pool_mode, bool aligned) { + roi_align_backward_impl(grad_output, rois, argmax_y, argmax_x, grad_input, + aligned_height, aligned_width, spatial_scale, + sampling_ratio, pool_mode, aligned); +} diff --git a/mmcv/ops/csrc/pytorch/roi_align_rotated.cpp b/mmcv/ops/csrc/pytorch/roi_align_rotated.cpp new file mode 100644 index 0000000..5ef691a --- /dev/null +++ b/mmcv/ops/csrc/pytorch/roi_align_rotated.cpp @@ -0,0 +1,41 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void roi_align_rotated_forward_impl(Tensor features, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sample_ratio, + bool aligned, bool clockwise) { + DISPATCH_DEVICE_IMPL(roi_align_rotated_forward_impl, features, rois, output, + aligned_height, aligned_width, spatial_scale, + sample_ratio, aligned, clockwise); +} + +void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois, + Tensor bottom_grad, int aligned_height, + int aligned_width, float spatial_scale, + int sample_ratio, bool aligned, + bool clockwise) { + DISPATCH_DEVICE_IMPL(roi_align_rotated_backward_impl, top_grad, rois, + bottom_grad, aligned_height, aligned_width, + spatial_scale, sample_ratio, aligned, clockwise); +} + +void roi_align_rotated_forward(Tensor input, Tensor rois, Tensor output, + int aligned_height, int aligned_width, + float spatial_scale, int sampling_ratio, + bool aligned, bool clockwise) { + roi_align_rotated_forward_impl(input, rois, output, aligned_height, + aligned_width, spatial_scale, sampling_ratio, + aligned, clockwise); +} + +void roi_align_rotated_backward(Tensor top_grad, Tensor rois, + Tensor bottom_grad, int aligned_height, + int aligned_width, float spatial_scale, + int sampling_ratio, bool aligned, + bool clockwise) { + roi_align_rotated_backward_impl(top_grad, rois, bottom_grad, aligned_height, + aligned_width, spatial_scale, sampling_ratio, + aligned, clockwise); +} diff --git a/mmcv/ops/csrc/pytorch/roi_pool.cpp b/mmcv/ops/csrc/pytorch/roi_pool.cpp new file mode 100644 index 0000000..bba90b8 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/roi_pool.cpp @@ -0,0 +1,31 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output, + Tensor argmax, int pooled_height, int pooled_width, + float spatial_scale) { + DISPATCH_DEVICE_IMPL(roi_pool_forward_impl, input, rois, output, argmax, + pooled_height, pooled_width, spatial_scale); +} + +void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax, + Tensor grad_input, int pooled_height, + int pooled_width, float spatial_scale) { + DISPATCH_DEVICE_IMPL(roi_pool_backward_impl, grad_output, rois, argmax, + grad_input, pooled_height, pooled_width, spatial_scale); +} + +void roi_pool_forward(Tensor input, Tensor rois, Tensor output, Tensor argmax, + int pooled_height, int pooled_width, + float spatial_scale) { + roi_pool_forward_impl(input, rois, output, argmax, pooled_height, + pooled_width, spatial_scale); +} + +void roi_pool_backward(Tensor grad_output, Tensor rois, Tensor argmax, + Tensor grad_input, int pooled_height, int pooled_width, + float spatial_scale) { + roi_pool_backward_impl(grad_output, rois, argmax, grad_input, pooled_height, + pooled_width, spatial_scale); +} diff --git a/mmcv/ops/csrc/pytorch/roiaware_pool3d.cpp b/mmcv/ops/csrc/pytorch/roiaware_pool3d.cpp new file mode 100644 index 0000000..6cf9cf0 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/roiaware_pool3d.cpp @@ -0,0 +1,72 @@ +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void roiaware_pool3d_forward_impl(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const Tensor rois, + const Tensor pts, const Tensor pts_feature, + Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method) { + 
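  // Note: as with the other ops in this directory, this *_impl wrapper does no
  // work itself. DISPATCH_DEVICE_IMPL (from pytorch_device_registry.hpp) looks
  // up whichever implementation was registered for the device of the tensor
  // arguments and forwards everything to it, so the same entry point serves
  // CPU and CUDA builds. A backend typically registers itself with something
  // along the lines of the following sketch (illustrative only, not part of
  // this file; the CUDA function name is an assumption):
  //
  //   void roiaware_pool3d_forward_cuda(int boxes_num, int pts_num, ...);
  //   REGISTER_DEVICE_IMPL(roiaware_pool3d_forward_impl, CUDA,
  //                        roiaware_pool3d_forward_cuda);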
DISPATCH_DEVICE_IMPL(roiaware_pool3d_forward_impl, boxes_num, pts_num, + channels, max_pts_each_voxel, out_x, out_y, out_z, rois, + pts, pts_feature, argmax, pts_idx_of_voxels, + pooled_features, pool_method); +} + +void roiaware_pool3d_backward_impl(int boxes_num, int out_x, int out_y, + int out_z, int channels, + int max_pts_each_voxel, + const Tensor pts_idx_of_voxels, + const Tensor argmax, const Tensor grad_out, + Tensor grad_in, int pool_method) { + DISPATCH_DEVICE_IMPL(roiaware_pool3d_backward_impl, boxes_num, out_x, out_y, + out_z, channels, max_pts_each_voxel, pts_idx_of_voxels, + argmax, grad_out, grad_in, pool_method); +} + +void roiaware_pool3d_forward(Tensor rois, Tensor pts, Tensor pts_feature, + Tensor argmax, Tensor pts_idx_of_voxels, + Tensor pooled_features, int pool_method) { + // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR + // coordinate + // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate + // params pts_feature: (npoints, C) + // params argmax: (N, out_x, out_y, out_z, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params pooled_features: (N, out_x, out_y, out_z, C) + // params pool_method: 0: max_pool 1: avg_pool + int boxes_num = rois.size(0); + int pts_num = pts.size(0); + int channels = pts_feature.size(1); + int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter + int out_x = pts_idx_of_voxels.size(1); + int out_y = pts_idx_of_voxels.size(2); + int out_z = pts_idx_of_voxels.size(3); + assert((out_x < 256) && (out_y < 256) && + (out_z < 256)); // we encode index with 8bit + + roiaware_pool3d_forward_impl(boxes_num, pts_num, channels, max_pts_each_voxel, + out_x, out_y, out_z, rois, pts, pts_feature, + argmax, pts_idx_of_voxels, pooled_features, + pool_method); +} + +void roiaware_pool3d_backward(Tensor pts_idx_of_voxels, Tensor argmax, + Tensor grad_out, Tensor grad_in, + int pool_method) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + // params pool_method: 0: max_pool 1: avg_pool + int boxes_num = pts_idx_of_voxels.size(0); + int out_x = pts_idx_of_voxels.size(1); + int out_y = pts_idx_of_voxels.size(2); + int out_z = pts_idx_of_voxels.size(3); + int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter + int channels = grad_out.size(4); + + roiaware_pool3d_backward_impl(boxes_num, out_x, out_y, out_z, channels, + max_pts_each_voxel, pts_idx_of_voxels, argmax, + grad_out, grad_in, pool_method); +} diff --git a/mmcv/ops/csrc/pytorch/roipoint_pool3d.cpp b/mmcv/ops/csrc/pytorch/roipoint_pool3d.cpp new file mode 100644 index 0000000..a10080b --- /dev/null +++ b/mmcv/ops/csrc/pytorch/roipoint_pool3d.cpp @@ -0,0 +1,39 @@ +/* +Modified from +https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/roipoint_pool3d/src/roipoint_pool3d.cpp +Point cloud feature pooling +Written by Shaoshuai Shi +All Rights Reserved 2018. 
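
A worked example of the shape contract documented in roipoint_pool3d_forward
below (the sizes are made up): with B = 2 point clouds, N = 16384 points,
M = 64 boxes, C = 4 extra features and sampled_pts_num = 512 (taken from
pooled_features.size(2)):

  xyz:               (2, 16384, 3)
  boxes3d:           (2, 64, 7)
  pts_feature:       (2, 16384, 4)
  pooled_features:   (2, 64, 512, 3 + 4)
  pooled_empty_flag: (2, 64)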
+*/ + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void roipoint_pool3d_forward_impl(int batch_size, int pts_num, int boxes_num, + int feature_in_len, int sampled_pts_num, + const Tensor xyz, const Tensor boxes3d, + const Tensor pts_feature, + Tensor pooled_features, + Tensor pooled_empty_flag) { + DISPATCH_DEVICE_IMPL(roipoint_pool3d_forward_impl, batch_size, pts_num, + boxes_num, feature_in_len, sampled_pts_num, xyz, boxes3d, + pts_feature, pooled_features, pooled_empty_flag); +} + +void roipoint_pool3d_forward(Tensor xyz, Tensor boxes3d, Tensor pts_feature, + Tensor pooled_features, Tensor pooled_empty_flag) { + // params xyz: (B, N, 3) + // params boxes3d: (B, M, 7) + // params pts_feature: (B, N, C) + // params pooled_features: (B, M, 512, 3+C) + // params pooled_empty_flag: (B, M) + int batch_size = xyz.size(0); + int pts_num = xyz.size(1); + int boxes_num = boxes3d.size(1); + int feature_in_len = pts_feature.size(2); + int sampled_pts_num = pooled_features.size(2); + + roipoint_pool3d_forward_impl(batch_size, pts_num, boxes_num, feature_in_len, + sampled_pts_num, xyz, boxes3d, pts_feature, + pooled_features, pooled_empty_flag); +} diff --git a/mmcv/ops/csrc/pytorch/scatter_points.cpp b/mmcv/ops/csrc/pytorch/scatter_points.cpp new file mode 100644 index 0000000..0de8ebf --- /dev/null +++ b/mmcv/ops/csrc/pytorch/scatter_points.cpp @@ -0,0 +1,53 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; + +std::vector dynamic_point_to_voxel_forward_impl( + const torch::Tensor &feats, const torch::Tensor &coors, + const reduce_t reduce_type) { + return DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_forward_impl, feats, coors, + reduce_type); +} + +void dynamic_point_to_voxel_backward_impl( + torch::Tensor &grad_feats, const torch::Tensor &grad_reduced_feats, + const torch::Tensor &feats, const torch::Tensor &reduced_feats, + const torch::Tensor &coors_idx, const torch::Tensor &reduce_count, + const reduce_t reduce_type) { + DISPATCH_DEVICE_IMPL(dynamic_point_to_voxel_backward_impl, grad_feats, + grad_reduced_feats, feats, reduced_feats, coors_idx, + reduce_count, reduce_type); +} + +inline reduce_t convert_reduce_type(const std::string &reduce_type) { + if (reduce_type == "max") + return reduce_t::MAX; + else if (reduce_type == "sum") + return reduce_t::SUM; + else if (reduce_type == "mean") + return reduce_t::MEAN; + else + TORCH_CHECK(false, "do not support reduce type " + reduce_type) + return reduce_t::SUM; +} + +std::vector dynamic_point_to_voxel_forward( + const torch::Tensor &feats, const torch::Tensor &coors, + const std::string &reduce_type) { + return dynamic_point_to_voxel_forward_impl(feats, coors, + convert_reduce_type(reduce_type)); +} + +void dynamic_point_to_voxel_backward(torch::Tensor &grad_feats, + const torch::Tensor &grad_reduced_feats, + const torch::Tensor &feats, + const torch::Tensor &reduced_feats, + const torch::Tensor &coors_idx, + const torch::Tensor &reduce_count, + const std::string &reduce_type) { + dynamic_point_to_voxel_backward_impl(grad_feats, grad_reduced_feats, feats, + reduced_feats, coors_idx, reduce_count, + convert_reduce_type(reduce_type)); +} diff --git a/mmcv/ops/csrc/pytorch/sync_bn.cpp b/mmcv/ops/csrc/pytorch/sync_bn.cpp new file mode 100644 index 0000000..fd5a513 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/sync_bn.cpp @@ -0,0 +1,69 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void sync_bn_forward_mean_impl(const Tensor input, Tensor mean) { + DISPATCH_DEVICE_IMPL(sync_bn_forward_mean_impl, input, mean); +} + +void sync_bn_forward_var_impl(const Tensor input, const Tensor mean, + Tensor var) { + DISPATCH_DEVICE_IMPL(sync_bn_forward_var_impl, input, mean, var); +} + +void sync_bn_forward_output_impl(const Tensor input, const Tensor mean, + const Tensor var, Tensor running_mean, + Tensor running_var, const Tensor weight, + const Tensor bias, Tensor norm, Tensor std, + Tensor output, float eps, float momentum, + int group_size) { + DISPATCH_DEVICE_IMPL(sync_bn_forward_output_impl, input, mean, var, + running_mean, running_var, weight, bias, norm, std, + output, eps, momentum, group_size); +} + +void sync_bn_backward_param_impl(const Tensor grad_output, const Tensor norm, + Tensor grad_weight, Tensor grad_bias) { + DISPATCH_DEVICE_IMPL(sync_bn_backward_param_impl, grad_output, norm, + grad_weight, grad_bias); +} + +void sync_bn_backward_data_impl(const Tensor grad_output, const Tensor weight, + const Tensor grad_weight, + const Tensor grad_bias, const Tensor norm, + const Tensor std, Tensor grad_input) { + DISPATCH_DEVICE_IMPL(sync_bn_backward_data_impl, grad_output, weight, + grad_weight, grad_bias, norm, std, grad_input); +} + +void sync_bn_forward_mean(const Tensor input, Tensor mean) { + sync_bn_forward_mean_impl(input, mean); +} + +void sync_bn_forward_var(const Tensor input, const Tensor mean, Tensor var) { + sync_bn_forward_var_impl(input, mean, var); +} + +void sync_bn_forward_output(const Tensor input, const Tensor mean, + const Tensor var, const Tensor weight, + const Tensor bias, Tensor running_mean, + Tensor running_var, Tensor norm, Tensor std, + Tensor output, float eps, float momentum, + int group_size) { + sync_bn_forward_output_impl(input, mean, var, running_mean, running_var, + weight, bias, norm, std, output, eps, momentum, + group_size); +} + +void sync_bn_backward_param(const Tensor grad_output, const Tensor norm, + Tensor grad_weight, Tensor grad_bias) { + sync_bn_backward_param_impl(grad_output, norm, grad_weight, grad_bias); +} + +void sync_bn_backward_data(const Tensor grad_output, const Tensor weight, + const Tensor grad_weight, const Tensor grad_bias, + const Tensor norm, const Tensor std, + Tensor grad_input) { + sync_bn_backward_data_impl(grad_output, weight, grad_weight, grad_bias, norm, + std, grad_input); +} diff --git a/mmcv/ops/csrc/pytorch/three_interpolate.cpp b/mmcv/ops/csrc/pytorch/three_interpolate.cpp new file mode 100644 index 0000000..1e0ec71 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/three_interpolate.cpp @@ -0,0 +1,33 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void three_interpolate_forward_impl(int b, int c, int m, int n, + const Tensor points, const Tensor idx, + const Tensor weight, Tensor out) { + DISPATCH_DEVICE_IMPL(three_interpolate_forward_impl, b, c, m, n, points, idx, + weight, out); +} + +void three_interpolate_backward_impl(int b, int c, int n, int m, + const Tensor grad_out, const Tensor idx, + const Tensor weight, Tensor grad_points) { + DISPATCH_DEVICE_IMPL(three_interpolate_backward_impl, b, c, n, m, grad_out, + idx, weight, grad_points); +} + +void three_interpolate_forward(Tensor points_tensor, Tensor idx_tensor, + Tensor weight_tensor, Tensor out_tensor, int 
b, + int c, int m, int n) { + three_interpolate_forward_impl(b, c, m, n, points_tensor, idx_tensor, + weight_tensor, out_tensor); +} + +void three_interpolate_backward(Tensor grad_out_tensor, Tensor idx_tensor, + Tensor weight_tensor, Tensor grad_points_tensor, + int b, int c, int n, int m) { + three_interpolate_backward_impl(b, c, n, m, grad_out_tensor, idx_tensor, + weight_tensor, grad_points_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/three_nn.cpp b/mmcv/ops/csrc/pytorch/three_nn.cpp new file mode 100644 index 0000000..b629200 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/three_nn.cpp @@ -0,0 +1,18 @@ +// Modified from +// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void three_nn_forward_impl(int b, int n, int m, const Tensor unknown, + const Tensor known, Tensor dist2, Tensor idx) { + DISPATCH_DEVICE_IMPL(three_nn_forward_impl, b, n, m, unknown, known, dist2, + idx); +} + +void three_nn_forward(Tensor unknown_tensor, Tensor known_tensor, + Tensor dist2_tensor, Tensor idx_tensor, int b, int n, + int m) { + three_nn_forward_impl(b, n, m, unknown_tensor, known_tensor, dist2_tensor, + idx_tensor); +} diff --git a/mmcv/ops/csrc/pytorch/tin_shift.cpp b/mmcv/ops/csrc/pytorch/tin_shift.cpp new file mode 100644 index 0000000..b03f587 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/tin_shift.cpp @@ -0,0 +1,20 @@ +// Copyright (c) OpenMMLab. All rights reserved +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output) { + DISPATCH_DEVICE_IMPL(tin_shift_forward_impl, input, shift, output); +} + +void tin_shift_backward_impl(Tensor grad_output, Tensor shift, + Tensor grad_input) { + DISPATCH_DEVICE_IMPL(tin_shift_backward_impl, grad_output, shift, grad_input); +} + +void tin_shift_forward(Tensor input, Tensor shift, Tensor output) { + tin_shift_forward_impl(input, shift, output); +} + +void tin_shift_backward(Tensor grad_output, Tensor shift, Tensor grad_input) { + tin_shift_backward_impl(grad_output, shift, grad_input); +} diff --git a/mmcv/ops/csrc/pytorch/upfirdn2d.cpp b/mmcv/ops/csrc/pytorch/upfirdn2d.cpp new file mode 100644 index 0000000..dd325bd --- /dev/null +++ b/mmcv/ops/csrc/pytorch/upfirdn2d.cpp @@ -0,0 +1,118 @@ +// Modified from +// https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.cpp + +/* +Copyright (c) 2021, NVIDIA Corporation. All rights reserved. + +NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator +Augmentation (ADA) +======================================================================= + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. 
+ +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. Notwithstanding + the foregoing, NVIDIA and its affiliates may use the Work and any + derivative works commercially. As used herein, "non-commercially" + means for research or evaluation purposes only. + + 3.4 Patent Claims. If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grant in Section 2.1) will terminate + immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor’s or its affiliates’ names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grant in Section 2.1) will + terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. 
+ +======================================================================= +*/ + +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +torch::Tensor upfirdn2d_op_impl(const torch::Tensor& input, + const torch::Tensor& kernel, int up_x, int up_y, + int down_x, int down_y, int pad_x0, int pad_x1, + int pad_y0, int pad_y1) { + return DISPATCH_DEVICE_IMPL(upfirdn2d_op_impl, input, kernel, up_x, up_y, + down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1); +} + +torch::Tensor upfirdn2d(const torch::Tensor& input, const torch::Tensor& kernel, + int up_x, int up_y, int down_x, int down_y, int pad_x0, + int pad_x1, int pad_y0, int pad_y1) { + return upfirdn2d_op_impl(input, kernel, up_x, up_y, down_x, down_y, pad_x0, + pad_x1, pad_y0, pad_y1); +} diff --git a/mmcv/ops/csrc/pytorch/voxelization.cpp b/mmcv/ops/csrc/pytorch/voxelization.cpp new file mode 100644 index 0000000..1d1c229 --- /dev/null +++ b/mmcv/ops/csrc/pytorch/voxelization.cpp @@ -0,0 +1,56 @@ +// Copyright (c) OpenMMLab. All rights reserved. +#include "pytorch_cpp_helper.hpp" +#include "pytorch_device_registry.hpp" + +int hard_voxelize_forward_impl(const at::Tensor &points, at::Tensor &voxels, + at::Tensor &coors, + at::Tensor &num_points_per_voxel, + const std::vector voxel_size, + const std::vector coors_range, + const int max_points, const int max_voxels, + const int NDim = 3) { + return DISPATCH_DEVICE_IMPL(hard_voxelize_forward_impl, points, voxels, coors, + num_points_per_voxel, voxel_size, coors_range, + max_points, max_voxels, NDim); +} + +void dynamic_voxelize_forward_impl(const at::Tensor &points, at::Tensor &coors, + const std::vector voxel_size, + const std::vector coors_range, + const int NDim = 3) { + DISPATCH_DEVICE_IMPL(dynamic_voxelize_forward_impl, points, coors, voxel_size, + coors_range, NDim); +} + +void hard_voxelize_forward(const at::Tensor &points, + const at::Tensor &voxel_size, + const at::Tensor &coors_range, at::Tensor &voxels, + at::Tensor &coors, at::Tensor &num_points_per_voxel, + at::Tensor &voxel_num, const int max_points, + const int max_voxels, const int NDim = 3) { + int64_t *voxel_num_data = voxel_num.data_ptr(); + std::vector voxel_size_v( + voxel_size.data_ptr(), + voxel_size.data_ptr() + voxel_size.numel()); + std::vector coors_range_v( + coors_range.data_ptr(), + coors_range.data_ptr() + coors_range.numel()); + + *voxel_num_data = hard_voxelize_forward_impl( + points, voxels, coors, num_points_per_voxel, voxel_size_v, coors_range_v, + max_points, max_voxels, NDim); +} + +void dynamic_voxelize_forward(const at::Tensor &points, + const at::Tensor &voxel_size, + const at::Tensor &coors_range, at::Tensor &coors, + const int NDim = 3) { + std::vector voxel_size_v( + voxel_size.data_ptr(), + voxel_size.data_ptr() + voxel_size.numel()); + std::vector coors_range_v( + coors_range.data_ptr(), + coors_range.data_ptr() + coors_range.numel()); + dynamic_voxelize_forward_impl(points, coors, voxel_size_v, coors_range_v, + NDim); +} diff --git a/mmcv/ops/deform_conv.py b/mmcv/ops/deform_conv.py new file mode 100644 index 0000000..bca9b56 --- /dev/null +++ b/mmcv/ops/deform_conv.py @@ -0,0 +1,405 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair, _single + +from mmcv.utils import deprecated_api_warning +from ..models.bricks import CONV_LAYERS +from ..utils import ext_loader, print_log + +ext_module = ext_loader.load_ext('_ext', [ + 'deform_conv_forward', 'deform_conv_backward_input', + 'deform_conv_backward_parameters' +]) + + +class DeformConv2dFunction(Function): + + @staticmethod + def symbolic(g, + input, + offset, + weight, + stride, + padding, + dilation, + groups, + deform_groups, + bias=False, + im2col_step=32): + return g.op( + 'mmcv::MMCVDeformConv2d', + input, + offset, + weight, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + groups_i=groups, + deform_groups_i=deform_groups, + bias_i=bias, + im2col_step_i=im2col_step) + + @staticmethod + def forward(ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1, + bias=False, + im2col_step=32): + if input is not None and input.dim() != 4: + raise ValueError( + f'Expected 4D tensor as input, got {input.dim()}D tensor \ + instead.') + assert bias is False, 'Only support bias is False.' + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deform_groups = deform_groups + ctx.im2col_step = im2col_step + + # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; + # amp won't cast the type of model (float32), but "offset" is cast + # to float16 by nn.Conv2d automatically, leading to the type + # mismatch with input (when it is float32) or weight. + # The flag for whether to use fp16 or amp is the type of "offset", + # we cast weight and input to temporarily support fp16 and amp + # whatever the pytorch version is. 
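        # For example, under torch.cuda.amp.autocast() the offset produced by
        # an nn.Conv2d branch comes out as float16 while the raw input may
        # still be float32; type_as(offset) below moves input (and then weight)
        # onto that same dtype before calling the extension.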
+ input = input.type_as(offset) + weight = weight.type_as(input) + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConv2dFunction._output_size(ctx, input, weight)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + cur_im2col_step = min(ctx.im2col_step, input.size(0)) + assert (input.size(0) % cur_im2col_step + ) == 0, 'batch size must be divisible by im2col_step' + ext_module.deform_conv_forward( + input, + weight, + offset, + output, + ctx.bufs_[0], + ctx.bufs_[1], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + im2col_step=cur_im2col_step) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + cur_im2col_step = min(ctx.im2col_step, input.size(0)) + assert (input.size(0) % cur_im2col_step + ) == 0, 'batch size must be divisible by im2col_step' + + grad_output = grad_output.contiguous() + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + ext_module.deform_conv_backward_input( + input, + offset, + grad_output, + grad_input, + grad_offset, + weight, + ctx.bufs_[0], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + im2col_step=cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + ext_module.deform_conv_backward_parameters( + input, + offset, + grad_output, + grad_weight, + ctx.bufs_[0], + ctx.bufs_[1], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + scale=1, + im2col_step=cur_im2col_step) + + return grad_input, grad_offset, grad_weight, \ + None, None, None, None, None, None, None + + @staticmethod + def _output_size(ctx, input, weight): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = ctx.padding[d] + kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = ctx.stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + 'convolution input is too small (output would be ' + + 'x'.join(map(str, output_size)) + ')') + return output_size + + +deform_conv2d = DeformConv2dFunction.apply + + +class DeformConv2d(nn.Module): + r"""Deformable 2D convolution. + + Applies a deformable 2D convolution over an input signal composed of + several input planes. DeformConv2d was described in the paper + `Deformable Convolutional Networks + `_ + + Note: + The argument ``im2col_step`` was added in version 1.3.17, which means + number of samples processed by the ``im2col_cuda_kernel`` per call. + It enables users to define ``batch_size`` and ``im2col_step`` more + flexibly and solved `issue mmcv#1440 + `_. + + Args: + in_channels (int): Number of channels in the input image. 
+ out_channels (int): Number of channels produced by the convolution. + kernel_size(int, tuple): Size of the convolving kernel. + stride(int, tuple): Stride of the convolution. Default: 1. + padding (int or tuple): Zero-padding added to both sides of the input. + Default: 0. + dilation (int or tuple): Spacing between kernel elements. Default: 1. + groups (int): Number of blocked connections from input. + channels to output channels. Default: 1. + deform_groups (int): Number of deformable group partitions. + bias (bool): If True, adds a learnable bias to the output. + Default: False. + im2col_step (int): Number of samples processed by im2col_cuda_kernel + per call. It will work when ``batch_size`` > ``im2col_step``, but + ``batch_size`` must be divisible by ``im2col_step``. Default: 32. + `New in version 1.3.17.` + """ + + @deprecated_api_warning({'deformable_groups': 'deform_groups'}, + cls_name='DeformConv2d') + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, ...]], + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1, + deform_groups: int = 1, + bias: bool = False, + im2col_step: int = 32) -> None: + super(DeformConv2d, self).__init__() + + assert not bias, \ + f'bias={bias} is not supported in DeformConv2d.' + assert in_channels % groups == 0, \ + f'in_channels {in_channels} cannot be divisible by groups {groups}' + assert out_channels % groups == 0, \ + f'out_channels {out_channels} cannot be divisible by groups \ + {groups}' + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deform_groups = deform_groups + self.im2col_step = im2col_step + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + # only weight, no bias + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + # switch the initialization of `self.weight` to the standard kaiming + # method described in `Delving deep into rectifiers: Surpassing + # human-level performance on ImageNet classification` - He, K. et al. + # (2015), using a uniform distribution + nn.init.kaiming_uniform_(self.weight, nonlinearity='relu') + + def forward(self, x: Tensor, offset: Tensor) -> Tensor: + """Deformable Convolutional forward function. + + Args: + x (Tensor): Input feature, shape (B, C_in, H_in, W_in) + offset (Tensor): Offset for deformable convolution, shape + (B, deform_groups*kernel_size[0]*kernel_size[1]*2, + H_out, W_out), H_out, W_out are equal to the output's. + + An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Returns: + Tensor: Output of the layer. 
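
        Example (illustrative sketch only; the offset branch and shapes are
        assumptions, and the underlying extension generally needs a CUDA
        build):

            >>> import torch
            >>> from torch import nn
            >>> x = torch.randn(2, 16, 32, 32).cuda()
            >>> dcn = DeformConv2d(16, 32, kernel_size=3, padding=1).cuda()
            >>> # offsets are usually predicted by a small conv branch:
            >>> # 2 * deform_groups * kH * kW = 18 channels for a 3x3 kernel
            >>> conv_offset = nn.Conv2d(16, 18, 3, padding=1).cuda()
            >>> offset = conv_offset(x)   # (2, 18, 32, 32)
            >>> out = dcn(x, offset)      # (2, 32, 32, 32)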
+ """ + # To fix an assert error in deform_conv_cuda.cpp:128 + # input image is smaller than kernel + input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) < + self.kernel_size[1]) + if input_pad: + pad_h = max(self.kernel_size[0] - x.size(2), 0) + pad_w = max(self.kernel_size[1] - x.size(3), 0) + x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() + offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0) + offset = offset.contiguous() + out = deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + if input_pad: + out = out[:, :, :out.size(2) - pad_h, :out.size(3) - + pad_w].contiguous() + return out + + def __repr__(self): + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels},\n' + s += f'out_channels={self.out_channels},\n' + s += f'kernel_size={self.kernel_size},\n' + s += f'stride={self.stride},\n' + s += f'padding={self.padding},\n' + s += f'dilation={self.dilation},\n' + s += f'groups={self.groups},\n' + s += f'deform_groups={self.deform_groups},\n' + # bias is not supported in DeformConv2d. + s += 'bias=False)' + return s + + +@CONV_LAYERS.register_module('DCN') +class DeformConv2dPack(DeformConv2d): + """A Deformable Conv Encapsulation that acts as normal Conv layers. + + The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if norm_cfg is None, otherwise + False. + """ + + _version = 2 + + def __init__(self, *args, **kwargs): + super(DeformConv2dPack, self).__init__(*args, **kwargs) + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + dilation=_pair(self.dilation), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + offset = self.conv_offset(x) + return deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + + if version is None or version < 2: + # the key is different in early versions + # In version < 2, DeformConvPack loads previous benchmark models. 
+ if (prefix + 'conv_offset.weight' not in state_dict + and prefix[:-1] + '_offset.weight' in state_dict): + state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( + prefix[:-1] + '_offset.weight') + if (prefix + 'conv_offset.bias' not in state_dict + and prefix[:-1] + '_offset.bias' in state_dict): + state_dict[prefix + + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + + '_offset.bias') + + if version is not None and version > 1: + print_log( + f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to ' + 'version 2.', + logger='root') + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) diff --git a/mmcv/ops/focal_loss.py b/mmcv/ops/focal_loss.py new file mode 100644 index 0000000..763bc93 --- /dev/null +++ b/mmcv/ops/focal_loss.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward', + 'softmax_focal_loss_forward', 'softmax_focal_loss_backward' +]) + + +class SigmoidFocalLossFunction(Function): + + @staticmethod + def symbolic(g, input, target, gamma, alpha, weight, reduction): + return g.op( + 'mmcv::MMCVSigmoidFocalLoss', + input, + target, + gamma_f=gamma, + alpha_f=alpha, + weight_f=weight, + reduction_s=reduction) + + @staticmethod + def forward(ctx, + input, + target, + gamma=2.0, + alpha=0.25, + weight=None, + reduction='mean'): + + assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert input.dim() == 2 + assert target.dim() == 1 + assert input.size(0) == target.size(0) + if weight is None: + weight = input.new_empty(0) + else: + assert weight.dim() == 1 + assert input.size(1) == weight.size(0) + ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} + assert reduction in ctx.reduction_dict.keys() + + ctx.gamma = float(gamma) + ctx.alpha = float(alpha) + ctx.reduction = ctx.reduction_dict[reduction] + + output = input.new_zeros(input.size()) + + ext_module.sigmoid_focal_loss_forward( + input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha) + if ctx.reduction == ctx.reduction_dict['mean']: + output = output.sum() / input.size(0) + elif ctx.reduction == ctx.reduction_dict['sum']: + output = output.sum() + ctx.save_for_backward(input, target, weight) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, target, weight = ctx.saved_tensors + + grad_input = input.new_zeros(input.size()) + + ext_module.sigmoid_focal_loss_backward( + input, + target, + weight, + grad_input, + gamma=ctx.gamma, + alpha=ctx.alpha) + + grad_input *= grad_output + if ctx.reduction == ctx.reduction_dict['mean']: + grad_input /= input.size(0) + return grad_input, None, None, None, None, None + + +sigmoid_focal_loss = SigmoidFocalLossFunction.apply + + +class SigmoidFocalLoss(nn.Module): + + def __init__(self, gamma, alpha, weight=None, reduction='mean'): + super(SigmoidFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + self.register_buffer('weight', weight) + self.reduction = reduction + + def forward(self, input, target): + return sigmoid_focal_loss(input, target, self.gamma, self.alpha, + self.weight, self.reduction) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(gamma={self.gamma}, ' + s += f'alpha={self.alpha}, ' + s += 
f'reduction={self.reduction})' + return s + + +class SoftmaxFocalLossFunction(Function): + + @staticmethod + def symbolic(g, input, target, gamma, alpha, weight, reduction): + return g.op( + 'mmcv::MMCVSoftmaxFocalLoss', + input, + target, + gamma_f=gamma, + alpha_f=alpha, + weight_f=weight, + reduction_s=reduction) + + @staticmethod + def forward(ctx, + input, + target, + gamma=2.0, + alpha=0.25, + weight=None, + reduction='mean'): + + assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert input.dim() == 2 + assert target.dim() == 1 + assert input.size(0) == target.size(0) + if weight is None: + weight = input.new_empty(0) + else: + assert weight.dim() == 1 + assert input.size(1) == weight.size(0) + ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} + assert reduction in ctx.reduction_dict.keys() + + ctx.gamma = float(gamma) + ctx.alpha = float(alpha) + ctx.reduction = ctx.reduction_dict[reduction] + + channel_stats, _ = torch.max(input, dim=1) + input_softmax = input - channel_stats.unsqueeze(1).expand_as(input) + input_softmax.exp_() + + channel_stats = input_softmax.sum(dim=1) + input_softmax /= channel_stats.unsqueeze(1).expand_as(input) + + output = input.new_zeros(input.size(0)) + ext_module.softmax_focal_loss_forward( + input_softmax, + target, + weight, + output, + gamma=ctx.gamma, + alpha=ctx.alpha) + + if ctx.reduction == ctx.reduction_dict['mean']: + output = output.sum() / input.size(0) + elif ctx.reduction == ctx.reduction_dict['sum']: + output = output.sum() + ctx.save_for_backward(input_softmax, target, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + input_softmax, target, weight = ctx.saved_tensors + buff = input_softmax.new_zeros(input_softmax.size(0)) + grad_input = input_softmax.new_zeros(input_softmax.size()) + + ext_module.softmax_focal_loss_backward( + input_softmax, + target, + weight, + buff, + grad_input, + gamma=ctx.gamma, + alpha=ctx.alpha) + + grad_input *= grad_output + if ctx.reduction == ctx.reduction_dict['mean']: + grad_input /= input_softmax.size(0) + return grad_input, None, None, None, None, None + + +softmax_focal_loss = SoftmaxFocalLossFunction.apply + + +class SoftmaxFocalLoss(nn.Module): + + def __init__(self, gamma, alpha, weight=None, reduction='mean'): + super(SoftmaxFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + self.register_buffer('weight', weight) + self.reduction = reduction + + def forward(self, input, target): + return softmax_focal_loss(input, target, self.gamma, self.alpha, + self.weight, self.reduction) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(gamma={self.gamma}, ' + s += f'alpha={self.alpha}, ' + s += f'reduction={self.reduction})' + return s diff --git a/mmcv/ops/iou3d.py b/mmcv/ops/iou3d.py new file mode 100644 index 0000000..8c4cc82 --- /dev/null +++ b/mmcv/ops/iou3d.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'iou3d_boxes_iou_bev_forward', 'iou3d_nms_forward', + 'iou3d_nms_normal_forward' +]) + + +def boxes_iou_bev(boxes_a, boxes_b): + """Calculate boxes IoU in the Bird's Eye View. + + Args: + boxes_a (torch.Tensor): Input boxes a with shape (M, 5). + boxes_b (torch.Tensor): Input boxes b with shape (N, 5). + + Returns: + ans_iou (torch.Tensor): IoU result with shape (M, N). 
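
    Example (illustrative sketch; the box values are made up and the
    underlying op assumes contiguous CUDA tensors):

        >>> import torch
        >>> boxes_a = torch.tensor([[0., 0., 2., 2., 0.]]).cuda()
        >>> boxes_b = torch.tensor([[1., 1., 3., 3., 0.]]).cuda()
        >>> iou = boxes_iou_bev(boxes_a, boxes_b)  # shape (1, 1)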
+ """ + ans_iou = boxes_a.new_zeros( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))) + + ext_module.iou3d_boxes_iou_bev_forward(boxes_a.contiguous(), + boxes_b.contiguous(), ans_iou) + + return ans_iou + + +def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None): + """NMS function GPU implementation (for BEV boxes). The overlap of two + boxes for IoU calculation is defined as the exact overlapping area of the + two boxes. In this function, one can also set ``pre_max_size`` and + ``post_max_size``. + + Args: + boxes (torch.Tensor): Input boxes with the shape of [N, 5] + ([x1, y1, x2, y2, ry]). + scores (torch.Tensor): Scores of boxes with the shape of [N]. + thresh (float): Overlap threshold of NMS. + pre_max_size (int, optional): Max size of boxes before NMS. + Default: None. + post_max_size (int, optional): Max size of boxes after NMS. + Default: None. + + Returns: + torch.Tensor: Indexes after NMS. + """ + assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' + order = scores.sort(0, descending=True)[1] + + if pre_max_size is not None: + order = order[:pre_max_size] + boxes = boxes[order].contiguous() + + keep = torch.zeros(boxes.size(0), dtype=torch.long) + num_out = torch.zeros(size=(), dtype=torch.long) + ext_module.iou3d_nms_forward( + boxes, keep, num_out, nms_overlap_thresh=thresh) + keep = order[keep[:num_out].cuda(boxes.device)].contiguous() + if post_max_size is not None: + keep = keep[:post_max_size] + return keep + + +def nms_normal_bev(boxes, scores, thresh): + """Normal NMS function GPU implementation (for BEV boxes). The overlap of + two boxes for IoU calculation is defined as the exact overlapping area of + the two boxes WITH their yaw angle set to 0. + + Args: + boxes (torch.Tensor): Input boxes with shape (N, 5). + scores (torch.Tensor): Scores of predicted boxes with shape (N). + thresh (float): Overlap threshold of NMS. + + Returns: + torch.Tensor: Remaining indices with scores in descending order. + """ + assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.zeros(boxes.size(0), dtype=torch.long) + num_out = torch.zeros(size=(), dtype=torch.long) + ext_module.iou3d_nms_normal_forward( + boxes, keep, num_out, nms_overlap_thresh=thresh) + return order[keep[:num_out].cuda(boxes.device)].contiguous() diff --git a/mmcv/ops/iou3d_det/__init__.py b/mmcv/ops/iou3d_det/__init__.py new file mode 100644 index 0000000..9c35fb7 --- /dev/null +++ b/mmcv/ops/iou3d_det/__init__.py @@ -0,0 +1,3 @@ +from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu + +__all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu'] diff --git a/mmcv/ops/iou3d_det/iou3d_utils.py b/mmcv/ops/iou3d_det/iou3d_utils.py new file mode 100644 index 0000000..6f36019 --- /dev/null +++ b/mmcv/ops/iou3d_det/iou3d_utils.py @@ -0,0 +1,71 @@ +import torch + +from . import iou3d_cuda + + +def boxes_iou_bev(boxes_a, boxes_b): + """Calculate boxes IoU in the bird view. + + Args: + boxes_a (torch.Tensor): Input boxes a with shape (M, 5). + boxes_b (torch.Tensor): Input boxes b with shape (N, 5). + + Returns: + ans_iou (torch.Tensor): IoU result with shape (M, N). + """ + ans_iou = boxes_a.new_zeros( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))) + + iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(), + ans_iou) + + return ans_iou + + +def nms_gpu(boxes, scores, thresh, pre_maxsize=None, post_max_size=None): + """Nms function with gpu implementation. 
+ + Args: + boxes (torch.Tensor): Input boxes with the shape of [N, 5] + ([x1, y1, x2, y2, ry]). + scores (torch.Tensor): Scores of boxes with the shape of [N]. + thresh (int): Threshold. + pre_maxsize (int): Max size of boxes before nms. Default: None. + post_maxsize (int): Max size of boxes after nms. Default: None. + + Returns: + torch.Tensor: Indexes after nms. + """ + order = scores.sort(0, descending=True)[1] + + if pre_maxsize is not None: + order = order[:pre_maxsize] + boxes = boxes[order].contiguous() + + keep = torch.zeros(boxes.size(0), dtype=torch.long) + num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh, boxes.device.index) + keep = order[keep[:num_out].cuda(boxes.device)].contiguous() + if post_max_size is not None: + keep = keep[:post_max_size] + return keep + + +def nms_normal_gpu(boxes, scores, thresh): + """Normal non maximum suppression on GPU. + + Args: + boxes (torch.Tensor): Input boxes with shape (N, 5). + scores (torch.Tensor): Scores of predicted boxes with shape (N). + thresh (torch.Tensor): Threshold of non maximum suppression. + + Returns: + torch.Tensor: Remaining indices with scores in descending order. + """ + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.zeros(boxes.size(0), dtype=torch.long) + num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh, + boxes.device.index) + return order[keep[:num_out].cuda(boxes.device)].contiguous() diff --git a/mmcv/ops/iou3d_det/src/iou3d.cpp b/mmcv/ops/iou3d_det/src/iou3d.cpp new file mode 100644 index 0000000..25a5cd9 --- /dev/null +++ b/mmcv/ops/iou3d_det/src/iou3d.cpp @@ -0,0 +1,210 @@ +// Modified from +// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp + +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2020. 
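
Note on the bookkeeping in nms_gpu()/nms_normal_gpu() below: suppression
decisions are packed into 64-bit words (THREADS_PER_BLOCK_NMS =
8 * sizeof(unsigned long long) = 64). Each box owns
col_blocks = DIVUP(boxes_num, 64) words, with bit j of word b in a box's row
meaning "this box suppresses box b * 64 + j". For example, boxes_num = 100
gives col_blocks = 2, so the device mask buffer is 100 * 2 * 8 = 1600 bytes.
The host loop then walks the rows in order (the Python wrapper has already
sorted the boxes by descending score) and keeps a box only if no previously
kept box has set its bit.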
+*/ + +#include +#include +#include +#include + +#include +#include + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#define CHECK_ERROR(ans) \ + { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, + bool abort = true) { + if (code != cudaSuccess) { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, + line); + if (abort) exit(code); + } +} + +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; + +void boxesoverlapLauncher(const int num_a, const float *boxes_a, + const int num_b, const float *boxes_b, + float *ans_overlap); +void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, + const float *boxes_b, float *ans_iou); +void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num, + float nms_overlap_thresh); +void nmsNormalLauncher(const float *boxes, unsigned long long *mask, + int boxes_num, float nms_overlap_thresh); + +int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, + at::Tensor ans_overlap) { + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_overlap); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float *boxes_a_data = boxes_a.data_ptr(); + const float *boxes_b_data = boxes_b.data_ptr(); + float *ans_overlap_data = ans_overlap.data_ptr(); + + boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, + ans_overlap_data); + + return 1; +} + +int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, + at::Tensor ans_iou) { + // params boxes_a: (N, 5) [x1, y1, x2, y2, ry] + // params boxes_b: (M, 5) + // params ans_overlap: (N, M) + + CHECK_INPUT(boxes_a); + CHECK_INPUT(boxes_b); + CHECK_INPUT(ans_iou); + + int num_a = boxes_a.size(0); + int num_b = boxes_b.size(0); + + const float *boxes_a_data = boxes_a.data_ptr(); + const float *boxes_b_data = boxes_b.data_ptr(); + float *ans_iou_data = ans_iou.data_ptr(); + + boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data); + + return 1; +} + +int nms_gpu(at::Tensor boxes, at::Tensor keep, + float nms_overlap_thresh, int device_id) { + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + cudaSetDevice(device_id); + + int boxes_num = boxes.size(0); + const float *boxes_data = boxes.data_ptr(); + int64_t *keep_data = keep.data_ptr(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void **)&mask_data, + boxes_num * col_blocks * sizeof(unsigned long long))); + nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * + // col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + + // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, + boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long *remv_cpu = new unsigned long 
long[col_blocks](); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++) { + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))) { + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv_cpu[j] |= p[j]; + } + } + } + delete[] remv_cpu; + if (cudaSuccess != cudaGetLastError()) printf("Error!\n"); + + return num_to_keep; +} + +int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, + float nms_overlap_thresh, int device_id) { + // params boxes: (N, 5) [x1, y1, x2, y2, ry] + // params keep: (N) + + CHECK_INPUT(boxes); + CHECK_CONTIGUOUS(keep); + cudaSetDevice(device_id); + + int boxes_num = boxes.size(0); + const float *boxes_data = boxes.data_ptr(); + int64_t *keep_data = keep.data_ptr(); + + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + + unsigned long long *mask_data = NULL; + CHECK_ERROR(cudaMalloc((void **)&mask_data, + boxes_num * col_blocks * sizeof(unsigned long long))); + nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh); + + // unsigned long long mask_cpu[boxes_num * col_blocks]; + // unsigned long long *mask_cpu = new unsigned long long [boxes_num * + // col_blocks]; + std::vector mask_cpu(boxes_num * col_blocks); + + // printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks); + CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, + boxes_num * col_blocks * sizeof(unsigned long long), + cudaMemcpyDeviceToHost)); + + cudaFree(mask_data); + + unsigned long long *remv_cpu = new unsigned long long[col_blocks](); + + int num_to_keep = 0; + + for (int i = 0; i < boxes_num; i++) { + int nblock = i / THREADS_PER_BLOCK_NMS; + int inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_cpu[nblock] & (1ULL << inblock))) { + keep_data[num_to_keep++] = i; + unsigned long long *p = &mask_cpu[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv_cpu[j] |= p[j]; + } + } + } + delete[] remv_cpu; + if (cudaSuccess != cudaGetLastError()) printf("Error!\n"); + + return num_to_keep; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, + "oriented boxes overlap"); + m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou"); + m.def("nms_gpu", &nms_gpu, "oriented nms gpu"); + m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu"); +} diff --git a/mmcv/ops/iou3d_det/src/iou3d_kernel.cu b/mmcv/ops/iou3d_det/src/iou3d_kernel.cu new file mode 100644 index 0000000..861aea3 --- /dev/null +++ b/mmcv/ops/iou3d_det/src/iou3d_kernel.cu @@ -0,0 +1,439 @@ +// Modified from +// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu + +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2020. 
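
The rotated-overlap code below follows a standard convex-polygon clipping
recipe: collect every intersection point between the edges of the two rotated
rectangles (intersection), add each corner of one box that lies inside the
other (check_in_box2d), sort the collected points counter-clockwise around
their centroid (point_cmp), and accumulate the polygon area with the
shoelace / cross-product formula

    area = 0.5 * | sum_k cross(p_k - p_0, p_{k+1} - p_0) |

which is what box_overlap returns before iou_bev turns it into an IoU.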
+*/ + +#include +#define THREADS_PER_BLOCK 16 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +//#define DEBUG +const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; +__device__ const float EPS = 1e-8; +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(double _x, double _y) { x = _x, y = _y; } + + __device__ void set(float _x, float _y) { + x = _x; + y = _y; + } + + __device__ Point operator+(const Point &b) const { + return Point(x + b.x, y + b.y); + } + + __device__ Point operator-(const Point &b) const { + return Point(x - b.x, y - b.y); + } +}; + +__device__ inline float cross(const Point &a, const Point &b) { + return a.x * b.y - a.y * b.x; +} + +__device__ inline float cross(const Point &p1, const Point &p2, + const Point &p0) { + return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); +} + +__device__ int check_rect_cross(const Point &p1, const Point &p2, + const Point &q1, const Point &q2) { + int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && + min(q1.x, q2.x) <= max(p1.x, p2.x) && + min(p1.y, p2.y) <= max(q1.y, q2.y) && + min(q1.y, q2.y) <= max(p1.y, p2.y); + return ret; +} + +__device__ inline int check_in_box2d(const float *box, const Point &p) { + // params: box (5) [x1, y1, x2, y2, angle] + const float MARGIN = 1e-5; + + float center_x = (box[0] + box[2]) / 2; + float center_y = (box[1] + box[3]) / 2; + float angle_cos = cos(-box[4]), + angle_sin = + sin(-box[4]); // rotate the point in the opposite direction of box + float rot_x = + (p.x - center_x) * angle_cos + (p.y - center_y) * angle_sin + center_x; + float rot_y = + -(p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; +#ifdef DEBUG + printf("box: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", box[0], box[1], box[2], + box[3], box[4]); + printf( + "center: (%.3f, %.3f), cossin(%.3f, %.3f), src(%.3f, %.3f), rot(%.3f, " + "%.3f)\n", + center_x, center_y, angle_cos, angle_sin, p.x, p.y, rot_x, rot_y); +#endif + return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && + rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); +} + +__device__ inline int intersection(const Point &p1, const Point &p0, + const Point &q1, const Point &q0, + Point &ans) { + // fast exclusion + if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; + + // check cross standing + float s1 = cross(q0, p1, p0); + float s2 = cross(p1, q1, p0); + float s3 = cross(p0, q1, q0); + float s4 = cross(q1, p1, q0); + + if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; + + // calculate intersection of two lines + float s5 = cross(q1, p1, p0); + if (fabs(s5 - s1) > EPS) { + ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); + ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); + + } else { + float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; + float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; + float D = a0 * b1 - a1 * b0; + + ans.x = (b0 * c1 - b1 * c0) / D; + ans.y = (a1 * c0 - a0 * c1) / D; + } + + return 1; +} + +__device__ inline void rotate_around_center(const Point ¢er, + const float angle_cos, + const float angle_sin, Point &p) { + float new_x = + (p.x - center.x) * angle_cos + (p.y - center.y) * angle_sin + center.x; + float new_y = + -(p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; + p.set(new_x, new_y); +} + +__device__ inline int point_cmp(const Point &a, const Point &b, + const Point ¢er) { + return atan2(a.y - center.y, a.x - center.x) > + atan2(b.y - center.y, b.x - center.x); +} + +__device__ inline float box_overlap(const float *box_a, 
const float *box_b) { + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + + float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], + a_angle = box_a[4]; + float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], + b_angle = box_b[4]; + + Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); + Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); +#ifdef DEBUG + printf( + "a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", + a_x1, a_y1, a_x2, a_y2, a_angle, b_x1, b_y1, b_x2, b_y2, b_angle); + printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, + center_b.x, center_b.y); +#endif + + Point box_a_corners[5]; + box_a_corners[0].set(a_x1, a_y1); + box_a_corners[1].set(a_x2, a_y1); + box_a_corners[2].set(a_x2, a_y2); + box_a_corners[3].set(a_x1, a_y2); + + Point box_b_corners[5]; + box_b_corners[0].set(b_x1, b_y1); + box_b_corners[1].set(b_x2, b_y1); + box_b_corners[2].set(b_x2, b_y2); + box_b_corners[3].set(b_x1, b_y2); + + // get oriented corners + float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); + float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); + + for (int k = 0; k < 4; k++) { +#ifdef DEBUG + printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, + box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, + box_b_corners[k].y); +#endif + rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); + rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); +#ifdef DEBUG + printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, + box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y); +#endif + } + + box_a_corners[4] = box_a_corners[0]; + box_b_corners[4] = box_b_corners[0]; + + // get intersection of lines + Point cross_points[16]; + Point poly_center; + int cnt = 0, flag = 0; + + poly_center.set(0, 0); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + flag = intersection(box_a_corners[i + 1], box_a_corners[i], + box_b_corners[j + 1], box_b_corners[j], + cross_points[cnt]); + if (flag) { + poly_center = poly_center + cross_points[cnt]; + cnt++; + } + } + } + + // check corners + for (int k = 0; k < 4; k++) { + if (check_in_box2d(box_a, box_b_corners[k])) { + poly_center = poly_center + box_b_corners[k]; + cross_points[cnt] = box_b_corners[k]; + cnt++; + } + if (check_in_box2d(box_b, box_a_corners[k])) { + poly_center = poly_center + box_a_corners[k]; + cross_points[cnt] = box_a_corners[k]; + cnt++; + } + } + + poly_center.x /= cnt; + poly_center.y /= cnt; + + // sort the points of polygon + Point temp; + for (int j = 0; j < cnt - 1; j++) { + for (int i = 0; i < cnt - j - 1; i++) { + if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { + temp = cross_points[i]; + cross_points[i] = cross_points[i + 1]; + cross_points[i + 1] = temp; + } + } + } + +#ifdef DEBUG + printf("cnt=%d\n", cnt); + for (int i = 0; i < cnt; i++) { + printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, + cross_points[i].y); + } +#endif + + // get the overlap areas + float area = 0; + for (int k = 0; k < cnt - 1; k++) { + area += cross(cross_points[k] - cross_points[0], + cross_points[k + 1] - cross_points[0]); + } + + return fabs(area) / 2.0; +} + +__device__ inline float iou_bev(const float *box_a, const float *box_b) { + // params: box_a (5) [x1, y1, x2, y2, angle] + // params: box_b (5) [x1, y1, x2, y2, angle] + float sa = (box_a[2] - box_a[0]) * (box_a[3] - 
box_a[1]); + float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); + float s_overlap = box_overlap(box_a, box_b); + return s_overlap / fmaxf(sa + sb - s_overlap, EPS); +} + +__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, + const int num_b, const float *boxes_b, + float *ans_overlap) { + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + const float *cur_box_a = boxes_a + a_idx * 5; + const float *cur_box_b = boxes_b + b_idx * 5; + float s_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = s_overlap; +} + +__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, + const int num_b, const float *boxes_b, + float *ans_iou) { + const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; + const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + + if (a_idx >= num_a || b_idx >= num_b) { + return; + } + + const float *cur_box_a = boxes_a + a_idx * 5; + const float *cur_box_b = boxes_b + b_idx * 5; + float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); + ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; +} + +__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh, + const float *boxes, unsigned long long *mask) { + // params: boxes (N, 5) [x1, y1, x2, y2, ry] + // params: mask (N, N/THREADS_PER_BLOCK_NMS) + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 5; + + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +__device__ inline float iou_normal(float const *const a, float const *const b) { + float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); + float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0]) * (a[3] - a[1]); + float Sb = (b[2] - b[0]) * (b[3] - b[1]); + return interS / fmaxf(Sa + Sb - interS, EPS); +} + +__global__ void nms_normal_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned 
long long *mask) {
+  // params: boxes (N, 5) [x1, y1, x2, y2, ry]
+  // params: mask (N, N/THREADS_PER_BLOCK_NMS)
+
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS,
+                             THREADS_PER_BLOCK_NMS);
+  const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS,
+                             THREADS_PER_BLOCK_NMS);
+
+  __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5];
+
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
+    const float *cur_box = boxes + cur_box_idx * 5;
+
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+    mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+void boxesoverlapLauncher(const int num_a, const float *boxes_a,
+                          const int num_b, const float *boxes_b,
+                          float *ans_overlap) {
+  dim3 blocks(
+      DIVUP(num_b, THREADS_PER_BLOCK),
+      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+  boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
+                                            ans_overlap);
+#ifdef DEBUG
+  cudaDeviceSynchronize();  // for using printf in kernel function
+#endif
+}
+
+void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
+                         const float *boxes_b, float *ans_iou) {
+  dim3 blocks(
+      DIVUP(num_b, THREADS_PER_BLOCK),
+      DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
+  dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
+
+  boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b,
+                                            ans_iou);
+}
+
+void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
+                 float nms_overlap_thresh) {
+  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
+              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
+}
+
+void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
+                       int boxes_num, float nms_overlap_thresh) {
+  dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
+              DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
+  dim3 threads(THREADS_PER_BLOCK_NMS);
+  nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes,
+                                         mask);
+}
diff --git a/mmcv/ops/masked_conv.py b/mmcv/ops/masked_conv.py
new file mode 100644
index 0000000..cd514cc
--- /dev/null
+++ b/mmcv/ops/masked_conv.py
@@ -0,0 +1,111 @@
+# Copyright (c) OpenMMLab. All rights reserved.
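+# Illustrative sketch (not part of this module): the rotated-NMS kernels in
+# mmcv/ops/iou3d_det above emit a (boxes_num x col_blocks) bitmask in which
+# bit j of mask[i][b] marks box b*64+j as overlapping box i beyond the
+# threshold. The host-side reduction performed in nms_gpu can be mirrored in
+# pure Python roughly as follows (function name and layout are hypothetical):
+#
+#   def reduce_nms_mask(mask, boxes_num, bits=64):
+#       col_blocks = (boxes_num + bits - 1) // bits
+#       remv = [0] * col_blocks            # accumulated suppression bits
+#       keep = []
+#       for i in range(boxes_num):         # boxes assumed sorted by score
+#           nblock, inblock = divmod(i, bits)
+#           if not (remv[nblock] >> inblock) & 1:
+#               keep.append(i)             # box i survives; suppress overlaps
+#               for j in range(nblock, col_blocks):
+#                   remv[j] |= mask[i][j]
+#       return keep
+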
+import math + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['masked_im2col_forward', 'masked_col2im_forward']) + + +class MaskedConv2dFunction(Function): + + @staticmethod + def symbolic(g, features, mask, weight, bias, padding, stride): + return g.op( + 'mmcv::MMCVMaskedConv2d', + features, + mask, + weight, + bias, + padding_i=padding, + stride_i=stride) + + @staticmethod + def forward(ctx, features, mask, weight, bias, padding=0, stride=1): + assert mask.dim() == 3 and mask.size(0) == 1 + assert features.dim() == 4 and features.size(0) == 1 + assert features.size()[2:] == mask.size()[1:] + pad_h, pad_w = _pair(padding) + stride_h, stride_w = _pair(stride) + if stride_h != 1 or stride_w != 1: + raise ValueError( + 'Stride could not only be 1 in masked_conv2d currently.') + out_channel, in_channel, kernel_h, kernel_w = weight.size() + + batch_size = features.size(0) + out_h = int( + math.floor((features.size(2) + 2 * pad_h - + (kernel_h - 1) - 1) / stride_h + 1)) + out_w = int( + math.floor((features.size(3) + 2 * pad_w - + (kernel_h - 1) - 1) / stride_w + 1)) + mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False) + output = features.new_zeros(batch_size, out_channel, out_h, out_w) + if mask_inds.numel() > 0: + mask_h_idx = mask_inds[:, 0].contiguous() + mask_w_idx = mask_inds[:, 1].contiguous() + data_col = features.new_zeros(in_channel * kernel_h * kernel_w, + mask_inds.size(0)) + ext_module.masked_im2col_forward( + features, + mask_h_idx, + mask_w_idx, + data_col, + kernel_h=kernel_h, + kernel_w=kernel_w, + pad_h=pad_h, + pad_w=pad_w) + + masked_output = torch.addmm(1, bias[:, None], 1, + weight.view(out_channel, -1), data_col) + ext_module.masked_col2im_forward( + masked_output, + mask_h_idx, + mask_w_idx, + output, + height=out_h, + width=out_w, + channels=out_channel) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + return (None, ) * 5 + + +masked_conv2d = MaskedConv2dFunction.apply + + +class MaskedConv2d(nn.Conv2d): + """A MaskedConv2d which inherits the official Conv2d. + + The masked forward doesn't implement the backward function and only + supports the stride parameter to be 1 currently. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super(MaskedConv2d, + self).__init__(in_channels, out_channels, kernel_size, stride, + padding, dilation, groups, bias) + + def forward(self, input, mask=None): + if mask is None: # fallback to the normal Conv2d + return super(MaskedConv2d, self).forward(input) + else: + return masked_conv2d(input, mask, self.weight, self.bias, + self.padding) diff --git a/mmcv/ops/modulated_deform_conv.py b/mmcv/ops/modulated_deform_conv.py new file mode 100644 index 0000000..2681bc8 --- /dev/null +++ b/mmcv/ops/modulated_deform_conv.py @@ -0,0 +1,282 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
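+# Usage sketch for the MaskedConv2d module defined above (illustrative only;
+# it assumes the compiled `_ext` ops, a CUDA device, and the single-sample
+# shapes enforced by MaskedConv2dFunction: features (1, C, H, W), mask (1, H, W)):
+#
+#   >>> import torch
+#   >>> conv = MaskedConv2d(16, 32, kernel_size=3, padding=1).cuda()
+#   >>> feats = torch.randn(1, 16, 64, 64).cuda()
+#   >>> mask = (torch.rand(1, 64, 64) > 0.5).float().cuda()
+#   >>> out = conv(feats, mask)   # convolution evaluated only at masked positions
+#   >>> dense = conv(feats)       # mask=None falls back to nn.Conv2d.forward
+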
+import math + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair, _single + +from mmcv.utils import deprecated_api_warning +from ..models import CONV_LAYERS +from ..utils import ext_loader, print_log + +ext_module = ext_loader.load_ext( + '_ext', + ['modulated_deform_conv_forward', 'modulated_deform_conv_backward']) + + +class ModulatedDeformConv2dFunction(Function): + + @staticmethod + def symbolic(g, input, offset, mask, weight, bias, stride, padding, + dilation, groups, deform_groups): + input_tensors = [input, offset, mask, weight] + if bias is not None: + input_tensors.append(bias) + return g.op( + 'mmcv::MMCVModulatedDeformConv2d', + *input_tensors, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + groups_i=groups, + deform_groups_i=deform_groups) + + @staticmethod + def forward(ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1): + if input is not None and input.dim() != 4: + raise ValueError( + f'Expected 4D tensor as input, got {input.dim()}D tensor \ + instead.') + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deform_groups = deform_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(0) # fake tensor + # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; + # amp won't cast the type of model (float32), but "offset" is cast + # to float16 by nn.Conv2d automatically, leading to the type + # mismatch with input (when it is float32) or weight. + # The flag for whether to use fp16 or amp is the type of "offset", + # we cast weight and input to temporarily support fp16 and amp + # whatever the pytorch version is. 
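+            # For example, under torch.cuda.amp.autocast the offsets produced
+            # by a preceding conv are float16 while the raw input and weight
+            # stay float32; the two casts below put all operands in the
+            # offset's dtype before calling the CUDA kernel.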
+ input = input.type_as(offset) + weight = weight.type_as(input) + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConv2dFunction._output_size(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + ext_module.modulated_deform_conv_forward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + output, + ctx._bufs[1], + kernel_h=weight.size(2), + kernel_w=weight.size(3), + stride_h=ctx.stride[0], + stride_w=ctx.stride[1], + pad_h=ctx.padding[0], + pad_w=ctx.padding[1], + dilation_h=ctx.dilation[0], + dilation_w=ctx.dilation[1], + group=ctx.groups, + deformable_group=ctx.deform_groups, + with_bias=ctx.with_bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + grad_output = grad_output.contiguous() + ext_module.modulated_deform_conv_backward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + ctx._bufs[1], + grad_input, + grad_weight, + grad_bias, + grad_offset, + grad_mask, + grad_output, + kernel_h=weight.size(2), + kernel_w=weight.size(3), + stride_h=ctx.stride[0], + stride_w=ctx.stride[1], + pad_h=ctx.padding[0], + pad_w=ctx.padding[1], + dilation_h=ctx.dilation[0], + dilation_w=ctx.dilation[1], + group=ctx.groups, + deformable_group=ctx.deform_groups, + with_bias=ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _output_size(ctx, input, weight): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = ctx.padding[d] + kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = ctx.stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + 'convolution input is too small (output would be ' + + 'x'.join(map(str, output_size)) + ')') + return output_size + + +modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply + + +class ModulatedDeformConv2d(nn.Module): + + @deprecated_api_warning({'deformable_groups': 'deform_groups'}, + cls_name='ModulatedDeformConv2d') + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1, + bias=True): + super(ModulatedDeformConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deform_groups = deform_groups + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, + *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.init_weights() + + def init_weights(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, x, offset, mask): + return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, + self.dilation, self.groups, + self.deform_groups) + + +@CONV_LAYERS.register_module('DCNv2') +class ModulatedDeformConv2dPack(ModulatedDeformConv2d): + """A ModulatedDeformable Conv Encapsulation that acts as normal Conv + layers. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int): Same as nn.Conv2d, while tuple is not supported. + padding (int): Same as nn.Conv2d, while tuple is not supported. + dilation (int): Same as nn.Conv2d, while tuple is not supported. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if norm_cfg is None, otherwise + False. + """ + + _version = 2 + + def __init__(self, *args, **kwargs): + super(ModulatedDeformConv2dPack, self).__init__(*args, **kwargs) + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deform_groups * 3 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + bias=True) + self.init_weights() + + def init_weights(self): + super(ModulatedDeformConv2dPack, self).init_weights() + if hasattr(self, 'conv_offset'): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + out = self.conv_offset(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, + self.dilation, self.groups, + self.deform_groups) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + + if version is None or version < 2: + # the key is different in early versions + # In version < 2, ModulatedDeformConvPack + # loads previous benchmark models. + if (prefix + 'conv_offset.weight' not in state_dict + and prefix[:-1] + '_offset.weight' in state_dict): + state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( + prefix[:-1] + '_offset.weight') + if (prefix + 'conv_offset.bias' not in state_dict + and prefix[:-1] + '_offset.bias' in state_dict): + state_dict[prefix + + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + + '_offset.bias') + + if version is not None and version > 1: + print_log( + f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to ' + 'version 2.', + logger='root') + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) diff --git a/mmcv/ops/multi_scale_deform_attn.py b/mmcv/ops/multi_scale_deform_attn.py new file mode 100644 index 0000000..527ce70 --- /dev/null +++ b/mmcv/ops/multi_scale_deform_attn.py @@ -0,0 +1,358 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
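+# Usage sketch for the DCNv2 wrapper defined above (illustrative; assumes the
+# compiled `_ext` ops and a CUDA device). ModulatedDeformConv2dPack predicts
+# its own offsets and modulation masks with an internal conv, so it can be
+# dropped in for nn.Conv2d:
+#
+#   >>> import torch
+#   >>> dcn = ModulatedDeformConv2dPack(64, 64, kernel_size=3, padding=1,
+#   ...                                 deform_groups=1).cuda()
+#   >>> x = torch.randn(2, 64, 32, 32).cuda()
+#   >>> y = dcn(x)   # conv_offset emits 3*3*3=27 channels: 18 offsets + 9 masks
+#   >>> y.shape
+#   torch.Size([2, 64, 32, 32])
+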
+import math +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd.function import Function, once_differentiable + +from mmcv import deprecated_api_warning +from mmcv.models.utils.weight_init import constant_init, xavier_init +from mmcv.models.bricks.registry import ATTENTION +from mmcv.models.backbones.base_module import BaseModule +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction(Function): + + @staticmethod + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index,\ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, + sampling_locations, attention_weights): + """CPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). 
+ attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ =\ + sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], + dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( + bs * num_heads, embed_dims, H_, W_) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, + level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * + attention_weights).sum(-1).view(bs, num_heads * embed_dims, + num_queries) + return output.transpose(1, 2).contiguous() + + +@ATTENTION.register_module() +class MultiScaleDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
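+
+    Example (illustrative sketch; the shapes, device and single feature
+        level are assumptions, and the compiled CUDA op is used only when
+        the tensors live on GPU):
+
+        >>> import torch
+        >>> attn = MultiScaleDeformableAttention(embed_dims=256, num_heads=8,
+        ...     num_levels=1, num_points=4, batch_first=False).cuda()
+        >>> spatial_shapes = torch.tensor([[32, 32]], device='cuda')
+        >>> level_start_index = torch.tensor([0], device='cuda')
+        >>> query = torch.randn(100, 2, 256, device='cuda')
+        >>> value = torch.randn(32 * 32, 2, 256, device='cuda')
+        >>> reference_points = torch.rand(2, 100, 1, 2, device='cuda')
+        >>> out = attn(query, value=value, reference_points=reference_points,
+        ...     spatial_shapes=spatial_shapes,
+        ...     level_start_index=level_start_index)
+        >>> out.shape    # (num_query, bs, embed_dims)
+        torch.Size([100, 2, 256])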
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/mmcv/ops/nms.py b/mmcv/ops/nms.py new file mode 100644 index 0000000..ed9835e --- /dev/null +++ b/mmcv/ops/nms.py @@ -0,0 +1,388 @@ +import os + +import numpy as np +import torch + +from mmcv.utils import deprecated_api_warning +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated']) + + +# This function is modified from: https://github.com/pytorch/vision/ +class NMSop(torch.autograd.Function): + + @staticmethod + def forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold, + max_num): + is_filtering_by_score = score_threshold > 0 + if is_filtering_by_score: + valid_mask = scores > score_threshold + bboxes, scores = bboxes[valid_mask], scores[valid_mask] + valid_inds = torch.nonzero( + valid_mask, as_tuple=False).squeeze(dim=1) + + inds = ext_module.nms( + bboxes, scores, iou_threshold=float(iou_threshold), offset=offset) + + if max_num > 0: + 
inds = inds[:max_num] + if is_filtering_by_score: + inds = valid_inds[inds] + return inds + + @staticmethod + def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold, + max_num): + from ..onnx import is_custom_op_loaded + has_custom_op = is_custom_op_loaded() + # TensorRT nms plugin is aligned with original nms in ONNXRuntime + is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT' + if has_custom_op and (not is_trt_backend): + return g.op( + 'mmcv::NonMaxSuppression', + bboxes, + scores, + iou_threshold_f=float(iou_threshold), + offset_i=int(offset)) + else: + from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze + from ..onnx.onnx_utils.symbolic_helper import _size_helper + + boxes = unsqueeze(g, bboxes, 0) + scores = unsqueeze(g, unsqueeze(g, scores, 0), 0) + + if max_num > 0: + max_num = g.op( + 'Constant', + value_t=torch.tensor(max_num, dtype=torch.long)) + else: + dim = g.op('Constant', value_t=torch.tensor(0)) + max_num = _size_helper(g, bboxes, dim) + max_output_per_class = max_num + iou_threshold = g.op( + 'Constant', + value_t=torch.tensor([iou_threshold], dtype=torch.float)) + score_threshold = g.op( + 'Constant', + value_t=torch.tensor([score_threshold], dtype=torch.float)) + nms_out = g.op('NonMaxSuppression', boxes, scores, + max_output_per_class, iou_threshold, + score_threshold) + return squeeze( + g, + select( + g, nms_out, 1, + g.op( + 'Constant', + value_t=torch.tensor([2], dtype=torch.long))), 1) + + +class SoftNMSop(torch.autograd.Function): + + @staticmethod + def forward(ctx, boxes, scores, iou_threshold, sigma, min_score, method, + offset): + dets = boxes.new_empty((boxes.size(0), 5), device='cpu') + inds = ext_module.softnms( + boxes.cpu(), + scores.cpu(), + dets.cpu(), + iou_threshold=float(iou_threshold), + sigma=float(sigma), + min_score=float(min_score), + method=int(method), + offset=int(offset)) + return dets, inds + + @staticmethod + def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method, + offset): + from packaging import version + assert version.parse(torch.__version__) >= version.parse('1.7.0') + nms_out = g.op( + 'mmcv::SoftNonMaxSuppression', + boxes, + scores, + iou_threshold_f=float(iou_threshold), + sigma_f=float(sigma), + min_score_f=float(min_score), + method_i=int(method), + offset_i=int(offset), + outputs=2) + return nms_out + + +@deprecated_api_warning({'iou_thr': 'iou_threshold'}) +def nms(boxes, scores, iou_threshold, offset=0, score_threshold=0, max_num=-1): + """Dispatch to either CPU or GPU NMS implementations. + + The input can be either torch tensor or numpy array. GPU NMS will be used + if the input is gpu tensor, otherwise CPU NMS + will be used. The returned type will always be the same as inputs. + + Arguments: + boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). + scores (torch.Tensor or np.ndarray): scores in shape (N, ). + iou_threshold (float): IoU threshold for NMS. + offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). + score_threshold (float): score threshold for NMS. + max_num (int): maximum number of boxes after NMS. + + Returns: + tuple: kept dets(boxes and scores) and indice, which is always the \ + same data type as the input. 
+ + Example: + >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9], + >>> [49.3, 32.9, 51.0, 35.3], + >>> [49.2, 31.8, 51.0, 35.4], + >>> [35.1, 11.5, 39.1, 15.7], + >>> [35.6, 11.8, 39.3, 14.2], + >>> [35.3, 11.5, 39.9, 14.5], + >>> [35.2, 11.7, 39.7, 15.7]], dtype=np.float32) + >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\ + dtype=np.float32) + >>> iou_threshold = 0.6 + >>> dets, inds = nms(boxes, scores, iou_threshold) + >>> assert len(inds) == len(dets) == 3 + """ + assert isinstance(boxes, (torch.Tensor, np.ndarray)) + assert isinstance(scores, (torch.Tensor, np.ndarray)) + is_numpy = False + if isinstance(boxes, np.ndarray): + is_numpy = True + boxes = torch.from_numpy(boxes) + if isinstance(scores, np.ndarray): + scores = torch.from_numpy(scores) + assert boxes.size(1) == 4 + assert boxes.size(0) == scores.size(0) + assert offset in (0, 1) + + + inds = NMSop.apply(boxes, scores, iou_threshold, offset, + score_threshold, max_num) + dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1) + if is_numpy: + dets = dets.cpu().numpy() + inds = inds.cpu().numpy() + return dets, inds + + +@deprecated_api_warning({'iou_thr': 'iou_threshold'}) +def soft_nms(boxes, + scores, + iou_threshold=0.3, + sigma=0.5, + min_score=1e-3, + method='linear', + offset=0): + """Dispatch to only CPU Soft NMS implementations. + + The input can be either a torch tensor or numpy array. + The returned type will always be the same as inputs. + + Arguments: + boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). + scores (torch.Tensor or np.ndarray): scores in shape (N, ). + iou_threshold (float): IoU threshold for NMS. + sigma (float): hyperparameter for gaussian method + min_score (float): score filter threshold + method (str): either 'linear' or 'gaussian' + offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). + + Returns: + tuple: kept dets(boxes and scores) and indice, which is always the \ + same data type as the input. + + Example: + >>> boxes = np.array([[4., 3., 5., 3.], + >>> [4., 3., 5., 4.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.]], dtype=np.float32) + >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32) + >>> iou_threshold = 0.6 + >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5) + >>> assert len(inds) == len(dets) == 5 + """ + + assert isinstance(boxes, (torch.Tensor, np.ndarray)) + assert isinstance(scores, (torch.Tensor, np.ndarray)) + is_numpy = False + if isinstance(boxes, np.ndarray): + is_numpy = True + boxes = torch.from_numpy(boxes) + if isinstance(scores, np.ndarray): + scores = torch.from_numpy(scores) + assert boxes.size(1) == 4 + assert boxes.size(0) == scores.size(0) + assert offset in (0, 1) + method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2} + assert method in method_dict.keys() + + + dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(), + float(iou_threshold), float(sigma), + float(min_score), method_dict[method], + int(offset)) + + dets = dets[:inds.size(0)] + + if is_numpy: + dets = dets.cpu().numpy() + inds = inds.cpu().numpy() + return dets, inds + else: + return dets.to(device=boxes.device), inds.to(device=boxes.device) + + +def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False): + """Performs non-maximum suppression in a batched fashion. + + Modified from https://github.com/pytorch/vision/blob + /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. 
+ In order to perform NMS independently per class, we add an offset to all + the boxes. The offset is dependent only on the class idx, and is large + enough so that boxes from different classes do not overlap. + + Arguments: + boxes (torch.Tensor): boxes in shape (N, 4). + scores (torch.Tensor): scores in shape (N, ). + idxs (torch.Tensor): each index value correspond to a bbox cluster, + and NMS will not be applied between elements of different idxs, + shape (N, ). + nms_cfg (dict): specify nms type and other parameters like iou_thr. + Possible keys includes the following. + + - iou_thr (float): IoU threshold used for NMS. + - split_thr (float): threshold number of boxes. In some cases the + number of boxes is large (e.g., 200k). To avoid OOM during + training, the users could set `split_thr` to a small value. + If the number of boxes is greater than the threshold, it will + perform NMS on each group of boxes separately and sequentially. + Defaults to 10000. + class_agnostic (bool): if true, nms is class agnostic, + i.e. IoU thresholding happens over all boxes, + regardless of the predicted class. + + Returns: + tuple: kept dets and indice. + """ + nms_cfg_ = nms_cfg.copy() + class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) + if class_agnostic: + boxes_for_nms = boxes + else: + max_coordinate = boxes.max() + offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) + boxes_for_nms = boxes + offsets[:, None] + + nms_type = nms_cfg_.pop('type', 'nms') + nms_op = eval(nms_type) + + split_thr = nms_cfg_.pop('split_thr', 10000) + # Won't split to multiple nms nodes when exporting to onnx + if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export(): + dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_) + boxes = boxes[keep] + # -1 indexing works abnormal in TensorRT + # This assumes `dets` has 5 dimensions where + # the last dimension is score. + # TODO: more elegant way to handle the dimension issue. + # Some type of nms would reweight the score, such as SoftNMS + scores = dets[:, 4] + else: + max_num = nms_cfg_.pop('max_num', -1) + total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) + # Some type of nms would reweight the score, such as SoftNMS + scores_after_nms = scores.new_zeros(scores.size()) + for id in torch.unique(idxs): + mask = (idxs == id).nonzero(as_tuple=False).view(-1) + dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_) + total_mask[mask[keep]] = True + scores_after_nms[mask[keep]] = dets[:, -1] + keep = total_mask.nonzero(as_tuple=False).view(-1) + + scores, inds = scores_after_nms[keep].sort(descending=True) + keep = keep[inds] + boxes = boxes[keep] + + if max_num > 0: + keep = keep[:max_num] + boxes = boxes[:max_num] + scores = scores[:max_num] + + return torch.cat([boxes, scores[:, None]], -1), keep + + +def nms_match(dets, iou_threshold): + """Matched dets into different groups by NMS. + + NMS match is Similar to NMS but when a bbox is suppressed, nms match will + record the indice of suppressed bbox and form a group with the indice of + kept bbox. In each group, indice is sorted as score order. + + Arguments: + dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5). + iou_thr (float): IoU thresh for NMS. + + Returns: + List[torch.Tensor | np.ndarray]: The outer list corresponds different + matched group, the inner Tensor corresponds the indices for a group + in score order. 
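+
+    Example (illustrative; boxes picked by hand, so the exact grouping
+        depends on the compiled op):
+
+        >>> import numpy as np
+        >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9],
+        ...                  [49.3, 32.9, 51.0, 35.3, 0.6],
+        ...                  [35.3, 11.5, 39.9, 14.5, 0.4]], dtype=np.float32)
+        >>> groups = nms_match(dets, 0.6)
+        >>> # the two strongly overlapping boxes are expected to share a
+        >>> # group led by the higher-scoring one; the far-away box forms
+        >>> # its own group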
+ """ + if dets.shape[0] == 0: + matched = [] + else: + assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \ + f'but get {dets.shape}' + if isinstance(dets, torch.Tensor): + dets_t = dets.detach().cpu() + else: + dets_t = torch.from_numpy(dets) + indata_list = [dets_t] + indata_dict = {'iou_threshold': float(iou_threshold)} + matched = ext_module.nms_match(*indata_list, **indata_dict) + + if isinstance(dets, torch.Tensor): + return [dets.new_tensor(m, dtype=torch.long) for m in matched] + else: + return [np.array(m, dtype=np.int) for m in matched] + + +def nms_rotated(dets, scores, iou_threshold, labels=None): + """Performs non-maximum suppression (NMS) on the rotated boxes according to + their intersection-over-union (IoU). + + Rotated NMS iteratively removes lower scoring rotated boxes which have an + IoU greater than iou_threshold with another (higher scoring) rotated box. + + Args: + boxes (Tensor): Rotated boxes in shape (N, 5). They are expected to \ + be in (x_ctr, y_ctr, width, height, angle_radian) format. + scores (Tensor): scores in shape (N, ). + iou_threshold (float): IoU thresh for NMS. + labels (Tensor): boxes' label in shape (N,). + + Returns: + tuple: kept dets(boxes and scores) and indice, which is always the \ + same data type as the input. + """ + if dets.shape[0] == 0: + return dets, None + multi_label = labels is not None + if multi_label: + dets_wl = torch.cat((dets, labels.unsqueeze(1)), 1) + else: + dets_wl = dets + _, order = scores.sort(0, descending=True) + dets_sorted = dets_wl.index_select(0, order) + + keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted, + iou_threshold, multi_label) + dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)), + dim=1) + return dets, keep_inds diff --git a/mmcv/ops/roi_align.py b/mmcv/ops/roi_align.py new file mode 100644 index 0000000..0755aef --- /dev/null +++ b/mmcv/ops/roi_align.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import deprecated_api_warning, ext_loader + +ext_module = ext_loader.load_ext('_ext', + ['roi_align_forward', 'roi_align_backward']) + + +class RoIAlignFunction(Function): + + @staticmethod + def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio, + pool_mode, aligned): + from ..onnx import is_custom_op_loaded + has_custom_op = is_custom_op_loaded() + if has_custom_op: + return g.op( + 'mmcv::MMCVRoiAlign', + input, + rois, + output_height_i=output_size[0], + output_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_i=sampling_ratio, + mode_s=pool_mode, + aligned_i=aligned) + else: + from torch.onnx.symbolic_opset9 import sub, squeeze + from torch.onnx.symbolic_helper import _slice_helper + from torch.onnx import TensorProtoDataType + # batch_indices = rois[:, 0].long() + batch_indices = _slice_helper( + g, rois, axes=[1], starts=[0], ends=[1]) + batch_indices = squeeze(g, batch_indices, 1) + batch_indices = g.op( + 'Cast', batch_indices, to_i=TensorProtoDataType.INT64) + # rois = rois[:, 1:] + rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5]) + if aligned: + # rois -= 0.5/spatial_scale + aligned_offset = g.op( + 'Constant', + value_t=torch.tensor([0.5 / spatial_scale], + dtype=torch.float32)) + rois = sub(g, rois, aligned_offset) + # roi align + return g.op( + 'RoiAlign', + input, + rois, + batch_indices, + output_height_i=output_size[0], + output_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_i=max(0, sampling_ratio), + mode_s=pool_mode) + + @staticmethod + def forward(ctx, + input, + rois, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + pool_mode='avg', + aligned=True): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + assert pool_mode in ('max', 'avg') + ctx.pool_mode = 0 if pool_mode == 'max' else 1 + ctx.aligned = aligned + ctx.input_shape = input.size() + + assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' + + output_shape = (rois.size(0), input.size(1), ctx.output_size[0], + ctx.output_size[1]) + output = input.new_zeros(output_shape) + if ctx.pool_mode == 0: + argmax_y = input.new_zeros(output_shape) + argmax_x = input.new_zeros(output_shape) + else: + argmax_y = input.new_zeros(0) + argmax_x = input.new_zeros(0) + + ext_module.roi_align_forward( + input, + rois, + output, + argmax_y, + argmax_x, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + pool_mode=ctx.pool_mode, + aligned=ctx.aligned) + + ctx.save_for_backward(rois, argmax_y, argmax_x) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + rois, argmax_y, argmax_x = ctx.saved_tensors + grad_input = grad_output.new_zeros(ctx.input_shape) + # complex head architecture may cause grad_output uncontiguous. 
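+        # For example, a permuted or sliced gradient view is non-contiguous;
+        # the CUDA backward kernel indexes raw memory, so enforce contiguity
+        # before calling it.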
+ grad_output = grad_output.contiguous() + ext_module.roi_align_backward( + grad_output, + rois, + argmax_y, + argmax_x, + grad_input, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + pool_mode=ctx.pool_mode, + aligned=ctx.aligned) + return grad_input, None, None, None, None, None, None + + +roi_align = RoIAlignFunction.apply + + +class RoIAlign(nn.Module): + """RoI align pooling layer. + + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + pool_mode (str, 'avg' or 'max'): pooling mode in each bin. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + use_torchvision (bool): whether to use roi_align from torchvision. + + Note: + The implementation of RoIAlign when aligned=True is modified from + https://github.com/facebookresearch/detectron2/ + + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel + indices (in our pixel model) are computed by floor(c - 0.5) and + ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete + indices [0] and [1] (which are sampled from the underlying signal + at continuous coordinates 0.5 and 1.5). But the original roi_align + (aligned=False) does not subtract the 0.5 when computing + neighboring pixel indices and therefore it uses pixels with a + slightly incorrect alignment (relative to our pixel model) when + performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; + + The difference does not make a difference to the model's + performance if ROIAlign is used together with conv layers. + """ + + @deprecated_api_warning( + { + 'out_size': 'output_size', + 'sample_num': 'sampling_ratio' + }, + cls_name='RoIAlign') + def __init__(self, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + use_torchvision=False): + super(RoIAlign, self).__init__() + + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + self.sampling_ratio = int(sampling_ratio) + self.pool_mode = pool_mode + self.aligned = aligned + self.use_torchvision = use_torchvision + + def forward(self, input, rois): + """ + Args: + input: NCHW images + rois: Bx5 boxes. First column is the index into N.\ + The other 4 columns are xyxy. + """ + if self.use_torchvision: + from torchvision.ops import roi_align as tv_roi_align + if 'aligned' in tv_roi_align.__code__.co_varnames: + return tv_roi_align(input, rois, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.aligned) + else: + if self.aligned: + rois -= rois.new_tensor([0.] 
+ + [0.5 / self.spatial_scale] * 4) + return tv_roi_align(input, rois, self.output_size, + self.spatial_scale, self.sampling_ratio) + else: + return roi_align(input, rois, self.output_size, self.spatial_scale, + self.sampling_ratio, self.pool_mode, self.aligned) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(output_size={self.output_size}, ' + s += f'spatial_scale={self.spatial_scale}, ' + s += f'sampling_ratio={self.sampling_ratio}, ' + s += f'pool_mode={self.pool_mode}, ' + s += f'aligned={self.aligned}, ' + s += f'use_torchvision={self.use_torchvision})' + return s diff --git a/mmcv/ops/roiaware_pool3d/__init__.py b/mmcv/ops/roiaware_pool3d/__init__.py new file mode 100644 index 0000000..aba9e18 --- /dev/null +++ b/mmcv/ops/roiaware_pool3d/__init__.py @@ -0,0 +1,8 @@ +from .points_in_boxes import (points_in_boxes_batch, points_in_boxes_cpu, + points_in_boxes_gpu) +from .roiaware_pool3d import RoIAwarePool3d + +__all__ = [ + 'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu', + 'points_in_boxes_batch' +] diff --git a/mmcv/ops/roiaware_pool3d/points_in_boxes.py b/mmcv/ops/roiaware_pool3d/points_in_boxes.py new file mode 100644 index 0000000..f576fed --- /dev/null +++ b/mmcv/ops/roiaware_pool3d/points_in_boxes.py @@ -0,0 +1,123 @@ +import torch + +from . import roiaware_pool3d_ext + + +def points_in_boxes_gpu(points, boxes): + """Find points that are in boxes (CUDA) + + Args: + points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate + boxes (torch.Tensor): [B, T, 7], + num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate, + (x, y, z) is the bottom center + + Returns: + box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 + """ + assert boxes.shape[0] == points.shape[0], \ + f'Points and boxes should have the same batch size, ' \ + f'got {boxes.shape[0]} and {boxes.shape[0]}' + assert boxes.shape[2] == 7, \ + f'boxes dimension should be 7, ' \ + f'got unexpected shape {boxes.shape[2]}' + assert points.shape[2] == 3, \ + f'points dimension should be 3, ' \ + f'got unexpected shape {points.shape[2]}' + batch_size, num_points, _ = points.shape + + box_idxs_of_pts = points.new_zeros((batch_size, num_points), + dtype=torch.int).fill_(-1) + + # If manually put the tensor 'points' or 'boxes' on a device + # which is not the current device, some temporary variables + # will be created on the current device in the cuda op, + # and the output will be incorrect. + # Therefore, we force the current device to be the same + # as the device of the tensors if it was not. + # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305 + # for the incorrect output before the fix. + points_device = points.get_device() + assert points_device == boxes.get_device(), \ + 'Points and boxes should be put on the same device' + if torch.cuda.current_device() != points_device: + torch.cuda.set_device(points_device) + + roiaware_pool3d_ext.points_in_boxes_gpu(boxes.contiguous(), + points.contiguous(), + box_idxs_of_pts) + + return box_idxs_of_pts + + +def points_in_boxes_cpu(points, boxes): + """Find points that are in boxes (CPU) + + Note: + Currently, the output of this function is different from that of + points_in_boxes_gpu. 
+
+    Args:
+        points (torch.Tensor): [npoints, 3]
+        boxes (torch.Tensor): [N, 7], in LiDAR coordinate,
+            (x, y, z) is the bottom center
+
+    Returns:
+        point_indices (torch.Tensor): (N, npoints)
+    """
+    # TODO: Refactor this function as a CPU version of points_in_boxes_gpu
+    assert boxes.shape[1] == 7, \
+        f'boxes dimension should be 7, ' \
+        f'got unexpected shape {boxes.shape[1]}'
+    assert points.shape[1] == 3, \
+        f'points dimension should be 3, ' \
+        f'got unexpected shape {points.shape[1]}'
+
+    point_indices = points.new_zeros((boxes.shape[0], points.shape[0]),
+                                     dtype=torch.int)
+    roiaware_pool3d_ext.points_in_boxes_cpu(boxes.float().contiguous(),
+                                            points.float().contiguous(),
+                                            point_indices)
+
+    return point_indices
+
+
+def points_in_boxes_batch(points, boxes):
+    """Find points that are in boxes (CUDA)
+
+    Args:
+        points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR coordinate
+        boxes (torch.Tensor): [B, T, 7],
+            num_valid_boxes <= T, [x, y, z, w, l, h, ry] in LiDAR coordinate,
+            (x, y, z) is the bottom center.
+
+    Returns:
+        box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0
+    """
+    assert boxes.shape[0] == points.shape[0], \
+        f'Points and boxes should have the same batch size, ' \
+        f'got {boxes.shape[0]} and {points.shape[0]}'
+    assert boxes.shape[2] == 7, \
+        f'boxes dimension should be 7, ' \
+        f'got unexpected shape {boxes.shape[2]}'
+    assert points.shape[2] == 3, \
+        f'points dimension should be 3, ' \
+        f'got unexpected shape {points.shape[2]}'
+    batch_size, num_points, _ = points.shape
+    num_boxes = boxes.shape[1]
+
+    box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes),
+                                       dtype=torch.int).fill_(0)
+
+    # Same device handling as in points_in_boxes_gpu above.
+    points_device = points.get_device()
+    assert points_device == boxes.get_device(), \
+        'Points and boxes should be put on the same device'
+    if torch.cuda.current_device() != points_device:
+        torch.cuda.set_device(points_device)
+
+    roiaware_pool3d_ext.points_in_boxes_batch(boxes.contiguous(),
+                                              points.contiguous(),
+                                              box_idxs_of_pts)
+
+    return box_idxs_of_pts
diff --git a/mmcv/ops/roiaware_pool3d/roiaware_pool3d.py b/mmcv/ops/roiaware_pool3d/roiaware_pool3d.py
new file mode 100644
index 0000000..536c9a1
--- /dev/null
+++ b/mmcv/ops/roiaware_pool3d/roiaware_pool3d.py
@@ -0,0 +1,110 @@
+from mmcv.utils import is_tuple_of
+import torch
+from torch import nn as nn
+from torch.autograd import Function
+
+from . import roiaware_pool3d_ext
+
+
+class RoIAwarePool3d(nn.Module):
+
+    def __init__(self, out_size, max_pts_per_voxel=128, mode='max'):
+        """RoIAwarePool3d module.
+
+        Args:
+            out_size (int or tuple): n or [n1, n2, n3]
+            max_pts_per_voxel (int): m
+            mode (str): 'max' or 'avg'
+        """
+        super().__init__()
+        self.out_size = out_size
+        self.max_pts_per_voxel = max_pts_per_voxel
+        assert mode in ['max', 'avg']
+        pool_method_map = {'max': 0, 'avg': 1}
+        self.mode = pool_method_map[mode]
+
+    def forward(self, rois, pts, pts_feature):
+        """RoIAwarePool3d module forward.
+
+        Args:
+            rois (torch.Tensor): [N, 7], in LiDAR coordinate,
+                (x, y, z) is the bottom center of rois
+            pts (torch.Tensor): [npoints, 3]
+            pts_feature (torch.Tensor): [npoints, C]
+
+        Returns:
+            pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
+        """
+
+        return RoIAwarePool3dFunction.apply(rois, pts, pts_feature,
+                                            self.out_size,
+                                            self.max_pts_per_voxel, self.mode)
+
+
+class RoIAwarePool3dFunction(Function):
+
+    @staticmethod
+    def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel,
+                mode):
+        """RoIAwarePool3d function forward.
+
+        Args:
+            rois (torch.Tensor): [N, 7], in LiDAR coordinate,
+                (x, y, z) is the bottom center of rois
+            pts (torch.Tensor): [npoints, 3]
+            pts_feature (torch.Tensor): [npoints, C]
+            out_size (int or tuple): n or [n1, n2, n3]
+            max_pts_per_voxel (int): m
+            mode (int): 0 (max pool) or 1 (average pool)
+
+        Returns:
+            pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C]
+        """
+
+        if isinstance(out_size, int):
+            out_x = out_y = out_z = out_size
+        else:
+            assert len(out_size) == 3
+            assert is_tuple_of(out_size, int)
+            out_x, out_y, out_z = out_size
+
+        num_rois = rois.shape[0]
+        num_channels = pts_feature.shape[-1]
+        num_pts = pts.shape[0]
+
+        pooled_features = pts_feature.new_zeros(
+            (num_rois, out_x, out_y, out_z, num_channels))
+        argmax = pts_feature.new_zeros(
+            (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int)
+        pts_idx_of_voxels = pts_feature.new_zeros(
+            (num_rois, out_x, out_y, out_z, max_pts_per_voxel),
+            dtype=torch.int)
+
+        roiaware_pool3d_ext.forward(rois, pts, pts_feature, argmax,
+                                    pts_idx_of_voxels, pooled_features, mode)
+
+        ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode,
+                                            num_pts, num_channels)
+        return pooled_features
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        """RoIAwarePool3d function backward.
+
+        Args:
+            grad_out (torch.Tensor): [N, out_x, out_y, out_z, C]
+        Returns:
+            grad_in (torch.Tensor): [npoints, C]
+        """
+        ret = ctx.roiaware_pool3d_for_backward
+        pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret
+
+        grad_in = grad_out.new_zeros((num_pts, num_channels))
+        roiaware_pool3d_ext.backward(pts_idx_of_voxels, argmax,
+                                     grad_out.contiguous(), grad_in, mode)
+
+        return None, None, grad_in, None, None, None
+
+
+if __name__ == '__main__':
+    pass
diff --git a/mmcv/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp b/mmcv/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp
new file mode 100644
index 0000000..a26ffb6
--- /dev/null
+++ b/mmcv/ops/roiaware_pool3d/src/points_in_boxes_cpu.cpp
@@ -0,0 +1,69 @@
+// Modified from
+// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu
+// Written by Shaoshuai Shi
+// All Rights Reserved 2019.
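Editor's note (not part of the patch): the Python wrappers added above (`RoIAwarePool3d`, `points_in_boxes_gpu` and friends, exported by `mmcv/ops/roiaware_pool3d/__init__.py`) are typically driven as in the sketch below. This is a minimal illustration only; it assumes the `roiaware_pool3d_ext` extension has been compiled and a CUDA device is available, and all shapes are invented.

```python
# Minimal sketch, assuming the compiled roiaware_pool3d_ext extension and CUDA.
import torch
from mmcv.ops.roiaware_pool3d import RoIAwarePool3d, points_in_boxes_gpu

pts = torch.rand(1, 1000, 3).cuda() * 20                 # (B, M, 3) LiDAR points
boxes = torch.tensor([[[10., 10., 0., 2., 4., 1.5, 0.3]]]).cuda()  # (B, T, 7), bottom-center boxes

idx = points_in_boxes_gpu(pts, boxes)                    # (B, M), -1 means background

pool = RoIAwarePool3d(out_size=4, max_pts_per_voxel=64, mode='max')
feats = torch.rand(1000, 16).cuda()                      # per-point features (npoints, C)
pooled = pool(boxes[0], pts[0], feats)                   # (N, 4, 4, 4, 16) pooled voxel features
```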
+ +#include +#include +#include +#include +#include + +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") +// #define DEBUG + +inline void lidar_to_local_coords_cpu(float shift_x, float shift_y, float rz, + float &local_x, float &local_y) { + // should rotate pi/2 + alpha to translate LiDAR to local + float rot_angle = rz + M_PI / 2; + float cosa = cos(rot_angle), sina = sin(rot_angle); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +inline int check_pt_in_box3d_cpu(const float *pt, const float *box3d, + float &local_x, float &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the + // bottom center + float x = pt[0], y = pt[1], z = pt[2]; + float cx = box3d[0], cy = box3d[1], cz = box3d[2]; + float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; + cz += h / 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > h / 2.0) return 0; + lidar_to_local_coords_cpu(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & + (local_y > -w / 2.0) & (local_y < w / 2.0); + return in_flag; +} + +int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor pts_indices_tensor) { + // params boxes: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is the + // bottom center, each box DO NOT overlaps params pts: (npoints, 3) [x, y, z] + // in LiDAR coordinate params pts_indices: (N, npoints) + + CHECK_CONTIGUOUS(boxes_tensor); + CHECK_CONTIGUOUS(pts_tensor); + CHECK_CONTIGUOUS(pts_indices_tensor); + + int boxes_num = boxes_tensor.size(0); + int pts_num = pts_tensor.size(0); + + const float *boxes = boxes_tensor.data_ptr(); + const float *pts = pts_tensor.data_ptr(); + int *pts_indices = pts_indices_tensor.data_ptr(); + + float local_x = 0, local_y = 0; + for (int i = 0; i < boxes_num; i++) { + for (int j = 0; j < pts_num; j++) { + int cur_in_flag = + check_pt_in_box3d_cpu(pts + j * 3, boxes + i * 7, local_x, local_y); + pts_indices[i * pts_num + j] = cur_in_flag; + } + } + + return 1; +} diff --git a/mmcv/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu b/mmcv/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu new file mode 100644 index 0000000..896b316 --- /dev/null +++ b/mmcv/ops/roiaware_pool3d/src/points_in_boxes_cuda.cu @@ -0,0 +1,203 @@ +// Modified from +// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu +// Written by Shaoshuai Shi +// All Rights Reserved 2019. 
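Editor's note (not part of the patch): the box-membership test in `check_pt_in_box3d_cpu` above shifts the point into the box frame, rotates the xy offset by `rz + pi/2`, and compares against the half extents. A rough NumPy mirror of that logic, useful for sanity-checking the CPU op, is sketched below; it assumes the same `(x, y, z, w, l, h, rz)` bottom-center convention.

```python
# NumPy mirror of check_pt_in_box3d_cpu, for sanity checks only.
import numpy as np


def point_in_box3d(pt, box):
    """pt = (x, y, z); box = (cx, cy, cz, w, l, h, rz), cz at the bottom face."""
    x, y, z = pt
    cx, cy, cz, w, l, h, rz = box
    cz += h / 2.0                        # move the reference to the box centre
    if abs(z - cz) > h / 2.0:
        return False
    rot = rz + np.pi / 2                 # same rotation as lidar_to_local_coords_cpu
    dx, dy = x - cx, y - cy
    local_x = dx * np.cos(rot) - dy * np.sin(rot)
    local_y = dx * np.sin(rot) + dy * np.cos(rot)
    return abs(local_x) < l / 2.0 and abs(local_y) < w / 2.0
```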
+ +#include +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 256 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) +// #define DEBUG + +__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, + float rz, float &local_x, + float &local_y) { + // should rotate pi/2 + alpha to translate LiDAR to local + float rot_angle = rz + M_PI / 2; + float cosa = cos(rot_angle), sina = sin(rot_angle); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, + float &local_x, float &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the + // bottom center + float x = pt[0], y = pt[1], z = pt[2]; + float cx = box3d[0], cy = box3d[1], cz = box3d[2]; + float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; + cz += h / 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > h / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & + (local_y > -w / 2.0) & (local_y < w / 2.0); + return in_flag; +} + +__global__ void points_in_boxes_kernel(int batch_size, int boxes_num, + int pts_num, const float *boxes, + const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is + // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, + // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default + // -1 + + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= batch_size || pt_idx >= pts_num) return; + + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num + pt_idx; + + float local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; k++) { + cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[0] = k; + break; + } + } +} + +__global__ void points_in_boxes_batch_kernel(int batch_size, int boxes_num, + int pts_num, const float *boxes, + const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is + // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, + // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default + // -1 + + int bs_idx = blockIdx.y; + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (bs_idx >= batch_size || pt_idx >= pts_num) return; + + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; + + float local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; k++) { + cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[k] = 1; + } + cur_in_flag = 0; + } +} + +void points_in_boxes_launcher(int batch_size, int boxes_num, int pts_num, + const float *boxes, const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] 
in LiDAR coordinate, z is + // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, + // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default + // -1 + cudaError_t err; + + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); + dim3 threads(THREADS_PER_BLOCK); + points_in_boxes_kernel<<>>(batch_size, boxes_num, pts_num, + boxes, pts, box_idx_of_points); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } + +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +void points_in_boxes_batch_launcher(int batch_size, int boxes_num, int pts_num, + const float *boxes, const float *pts, + int *box_idx_of_points) { + // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is + // the bottom center, each box params pts: (B, npoints, 3) [x, y, z] in + // LiDAR coordinate params boxes_idx_of_points: (B, npoints), default -1 + cudaError_t err; + + dim3 blocks(DIVUP(pts_num, THREADS_PER_BLOCK), batch_size); + dim3 threads(THREADS_PER_BLOCK); + points_in_boxes_batch_kernel<<>>( + batch_size, boxes_num, pts_num, boxes, pts, box_idx_of_points); + + err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } + +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor) { + // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is + // the bottom center, each box DO NOT overlaps params pts: (B, npoints, 3) [x, + // y, z] in LiDAR coordinate params boxes_idx_of_points: (B, npoints), default + // -1 + + CHECK_INPUT(boxes_tensor); + CHECK_INPUT(pts_tensor); + CHECK_INPUT(box_idx_of_points_tensor); + + int batch_size = boxes_tensor.size(0); + int boxes_num = boxes_tensor.size(1); + int pts_num = pts_tensor.size(1); + + const float *boxes = boxes_tensor.data_ptr(); + const float *pts = pts_tensor.data_ptr(); + int *box_idx_of_points = box_idx_of_points_tensor.data_ptr(); + + points_in_boxes_launcher(batch_size, boxes_num, pts_num, boxes, pts, + box_idx_of_points); + + return 1; +} + +int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor) { + // params boxes: (B, N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate, z is + // the bottom center. 
params pts: (B, npoints, 3) [x, y, z] in LiDAR + // coordinate params boxes_idx_of_points: (B, npoints), default -1 + + CHECK_INPUT(boxes_tensor); + CHECK_INPUT(pts_tensor); + CHECK_INPUT(box_idx_of_points_tensor); + + int batch_size = boxes_tensor.size(0); + int boxes_num = boxes_tensor.size(1); + int pts_num = pts_tensor.size(1); + + const float *boxes = boxes_tensor.data_ptr(); + const float *pts = pts_tensor.data_ptr(); + int *box_idx_of_points = box_idx_of_points_tensor.data_ptr(); + + points_in_boxes_batch_launcher(batch_size, boxes_num, pts_num, boxes, pts, + box_idx_of_points); + + return 1; +} diff --git a/mmcv/ops/roiaware_pool3d/src/roiaware_pool3d.cpp b/mmcv/ops/roiaware_pool3d/src/roiaware_pool3d.cpp new file mode 100644 index 0000000..cd743b1 --- /dev/null +++ b/mmcv/ops/roiaware_pool3d/src/roiaware_pool3d.cpp @@ -0,0 +1,136 @@ +// Modified from +// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu +// Written by Shaoshuai Shi +// All Rights Reserved 2019. + +#include +#include +#include + +#define CHECK_CUDA(x) \ + TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) \ + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const float *rois, const float *pts, + const float *pts_feature, int *argmax, + int *pts_idx_of_voxels, float *pooled_features, + int pool_method); + +void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, + int out_z, int channels, + int max_pts_each_voxel, + const int *pts_idx_of_voxels, + const int *argmax, const float *grad_out, + float *grad_in, int pool_method); + +int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, + at::Tensor argmax, at::Tensor pts_idx_of_voxels, + at::Tensor pooled_features, int pool_method); + +int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, + at::Tensor argmax, at::Tensor grad_out, + at::Tensor grad_in, int pool_method); + +int points_in_boxes_cpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor pts_indices_tensor); + +int points_in_boxes_gpu(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor); + +int points_in_boxes_batch(at::Tensor boxes_tensor, at::Tensor pts_tensor, + at::Tensor box_idx_of_points_tensor); + +int roiaware_pool3d_gpu(at::Tensor rois, at::Tensor pts, at::Tensor pts_feature, + at::Tensor argmax, at::Tensor pts_idx_of_voxels, + at::Tensor pooled_features, int pool_method) { + // params rois: (N, 7) [x, y, z, w, l, h, ry] in LiDAR coordinate + // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate + // params pts_feature: (npoints, C) + // params argmax: (N, out_x, out_y, out_z, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params pooled_features: (N, out_x, out_y, out_z, C) + // params pool_method: 0: max_pool 1: avg_pool + + CHECK_INPUT(rois); + CHECK_INPUT(pts); + CHECK_INPUT(pts_feature); + CHECK_INPUT(argmax); + CHECK_INPUT(pts_idx_of_voxels); + CHECK_INPUT(pooled_features); + + int boxes_num = rois.size(0); + int pts_num = pts.size(0); + int channels = pts_feature.size(1); + int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter + int out_x = pts_idx_of_voxels.size(1); + int out_y = pts_idx_of_voxels.size(2); + int out_z = 
pts_idx_of_voxels.size(3); + assert((out_x < 256) && (out_y < 256) && + (out_z < 256)); // we encode index with 8bit + + const float *rois_data = rois.data_ptr(); + const float *pts_data = pts.data_ptr(); + const float *pts_feature_data = pts_feature.data_ptr(); + int *argmax_data = argmax.data_ptr(); + int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr(); + float *pooled_features_data = pooled_features.data_ptr(); + + roiaware_pool3d_launcher( + boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, + rois_data, pts_data, pts_feature_data, argmax_data, + pts_idx_of_voxels_data, pooled_features_data, pool_method); + + return 1; +} + +int roiaware_pool3d_gpu_backward(at::Tensor pts_idx_of_voxels, + at::Tensor argmax, at::Tensor grad_out, + at::Tensor grad_in, int pool_method) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + // params pool_method: 0: max_pool 1: avg_pool + + CHECK_INPUT(pts_idx_of_voxels); + CHECK_INPUT(argmax); + CHECK_INPUT(grad_out); + CHECK_INPUT(grad_in); + + int boxes_num = pts_idx_of_voxels.size(0); + int out_x = pts_idx_of_voxels.size(1); + int out_y = pts_idx_of_voxels.size(2); + int out_z = pts_idx_of_voxels.size(3); + int max_pts_each_voxel = pts_idx_of_voxels.size(4); // index 0 is the counter + int channels = grad_out.size(4); + + const int *pts_idx_of_voxels_data = pts_idx_of_voxels.data_ptr(); + const int *argmax_data = argmax.data_ptr(); + const float *grad_out_data = grad_out.data_ptr(); + float *grad_in_data = grad_in.data_ptr(); + + roiaware_pool3d_backward_launcher(boxes_num, out_x, out_y, out_z, channels, + max_pts_each_voxel, pts_idx_of_voxels_data, + argmax_data, grad_out_data, grad_in_data, + pool_method); + + return 1; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &roiaware_pool3d_gpu, "roiaware pool3d forward (CUDA)"); + m.def("backward", &roiaware_pool3d_gpu_backward, + "roiaware pool3d backward (CUDA)"); + m.def("points_in_boxes_gpu", &points_in_boxes_gpu, + "points_in_boxes_gpu forward (CUDA)"); + m.def("points_in_boxes_batch", &points_in_boxes_batch, + "points_in_boxes_batch forward (CUDA)"); + m.def("points_in_boxes_cpu", &points_in_boxes_cpu, + "points_in_boxes_cpu forward (CPU)"); +} diff --git a/mmcv/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu b/mmcv/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu new file mode 100644 index 0000000..312b35d --- /dev/null +++ b/mmcv/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu @@ -0,0 +1,366 @@ +// Modified from +// https://github.com/sshaoshuai/PCDet/blob/master/pcdet/ops/roiaware_pool3d/src/roiaware_pool3d_kernel.cu +// Written by Shaoshuai Shi +// All Rights Reserved 2019. 
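Editor's note (not part of the patch): the pooling kernels in `roiaware_pool3d_kernel.cu` pack each in-box point's voxel coordinate into a single int as `(x_idx << 16) + (y_idx << 8) + z_idx`, which is why `roiaware_pool3d.cpp` asserts `out_x`, `out_y`, `out_z < 256` ("we encode index with 8bit"). A tiny Python illustration of that packing:

```python
# Illustration of the 8-bit-per-axis voxel index packing used by the kernels.
def encode_voxel_idx(x_idx, y_idx, z_idx):
    assert max(x_idx, y_idx, z_idx) < 256   # mirrors the out_* < 256 assertion
    return (x_idx << 16) + (y_idx << 8) + z_idx


def decode_voxel_idx(enc):
    return (enc >> 16) & 0xFF, (enc >> 8) & 0xFF, enc & 0xFF


assert decode_voxel_idx(encode_voxel_idx(3, 7, 11)) == (3, 7, 11)
```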
+ +#include +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 256 +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +// #define DEBUG + +__device__ inline void lidar_to_local_coords(float shift_x, float shift_y, + float rz, float &local_x, + float &local_y) { + // should rotate pi/2 + alpha to translate LiDAR to local + float rot_angle = rz + M_PI / 2; + float cosa = cos(rot_angle), sina = sin(rot_angle); + local_x = shift_x * cosa + shift_y * (-sina); + local_y = shift_x * sina + shift_y * cosa; +} + +__device__ inline int check_pt_in_box3d(const float *pt, const float *box3d, + float &local_x, float &local_y) { + // param pt: (x, y, z) + // param box3d: (cx, cy, cz, w, l, h, rz) in LiDAR coordinate, cz in the + // bottom center + float x = pt[0], y = pt[1], z = pt[2]; + float cx = box3d[0], cy = box3d[1], cz = box3d[2]; + float w = box3d[3], l = box3d[4], h = box3d[5], rz = box3d[6]; + cz += h / 2.0; // shift to the center since cz in box3d is the bottom center + + if (fabsf(z - cz) > h / 2.0) return 0; + lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); + float in_flag = (local_x > -l / 2.0) & (local_x < l / 2.0) & + (local_y > -w / 2.0) & (local_y < w / 2.0); + return in_flag; +} + +__global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, + int out_x, int out_y, int out_z, + const float *rois, const float *pts, + int *pts_mask) { + // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate + // params pts: (npoints, 3) [x, y, z] + // params pts_mask: (N, npoints): -1 means point doesnot in this box, + // otherwise: encode (x_idxs, y_idxs, z_idxs) by binary bit + int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + int box_idx = blockIdx.y; + if (pt_idx >= pts_num || box_idx >= boxes_num) return; + + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; + + float local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); + + pts_mask[0] = -1; + if (cur_in_flag > 0) { + float local_z = pts[2] - rois[2]; + float w = rois[3], l = rois[4], h = rois[5]; + + float x_res = l / out_x; + float y_res = w / out_y; + float z_res = h / out_z; + + unsigned int x_idx = int((local_x + l / 2) / x_res); + unsigned int y_idx = int((local_y + w / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); + + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); + + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; +#ifdef DEBUG + printf( + "mask: pts_%d(%.3f, %.3f, %.3f), local(%.3f, %.3f, %.3f), idx(%d, %d, " + "%d), res(%.3f, %.3f, %.3f), idx_encoding=%x\n", + pt_idx, pts[0], pts[1], pts[2], local_x, local_y, local_z, x_idx, y_idx, + z_idx, x_res, y_res, z_res, idx_encoding); +#endif + + pts_mask[0] = idx_encoding; + } +} + +__global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, + int max_pts_each_voxel, int out_x, + int out_y, int out_z, + const int *pts_mask, + int *pts_idx_of_voxels) { + // params pts_mask: (N, npoints) 0 or 1 + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + + int box_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (box_idx >= boxes_num) return; + + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + 
k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; + } +#ifdef DEBUG + printf("collect: pts_%d, idx(%d, %d, %d), idx_encoding=%x\n", k, x_idx, + y_idx, z_idx, idx_encoding); +#endif + } + } +} + +__global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const float *pts_feature, + const int *pts_idx_of_voxels, + float *pooled_features, int *argmax) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + +#ifdef DEBUG + printf("src pts_idx_of_voxels: (%p, ), argmax: %p\n", pts_idx_of_voxels, + argmax); +#endif + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; + } + } + + if (argmax_idx != -1) { + pooled_features[0] = max_val; + } + argmax[0] = argmax_idx; + +#ifdef DEBUG + printf( + "channel_%d idx(%d, %d, %d), argmax_idx=(%d, %.3f), total=%d, after " + "pts_idx: %p, argmax: (%p, %d)\n", + channel_idx, x_idx, y_idx, z_idx, argmax_idx, max_val, total_pts, + pts_idx_of_voxels, argmax, argmax_idx); +#endif +} + +__global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const float *pts_feature, + const int *pts_idx_of_voxels, + float *pooled_features) { + // params pts_feature: (npoints, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), + // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) + // params argmax: (N, out_x, out_y, out_z, C) + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * 
out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + float sum_val = 0; + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + } + + if (total_pts > 0) { + pooled_features[0] = sum_val / total_pts; + } +} + +void roiaware_pool3d_launcher(int boxes_num, int pts_num, int channels, + int max_pts_each_voxel, int out_x, int out_y, + int out_z, const float *rois, const float *pts, + const float *pts_feature, int *argmax, + int *pts_idx_of_voxels, float *pooled_features, + int pool_method) { + // params rois: (N, 7) [x, y, z, w, l, h, rz] in LiDAR coordinate + // params pts: (npoints, 3) [x, y, z] in LiDAR coordinate + // params pts_feature: (npoints, C) + // params argmax: (N, out_x, out_y, out_z, C) + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params pooled_features: (N, out_x, out_y, out_z, C) + // params pool_method: 0: max_pool 1: avg_pool + + int *pts_mask = NULL; + cudaMalloc(&pts_mask, boxes_num * pts_num * sizeof(int)); // (N, M) + cudaMemset(pts_mask, -1, boxes_num * pts_num * sizeof(int)); + + dim3 blocks_mask(DIVUP(pts_num, THREADS_PER_BLOCK), boxes_num); + dim3 threads(THREADS_PER_BLOCK); + generate_pts_mask_for_box3d<<>>( + boxes_num, pts_num, out_x, out_y, out_z, rois, pts, pts_mask); + + // TODO: Merge the collect and pool functions, SS + + dim3 blocks_collect(DIVUP(boxes_num, THREADS_PER_BLOCK)); + collect_inside_pts_for_box3d<<>>( + boxes_num, pts_num, max_pts_each_voxel, out_x, out_y, out_z, pts_mask, + pts_idx_of_voxels); + + dim3 blocks_pool(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, + boxes_num); + if (pool_method == 0) { + roiaware_maxpool3d<<>>( + boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, + pts_feature, pts_idx_of_voxels, pooled_features, argmax); + } else if (pool_method == 1) { + roiaware_avgpool3d<<>>( + boxes_num, pts_num, channels, max_pts_each_voxel, out_x, out_y, out_z, + pts_feature, pts_idx_of_voxels, pooled_features); + } + + cudaFree(pts_mask); + +#ifdef DEBUG + cudaDeviceSynchronize(); // for using printf in kernel function +#endif +} + +__global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, + int out_x, int out_y, int out_z, + const int *argmax, + const float *grad_out, + float *grad_in) { + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + if (argmax[0] == -1) return; + + atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); +} + +__global__ void roiaware_avgpool3d_backward(int boxes_num, 
int channels, + int out_x, int out_y, int out_z, + int max_pts_each_voxel, + const int *pts_idx_of_voxels, + const float *grad_out, + float *grad_in) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + + int box_idx = blockIdx.z; + int channel_idx = blockIdx.y; + int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; + + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || + y_idx >= out_y || z_idx >= out_z) + return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int total_pts = pts_idx_of_voxels[0]; + float cur_grad = 1 / fmaxf(float(total_pts), 1.0); + for (int k = 1; k <= total_pts; k++) { + atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, + grad_out[0] * cur_grad); + } +} + +void roiaware_pool3d_backward_launcher(int boxes_num, int out_x, int out_y, + int out_z, int channels, + int max_pts_each_voxel, + const int *pts_idx_of_voxels, + const int *argmax, const float *grad_out, + float *grad_in, int pool_method) { + // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) + // params argmax: (N, out_x, out_y, out_z, C) + // params grad_out: (N, out_x, out_y, out_z, C) + // params grad_in: (npoints, C), return value + // params pool_method: 0: max_pool, 1: avg_pool + + dim3 blocks(DIVUP(out_x * out_y * out_z, THREADS_PER_BLOCK), channels, + boxes_num); + dim3 threads(THREADS_PER_BLOCK); + if (pool_method == 0) { + roiaware_maxpool3d_backward<<>>( + boxes_num, channels, out_x, out_y, out_z, argmax, grad_out, grad_in); + } else if (pool_method == 1) { + roiaware_avgpool3d_backward<<>>( + boxes_num, channels, out_x, out_y, out_z, max_pts_each_voxel, + pts_idx_of_voxels, grad_out, grad_in); + } +} diff --git a/mmcv/ops/voxelize.py b/mmcv/ops/voxelize.py new file mode 100644 index 0000000..d6fc855 --- /dev/null +++ b/mmcv/ops/voxelize.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward']) + + +class _Voxelization(Function): + + @staticmethod + def forward(ctx, + points, + voxel_size, + coors_range, + max_points=35, + max_voxels=20000): + """Convert kitti points(N, >=3) to voxels. + + Args: + points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points + and points[:, 3:] contain other information like reflectivity. + voxel_size (tuple or float): The size of voxel with the shape of + [3]. + coors_range (tuple or float): The coordinate range of voxel with + the shape of [6]. + max_points (int, optional): maximum points contained in a voxel. if + max_points=-1, it means using dynamic_voxelize. Default: 35. + max_voxels (int, optional): maximum voxels this function create. + for second, 20000 is a good choice. Users should shuffle points + before call this function because max_voxels may drop points. + Default: 20000. 
+ + Returns: + voxels_out (torch.Tensor): Output voxels with the shape of [M, + max_points, ndim]. Only contain points and returned when + max_points != -1. + coors_out (torch.Tensor): Output coordinates with the shape of + [M, 3]. + num_points_per_voxel_out (torch.Tensor): Num points per voxel with + the shape of [M]. Only returned when max_points != -1. + """ + if max_points == -1 or max_voxels == -1: + coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int) + ext_module.dynamic_voxelize_forward( + points, + torch.tensor(voxel_size, dtype=torch.float), + torch.tensor(coors_range, dtype=torch.float), + coors, + NDim=3) + return coors + else: + voxels = points.new_zeros( + size=(max_voxels, max_points, points.size(1))) + coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int) + num_points_per_voxel = points.new_zeros( + size=(max_voxels, ), dtype=torch.int) + voxel_num = torch.zeros(size=(), dtype=torch.long) + ext_module.hard_voxelize_forward( + points, + torch.tensor(voxel_size, dtype=torch.float), + torch.tensor(coors_range, dtype=torch.float), + voxels, + coors, + num_points_per_voxel, + voxel_num, + max_points=max_points, + max_voxels=max_voxels, + NDim=3) + # select the valid voxels + voxels_out = voxels[:voxel_num] + coors_out = coors[:voxel_num] + num_points_per_voxel_out = num_points_per_voxel[:voxel_num] + return voxels_out, coors_out, num_points_per_voxel_out + + +voxelization = _Voxelization.apply + + +class Voxelization(nn.Module): + """Convert kitti points(N, >=3) to voxels. + + Please refer to `PVCNN `_ for more + details. + + Args: + voxel_size (tuple or float): The size of voxel with the shape of [3]. + point_cloud_range (tuple or float): The coordinate range of voxel with + the shape of [6]. + max_num_points (int): maximum points contained in a voxel. if + max_points=-1, it means using dynamic_voxelize. + max_voxels (int, optional): maximum voxels this function create. + for second, 20000 is a good choice. Users should shuffle points + before call this function because max_voxels may drop points. + Default: 20000. 
+ """ + + def __init__(self, + voxel_size, + point_cloud_range, + max_num_points, + max_voxels=20000): + super().__init__() + + self.voxel_size = voxel_size + self.point_cloud_range = point_cloud_range + self.max_num_points = max_num_points + if isinstance(max_voxels, tuple): + self.max_voxels = max_voxels + else: + self.max_voxels = _pair(max_voxels) + + point_cloud_range = torch.tensor( + point_cloud_range, dtype=torch.float32) + voxel_size = torch.tensor(voxel_size, dtype=torch.float32) + grid_size = (point_cloud_range[3:] - + point_cloud_range[:3]) / voxel_size + grid_size = torch.round(grid_size).long() + input_feat_shape = grid_size[:2] + self.grid_size = grid_size + # the origin shape is as [x-len, y-len, z-len] + # [w, h, d] -> [d, h, w] + self.pcd_shape = [*input_feat_shape, 1][::-1] + + def forward(self, input): + if self.training: + max_voxels = self.max_voxels[0] + else: + max_voxels = self.max_voxels[1] + + return voxelization(input, self.voxel_size, self.point_cloud_range, + self.max_num_points, max_voxels) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'voxel_size=' + str(self.voxel_size) + s += ', point_cloud_range=' + str(self.point_cloud_range) + s += ', max_num_points=' + str(self.max_num_points) + s += ', max_voxels=' + str(self.max_voxels) + s += ')' + return s diff --git a/mmcv/optims/__init__.py b/mmcv/optims/__init__.py new file mode 100644 index 0000000..55f2449 --- /dev/null +++ b/mmcv/optims/__init__.py @@ -0,0 +1 @@ +from .optimizer import build_optimizer, OPTIMIZERS \ No newline at end of file diff --git a/mmcv/optims/adamw.py b/mmcv/optims/adamw.py new file mode 100644 index 0000000..c890aea --- /dev/null +++ b/mmcv/optims/adamw.py @@ -0,0 +1,131 @@ +try: + from torch.optim import _functional as F +except: + print('WARNING!!!, I recommend using torch>=1.8') + +import torch +from torch.optim.optimizer import Optimizer +from mmcv.runner.optimizer.builder import OPTIMIZERS + +@OPTIMIZERS.register_module() +class AdamW2(Optimizer): + r"""Implements AdamW algorithm. Solve the bug of torch 1.8 + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. 
_On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW2, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW2, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + + # put this line here for solving bug + beta1, beta2 = group['betas'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + F.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad, + beta1, + beta2, + group['lr'], + group['weight_decay'], + group['eps']) + + return loss \ No newline at end of file diff --git a/mmcv/optims/optimizer.py b/mmcv/optims/optimizer.py new file mode 100644 index 0000000..10d3772 --- /dev/null +++ b/mmcv/optims/optimizer.py @@ -0,0 +1,268 @@ +import torch +from torch.nn import GroupNorm, LayerNorm +from mmcv.utils import build_from_cfg, is_list_of, Registry +from torch.nn.modules.instancenorm import _InstanceNorm +from torch.nn.modules.batchnorm import _BatchNorm +from mmcv.utils.ext_loader import check_ops_exist +from mmcv.utils import Registry +import inspect +import copy + +OPTIMIZERS = Registry('optimizer') + +def build_optimizer(model, cfg): + optimizer_cfg = copy.deepcopy(cfg) + paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) + return DefaultOptimizerConstructor(optimizer_cfg, paramwise_cfg)(model) + +def register_torch_optimizers(): + torch_optimizers = [] + for module_name in dir(torch.optim): + if module_name.startswith('__'): + continue + _optim = getattr(torch.optim, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module()(_optim) + torch_optimizers.append(module_name) + return torch_optimizers + +TORCH_OPTIMIZERS = register_torch_optimizers() + +class DefaultOptimizerConstructor: + """Default constructor for optimizers. + + By default each parameter share the same optimizer settings, and we + provide an argument ``paramwise_cfg`` to specify parameter-wise settings. + It is a dict and may contain the following fields: + + - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will + be ignored. It should be noted that the aforementioned ``key`` is the + longest key that is a substring of the name of the parameter. If there + are multiple matched keys with the same length, then the key with lower + alphabet order will be chosen. + ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` + and ``decay_mult``. See Example 2 below. + - ``bias_lr_mult`` (float): It will be multiplied to the learning + rate for all bias parameters (except for those in normalization + layers and offset layers of DCN). + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in + normalization layers, depthwise conv layers, offset layers of DCN). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization + layers. + - ``dwconv_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of depthwise conv + layers. + - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning + rate for parameters of offset layer in the deformable convs + of a model. 
+ - ``bypass_duplicate`` (bool): If true, the duplicate parameters + would not be added into optimizer. Default: False. + + Note: + 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will + override the effect of ``bias_lr_mult`` in the bias of offset + layer. So be careful when using both ``bias_lr_mult`` and + ``dcn_offset_lr_mult``. If you wish to apply both of them to the + offset layer in deformable convs, set ``dcn_offset_lr_mult`` + to the original ``dcn_offset_lr_mult`` * ``bias_lr_mult``. + 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will + apply it to all the DCN layers in the model. So be careful when + the model contains multiple DCN layers in places other than + backbone. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + optimizer_cfg (dict): The config dict of the optimizer. + Positional fields are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + paramwise_cfg (dict, optional): Parameter-wise options. + + Example 1: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, + >>> weight_decay=0.0001) + >>> paramwise_cfg = dict(norm_decay_mult=0.) + >>> optim_builder = DefaultOptimizerConstructor( + >>> optimizer_cfg, paramwise_cfg) + >>> optimizer = optim_builder(model) + + Example 2: + >>> # assume model have attribute model.backbone and model.cls_head + >>> optimizer_cfg = dict(type='SGD', lr=0.01, weight_decay=0.95) + >>> paramwise_cfg = dict(custom_keys={ + '.backbone': dict(lr_mult=0.1, decay_mult=0.9)}) + >>> optim_builder = DefaultOptimizerConstructor( + >>> optimizer_cfg, paramwise_cfg) + >>> optimizer = optim_builder(model) + >>> # Then the `lr` and `weight_decay` for model.backbone is + >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for + >>> # model.cls_head is (0.01, 0.95). 
+ """ + + def __init__(self, optimizer_cfg, paramwise_cfg=None): + if not isinstance(optimizer_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optimizer_cfg)}') + self.optimizer_cfg = optimizer_cfg + self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg + self.base_lr = optimizer_cfg.get('lr', None) + self.base_wd = optimizer_cfg.get('weight_decay', None) + self._validate_cfg() + + def _validate_cfg(self): + if not isinstance(self.paramwise_cfg, dict): + raise TypeError('paramwise_cfg should be None or a dict, ' + f'but got {type(self.paramwise_cfg)}') + + if 'custom_keys' in self.paramwise_cfg: + if not isinstance(self.paramwise_cfg['custom_keys'], dict): + raise TypeError( + 'If specified, custom_keys must be a dict, ' + f'but got {type(self.paramwise_cfg["custom_keys"])}') + if self.base_wd is None: + for key in self.paramwise_cfg['custom_keys']: + if 'decay_mult' in self.paramwise_cfg['custom_keys'][key]: + raise ValueError('base_wd should not be None') + + # get base lr and weight decay + # weight_decay must be explicitly specified if mult is specified + if ('bias_decay_mult' in self.paramwise_cfg + or 'norm_decay_mult' in self.paramwise_cfg + or 'dwconv_decay_mult' in self.paramwise_cfg): + if self.base_wd is None: + raise ValueError('base_wd should not be None') + + def _is_in(self, param_group, param_group_list): + assert is_list_of(param_group_list, dict) + param = set(param_group['params']) + param_set = set() + for group in param_group_list: + param_set.update(set(group['params'])) + + return not param.isdisjoint(param_set) + + def add_params(self, params, module, prefix='', is_dcn_module=None): + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.) + + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if not param.requires_grad: + params.append(param_group) + continue + if bypass_duplicate and self._is_in(param_group, params): + warnings.warn(f'{prefix} is duplicate. 
It is skipped since ' + f'bypass_duplicate={bypass_duplicate}') + continue + # if the parameter match one of the custom keys, ignore other rules + is_custom = False + for key in sorted_keys: + if key in f'{prefix}.{name}': + is_custom = True + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) + param_group['weight_decay'] = self.base_wd * decay_mult + break + + if not is_custom: + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not (is_norm or is_dcn_module): + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # depth-wise conv + elif is_dwconv: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # bias lr and decay + elif name == 'bias' and not is_dcn_module: + # TODO: current bias_decay_mult will have affect on DCN + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + params.append(param_group) + + if check_ops_exist(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + prefix=child_prefix, + is_dcn_module=is_dcn_module) + + def __call__(self, model): + if hasattr(model, 'module'): + model = model.module + + optimizer_cfg = self.optimizer_cfg.copy() + # if no paramwise option is specified, just use the global setting + if not self.paramwise_cfg: + optimizer_cfg['params'] = model.parameters() + return build_from_cfg(optimizer_cfg, OPTIMIZERS) + + # set param-wise lr and weight decay recursively + params = [] + self.add_params(params, model) + optimizer_cfg['params'] = params + + return build_from_cfg(optimizer_cfg, OPTIMIZERS) \ No newline at end of file diff --git a/mmcv/parallel/__init__.py b/mmcv/parallel/__init__.py new file mode 100644 index 0000000..0be77ee --- /dev/null +++ b/mmcv/parallel/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .collate import collate +from .data_container import DataContainer +from .utils import is_module_wrapper \ No newline at end of file diff --git a/mmcv/parallel/collate.py b/mmcv/parallel/collate.py new file mode 100644 index 0000000..d291203 --- /dev/null +++ b/mmcv/parallel/collate.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
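Editor's note (not part of the patch): `build_optimizer` above pops `paramwise_cfg` from the config and delegates to `DefaultOptimizerConstructor`, so per-parameter learning rates and weight decay are expressed directly in the optimizer config. A hedged usage sketch, mirroring the docstring examples; the model and all config values are invented for illustration.

```python
# Minimal sketch of build_optimizer with a paramwise_cfg; values are invented.
import torch
from mmcv.optims import build_optimizer  # exported by mmcv/optims/__init__.py


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = torch.nn.Conv2d(3, 8, 3)
        self.bn = torch.nn.BatchNorm2d(8)
        self.head = torch.nn.Conv2d(8, 2, 1)


cfg = dict(
    type='AdamW',                   # any torch optimizer registered by register_torch_optimizers
    lr=2e-4,
    weight_decay=0.01,
    paramwise_cfg=dict(
        norm_decay_mult=0.0,        # no weight decay on BatchNorm parameters
        custom_keys={'backbone': dict(lr_mult=0.1)}))  # smaller lr for the backbone

optimizer = build_optimizer(ToyModel(), cfg)
# Each parameter ends up in its own param group with the per-group lr / weight_decay applied.
```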
+from collections.abc import Mapping, Sequence + +import torch +import torch.nn.functional as F +from torch.utils.data.dataloader import default_collate + +from .data_container import DataContainer + +def collate(batch, samples_per_gpu=1): + batch = collate_dc(batch, samples_per_gpu) + data_dict = {} + for key, value in batch.items(): + if isinstance(value, DataContainer): + data_dict[key] = value.data[0] + elif isinstance(value[0], DataContainer): + data_dict[key] = value[0].data + else: + data_dict[key] = value + return data_dict + +def collate_dc(batch, samples_per_gpu=1): + """Puts each data field into a tensor/DataContainer with outer dimension + batch size. + + Extend default_collate to add support for + :type:`~mmcv.parallel.DataContainer`. There are 3 cases. + + 1. cpu_only = True, e.g., meta data + 2. cpu_only = False, stack = True, e.g., images tensors + 3. cpu_only = False, stack = False, e.g., gt bboxes + """ + + if not isinstance(batch, Sequence): + raise TypeError(f'{batch.dtype} is not supported.') + + if isinstance(batch[0], DataContainer): + stacked = [] + if batch[0].cpu_only: + for i in range(0, len(batch), samples_per_gpu): + stacked.append( + [sample.data for sample in batch[i:i + samples_per_gpu]]) + return DataContainer( + stacked, batch[0].stack, batch[0].padding_value, cpu_only=True) + elif batch[0].stack: + for i in range(0, len(batch), samples_per_gpu): + assert isinstance(batch[i].data, torch.Tensor) + + if batch[i].pad_dims is not None: + ndim = batch[i].dim() + assert ndim > batch[i].pad_dims + max_shape = [0 for _ in range(batch[i].pad_dims)] + for dim in range(1, batch[i].pad_dims + 1): + max_shape[dim - 1] = batch[i].size(-dim) + for sample in batch[i:i + samples_per_gpu]: + for dim in range(0, ndim - batch[i].pad_dims): + assert batch[i].size(dim) == sample.size(dim) + for dim in range(1, batch[i].pad_dims + 1): + max_shape[dim - 1] = max(max_shape[dim - 1], + sample.size(-dim)) + padded_samples = [] + for sample in batch[i:i + samples_per_gpu]: + pad = [0 for _ in range(batch[i].pad_dims * 2)] + for dim in range(1, batch[i].pad_dims + 1): + pad[2 * dim - + 1] = max_shape[dim - 1] - sample.size(-dim) + padded_samples.append( + F.pad( + sample.data, pad, value=sample.padding_value)) + stacked.append(default_collate(padded_samples)) + elif batch[i].pad_dims is None: + stacked.append( + default_collate([ + sample.data + for sample in batch[i:i + samples_per_gpu] + ])) + else: + raise ValueError( + 'pad_dims should be either None or integers (1-3)') + + else: + for i in range(0, len(batch), samples_per_gpu): + stacked.append( + [sample.data for sample in batch[i:i + samples_per_gpu]]) + return DataContainer(stacked, batch[0].stack, batch[0].padding_value) + elif isinstance(batch[0], Sequence): + transposed = zip(*batch) + return [collate_dc(samples, samples_per_gpu) for samples in transposed] + elif isinstance(batch[0], Mapping): + return { + key: collate_dc([d[key] for d in batch], samples_per_gpu) + for key in batch[0] + } + else: + return default_collate(batch) diff --git a/mmcv/parallel/data_container.py b/mmcv/parallel/data_container.py new file mode 100644 index 0000000..17ba05b --- /dev/null +++ b/mmcv/parallel/data_container.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
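+# Illustrative note (not part of the original file): with `collate` from the
+# preceding file, a dataset sample is typically a dict of DataContainers, e.g.
+# (field names follow the keys used elsewhere in this repo):
+#
+#     dict(img=DataContainer(img_tensor, stack=True),       # case 2: pad & stack
+#          gt_bboxes_3d=DataContainer(boxes, stack=False),  # case 3: keep as list
+#          img_metas=DataContainer(meta, cpu_only=True))    # case 1: stay on CPU
+#
+# `collate(batch, samples_per_gpu)` then groups every field per GPU according
+# to these flags, as described in `collate_dc` above.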
+import functools +import torch + + +def assert_tensor_type(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not isinstance(args[0].data, torch.Tensor): + raise AttributeError( + f'{args[0].__class__.__name__} has no attribute ' + f'{func.__name__} for type {args[0].datatype}') + return func(*args, **kwargs) + + return wrapper + + +class DataContainer: + """A container for any type of objects. + + Typically tensors will be stacked in the collate function and sliced along + some dimension in the scatter function. This behavior has some limitations. + 1. All tensors have to be the same size. + 2. Types are limited (numpy array or Tensor). + + We design `DataContainer` and `MMDataParallel` to overcome these + limitations. The behavior can be either of the following. + + - copy to GPU, pad all tensors to the same size and stack them + - copy to GPU without stacking + - leave the objects as is and pass it to the model + - pad_dims specifies the number of last few dimensions to do padding + """ + + def __init__(self, + data, + stack=False, + padding_value=0, + cpu_only=False, + pad_dims=2): + self._data = data + self._cpu_only = cpu_only + self._stack = stack + self._padding_value = padding_value + assert pad_dims in [None, 1, 2, 3] + self._pad_dims = pad_dims + + def __repr__(self): + return f'{self.__class__.__name__}({repr(self.data)})' + + def __len__(self): + return len(self._data) + + @property + def data(self): + return self._data + + @property + def datatype(self): + if isinstance(self.data, torch.Tensor): + return self.data.type() + else: + return type(self.data) + + @property + def cpu_only(self): + return self._cpu_only + + @property + def stack(self): + return self._stack + + @property + def padding_value(self): + return self._padding_value + + @property + def pad_dims(self): + return self._pad_dims + + @assert_tensor_type + def size(self, *args, **kwargs): + return self.data.size(*args, **kwargs) + + @assert_tensor_type + def dim(self): + return self.data.dim() diff --git a/mmcv/parallel/registry.py b/mmcv/parallel/registry.py new file mode 100644 index 0000000..144f9fb --- /dev/null +++ b/mmcv/parallel/registry.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch.nn.parallel import DataParallel, DistributedDataParallel + +from mmcv.utils import Registry + +MODULE_WRAPPERS = Registry('module wrapper') +MODULE_WRAPPERS.register_module(module=DataParallel) +MODULE_WRAPPERS.register_module(module=DistributedDataParallel) diff --git a/mmcv/parallel/utils.py b/mmcv/parallel/utils.py new file mode 100644 index 0000000..0f5712c --- /dev/null +++ b/mmcv/parallel/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .registry import MODULE_WRAPPERS + + +def is_module_wrapper(module): + """Check if a module is a module wrapper. + + The following 3 modules in MMCV (and their subclasses) are regarded as + module wrappers: DataParallel, DistributedDataParallel, + MMDistributedDataParallel (the deprecated version). You may add you own + module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. + + Args: + module (nn.Module): The module to be checked. + + Returns: + bool: True if the input module is a module wrapper. 
+ """ + module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) + return isinstance(module, module_wrappers) diff --git a/mmcv/runner/__init__.py b/mmcv/runner/__init__.py new file mode 100644 index 0000000..2705045 --- /dev/null +++ b/mmcv/runner/__init__.py @@ -0,0 +1,3 @@ +from .hooks import DistEvalHook, EvalHook, OptimizerHook, HOOKS, DistSamplerSeedHook, Fp16OptimizerHook +from .epoch_based_runner import EpochBasedRunner +from .builder import build_runner \ No newline at end of file diff --git a/mmcv/runner/base_runner.py b/mmcv/runner/base_runner.py new file mode 100644 index 0000000..5d39f23 --- /dev/null +++ b/mmcv/runner/base_runner.py @@ -0,0 +1,532 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import logging +import os.path as osp +import warnings +from abc import ABCMeta, abstractmethod + +import torch +from torch.optim import Optimizer + +from .hooks import HOOKS, Hook +from ..parallel import is_module_wrapper +from ..utils import load_checkpoint, LogBuffer, is_str, mkdir_or_exist, build_from_cfg, \ + Config, get_dist_info, get_time_str, Priority, get_priority + + +class BaseRunner(metaclass=ABCMeta): + """The base class of Runner, a training helper for PyTorch. + + All subclasses should implement the following APIs: + + - ``run()`` + - ``train()`` + - ``val()`` + - ``save_checkpoint()`` + + Args: + model (:obj:`torch.nn.Module`): The model to be run. + batch_processor (callable): A callable method that process a data + batch. The interface of this method should be + `batch_processor(model, data, train_mode) -> dict` + optimizer (dict or :obj:`torch.optim.Optimizer`): It can be either an + optimizer (in most cases) or a dict of optimizers (in models that + requires more than one optimizer, e.g., GAN). + work_dir (str, optional): The working directory to save checkpoints + and logs. Defaults to None. + logger (:obj:`logging.Logger`): Logger used during training. + Defaults to None. (The default value is just for backward + compatibility) + meta (dict | None): A dict records some import information such as + environment info and seed, which will be logged in logger hook. + Defaults to None. + max_epochs (int, optional): Total training epochs. + max_iters (int, optional): Total training iterations. + """ + + def __init__(self, + model, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + max_iters=None, + max_epochs=None): + if batch_processor is not None: + if not callable(batch_processor): + raise TypeError('batch_processor must be callable, ' + f'but got {type(batch_processor)}') + warnings.warn('batch_processor is deprecated, please implement ' + 'train_step() and val_step() in the model instead.') + # raise an error is `batch_processor` is not None and + # `model.train_step()` exists. 
+ if is_module_wrapper(model): + _model = model.module + else: + _model = model + + # check the type of `optimizer` + if isinstance(optimizer, dict): + for name, optim in optimizer.items(): + if not isinstance(optim, Optimizer): + raise TypeError( + f'optimizer must be a dict of torch.optim.Optimizers, ' + f'but optimizer["{name}"] is a {type(optim)}') + elif not isinstance(optimizer, Optimizer) and optimizer is not None: + raise TypeError( + f'optimizer must be a torch.optim.Optimizer object ' + f'or dict or None, but got {type(optimizer)}') + + # check the type of `logger` + if not isinstance(logger, logging.Logger): + raise TypeError(f'logger must be a logging.Logger object, ' + f'but got {type(logger)}') + + # check the type of `meta` + if meta is not None and not isinstance(meta, dict): + raise TypeError( + f'meta must be a dict or None, but got {type(meta)}') + + self.model = model + self.batch_processor = batch_processor + self.optimizer = optimizer + self.logger = logger + self.meta = meta + # create work_dir + if is_str(work_dir): + self.work_dir = osp.abspath(work_dir) + mkdir_or_exist(self.work_dir) + elif work_dir is None: + self.work_dir = None + else: + raise TypeError('"work_dir" must be a str or None') + + # get model name from the model class + if hasattr(self.model, 'module'): + self._model_name = self.model.module.__class__.__name__ + else: + self._model_name = self.model.__class__.__name__ + + self._rank, self._world_size = get_dist_info() + self.timestamp = get_time_str() + self.mode = None + self._hooks = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + + if max_epochs is not None and max_iters is not None: + raise ValueError( + 'Only one of `max_epochs` or `max_iters` can be set.') + + self._max_epochs = max_epochs + self._max_iters = max_iters + # TODO: Redesign LogBuffer, it is not flexible and elegant enough + self.log_buffer = LogBuffer() + + @property + def model_name(self): + """str: Name of the model, usually the module class name.""" + return self._model_name + + @property + def rank(self): + """int: Rank of current process. (distributed training)""" + return self._rank + + @property + def world_size(self): + """int: Number of processes participating in the job. + (distributed training)""" + return self._world_size + + @property + def hooks(self): + """list[:obj:`Hook`]: A list of registered hooks.""" + return self._hooks + + @property + def epoch(self): + """int: Current epoch.""" + return self._epoch + + @property + def iter(self): + """int: Current iteration.""" + return self._iter + + @property + def inner_iter(self): + """int: Iteration in an epoch.""" + return self._inner_iter + + @property + def max_epochs(self): + """int: Maximum training epochs.""" + return self._max_epochs + + @property + def max_iters(self): + """int: Maximum training iterations.""" + return self._max_iters + + @abstractmethod + def train(self): + pass + + @abstractmethod + def val(self): + pass + + @abstractmethod + def run(self, data_loaders, workflow, **kwargs): + pass + + @abstractmethod + def save_checkpoint(self, + out_dir, + filename_tmpl, + save_optimizer=True, + meta=None, + create_symlink=True): + pass + + def current_lr(self): + """Get current learning rates. + + Returns: + list[float] | dict[str, list[float]]: Current learning rates of all + param groups. If the runner has a dict of optimizers, this + method will return a dict. 
+ """ + if isinstance(self.optimizer, torch.optim.Optimizer): + lr = [group['lr'] for group in self.optimizer.param_groups] + elif isinstance(self.optimizer, dict): + lr = dict() + for name, optim in self.optimizer.items(): + lr[name] = [group['lr'] for group in optim.param_groups] + else: + raise RuntimeError( + 'lr is not applicable because optimizer does not exist.') + return lr + + def current_momentum(self): + """Get current momentums. + + Returns: + list[float] | dict[str, list[float]]: Current momentums of all + param groups. If the runner has a dict of optimizers, this + method will return a dict. + """ + + def _get_momentum(optimizer): + momentums = [] + for group in optimizer.param_groups: + if 'momentum' in group.keys(): + momentums.append(group['momentum']) + elif 'betas' in group.keys(): + momentums.append(group['betas'][0]) + else: + momentums.append(0) + return momentums + + if self.optimizer is None: + raise RuntimeError( + 'momentum is not applicable because optimizer does not exist.') + elif isinstance(self.optimizer, torch.optim.Optimizer): + momentums = _get_momentum(self.optimizer) + elif isinstance(self.optimizer, dict): + momentums = dict() + for name, optim in self.optimizer.items(): + momentums[name] = _get_momentum(optim) + return momentums + + def register_hook(self, hook, priority='NORMAL'): + """Register a hook into the hook list. + + The hook will be inserted into a priority queue, with the specified + priority (See :class:`Priority` for details of priorities). + For hooks with the same priority, they will be triggered in the same + order as they are registered. + + Args: + hook (:obj:`Hook`): The hook to be registered. + priority (int or str or :obj:`Priority`): Hook priority. + Lower value means higher priority. + """ + assert isinstance(hook, Hook) + if hasattr(hook, 'priority'): + raise ValueError('"priority" is a reserved attribute for hooks') + priority = get_priority(priority) + hook.priority = priority + # insert the hook to a sorted list + inserted = False + for i in range(len(self._hooks) - 1, -1, -1): + if priority >= self._hooks[i].priority: + self._hooks.insert(i + 1, hook) + inserted = True + break + if not inserted: + self._hooks.insert(0, hook) + + def register_hook_from_cfg(self, hook_cfg): + """Register a hook from its cfg. + + Args: + hook_cfg (dict): Hook config. It should have at least keys 'type' + and 'priority' indicating its type and priority. + + Notes: + The specific hook class to register should not use 'type' and + 'priority' arguments during initialization. + """ + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = build_from_cfg(hook_cfg, HOOKS) + self.register_hook(hook, priority=priority) + + def call_hook(self, fn_name): + """Call all hooks. + + Args: + fn_name (str): The function name in each hook to be called, such as + "before_train_epoch". 
+ """ + for hook in self._hooks: + getattr(hook, fn_name)(self) + + def get_hook_info(self): + # Get hooks info in each stage + stage_hook_map = {stage: [] for stage in Hook.stages} + for hook in self.hooks: + try: + priority = Priority(hook.priority).name + except ValueError: + priority = hook.priority + classname = hook.__class__.__name__ + hook_info = f'({priority:<12}) {classname:<35}' + for trigger_stage in hook.get_triggered_stages(): + stage_hook_map[trigger_stage].append(hook_info) + + stage_hook_infos = [] + for stage in Hook.stages: + hook_infos = stage_hook_map[stage] + if len(hook_infos) > 0: + info = f'{stage}:\n' + info += '\n'.join(hook_infos) + info += '\n -------------------- ' + stage_hook_infos.append(info) + return '\n'.join(stage_hook_infos) + + def load_checkpoint(self, + filename, + map_location='cpu', + strict=False, + revise_keys=[(r'^module.', '')]): + return load_checkpoint( + self.model, + filename, + map_location, + strict, + self.logger, + revise_keys=revise_keys) + + def resume(self, + checkpoint, + resume_optimizer=True, + map_location='default'): + if map_location == 'default': + if torch.cuda.is_available(): + device_id = torch.cuda.current_device() + checkpoint = self.load_checkpoint( + checkpoint, + map_location=lambda storage, loc: storage.cuda(device_id)) + else: + checkpoint = self.load_checkpoint(checkpoint) + else: + checkpoint = self.load_checkpoint( + checkpoint, map_location=map_location) + + self._epoch = checkpoint['meta']['epoch'] + self._iter = checkpoint['meta']['iter'] + if self.meta is None: + self.meta = {} + self.meta.setdefault('hook_msgs', {}) + # load `last_ckpt`, `best_score`, `best_ckpt`, etc. for hook messages + self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {})) + + # Re-calculate the number of iterations when resuming + # models with different number of GPUs + if 'config' in checkpoint['meta']: + config = Config.fromstring( + checkpoint['meta']['config'], file_format='.py') + previous_gpu_ids = config.get('gpu_ids', None) + if previous_gpu_ids and len(previous_gpu_ids) > 0 and len( + previous_gpu_ids) != self.world_size: + self._iter = int(self._iter * len(previous_gpu_ids) / + self.world_size) + self.logger.info('the iteration number is changed due to ' + 'change of GPU number') + + # resume meta information meta + self.meta = checkpoint['meta'] + + if 'optimizer' in checkpoint and resume_optimizer: + if isinstance(self.optimizer, Optimizer): + self.optimizer.load_state_dict(checkpoint['optimizer']) + elif isinstance(self.optimizer, dict): + for k in self.optimizer.keys(): + self.optimizer[k].load_state_dict( + checkpoint['optimizer'][k]) + else: + raise TypeError( + 'Optimizer should be dict or torch.optim.Optimizer ' + f'but got {type(self.optimizer)}') + + self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter) + + def register_lr_hook(self, lr_config): + if lr_config is None: + return + elif isinstance(lr_config, dict): + assert 'policy' in lr_config + policy_type = lr_config.pop('policy') + # If the type of policy is all in lower case, e.g., 'cyclic', + # then its first letter will be capitalized, e.g., to be 'Cyclic'. + # This is for the convenient usage of Lr updater. + # Since this is not applicable for ` + # CosineAnnealingLrUpdater`, + # the string will not be changed if it contains capital letters. 
+ if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'LrUpdaterHook' + lr_config['type'] = hook_type + hook = build_from_cfg(lr_config, HOOKS) + else: + hook = lr_config + self.register_hook(hook, priority='VERY_HIGH') + + def register_momentum_hook(self, momentum_config): + if momentum_config is None: + return + if isinstance(momentum_config, dict): + assert 'policy' in momentum_config + policy_type = momentum_config.pop('policy') + # If the type of policy is all in lower case, e.g., 'cyclic', + # then its first letter will be capitalized, e.g., to be 'Cyclic'. + # This is for the convenient usage of momentum updater. + # Since this is not applicable for + # `CosineAnnealingMomentumUpdater`, + # the string will not be changed if it contains capital letters. + if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'MomentumUpdaterHook' + momentum_config['type'] = hook_type + hook = build_from_cfg(momentum_config, HOOKS) + else: + hook = momentum_config + self.register_hook(hook, priority='HIGH') + + def register_optimizer_hook(self, optimizer_config): + if optimizer_config is None: + return + if isinstance(optimizer_config, dict): + optimizer_config.setdefault('type', 'OptimizerHook') + hook = build_from_cfg(optimizer_config, HOOKS) + else: + hook = optimizer_config + self.register_hook(hook, priority='ABOVE_NORMAL') + + def register_checkpoint_hook(self, checkpoint_config): + if checkpoint_config is None: + return + if isinstance(checkpoint_config, dict): + checkpoint_config.setdefault('type', 'CheckpointHook') + hook = build_from_cfg(checkpoint_config, HOOKS) + else: + hook = checkpoint_config + self.register_hook(hook, priority='NORMAL') + + def register_logger_hooks(self, log_config): + if log_config is None: + return + log_interval = log_config['interval'] + for info in log_config['hooks']: + logger_hook = build_from_cfg( + info, HOOKS, default_args=dict(interval=log_interval)) + self.register_hook(logger_hook, priority='VERY_LOW') + + def register_timer_hook(self, timer_config): + if timer_config is None: + return + if isinstance(timer_config, dict): + timer_config_ = copy.deepcopy(timer_config) + hook = build_from_cfg(timer_config_, HOOKS) + else: + hook = timer_config + self.register_hook(hook, priority='LOW') + + def register_custom_hooks(self, custom_config): + if custom_config is None: + return + + if not isinstance(custom_config, list): + custom_config = [custom_config] + + for item in custom_config: + if isinstance(item, dict): + self.register_hook_from_cfg(item) + else: + self.register_hook(item, priority='NORMAL') + + def register_profiler_hook(self, profiler_config): + if profiler_config is None: + return + if isinstance(profiler_config, dict): + profiler_config.setdefault('type', 'ProfilerHook') + hook = build_from_cfg(profiler_config, HOOKS) + else: + hook = profiler_config + self.register_hook(hook) + + def register_training_hooks(self, + lr_config, + optimizer_config=None, + checkpoint_config=None, + log_config=None, + momentum_config=None, + timer_config=dict(type='IterTimerHook'), + custom_hooks_config=None): + """Register default and custom hooks for training. 
+
+        Default and custom hooks include:
+
+        +----------------------+-------------------------+
+        | Hooks                | Priority                |
+        +======================+=========================+
+        | LrUpdaterHook        | VERY_HIGH (10)          |
+        +----------------------+-------------------------+
+        | MomentumUpdaterHook  | HIGH (30)               |
+        +----------------------+-------------------------+
+        | OptimizerStepperHook | ABOVE_NORMAL (40)       |
+        +----------------------+-------------------------+
+        | CheckpointSaverHook  | NORMAL (50)             |
+        +----------------------+-------------------------+
+        | IterTimerHook        | LOW (70)                |
+        +----------------------+-------------------------+
+        | LoggerHook(s)        | VERY_LOW (90)           |
+        +----------------------+-------------------------+
+        | CustomHook(s)        | defaults to NORMAL (50) |
+        +----------------------+-------------------------+
+
+        If custom hooks have the same priority as default hooks, the custom
+        hooks will be triggered after the default hooks.
+        """
+        self.register_lr_hook(lr_config)
+        self.register_momentum_hook(momentum_config)
+        self.register_optimizer_hook(optimizer_config)
+        self.register_checkpoint_hook(checkpoint_config)
+        self.register_timer_hook(timer_config)
+        self.register_logger_hooks(log_config)
+        self.register_custom_hooks(custom_hooks_config)
diff --git a/mmcv/runner/builder.py b/mmcv/runner/builder.py
new file mode 100644
index 0000000..6443fe3
--- /dev/null
+++ b/mmcv/runner/builder.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+from mmcv.utils import Registry
+
+RUNNERS = Registry('runner')
+
+def build_runner(cfg, default_args=None):
+    runner_cfg = copy.deepcopy(cfg)
+    runner = RUNNERS.build(runner_cfg, default_args=default_args)
+    return runner
\ No newline at end of file
diff --git a/mmcv/runner/epoch_based_runner.py b/mmcv/runner/epoch_based_runner.py
new file mode 100644
index 0000000..7139e80
--- /dev/null
+++ b/mmcv/runner/epoch_based_runner.py
@@ -0,0 +1,262 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import platform
+import shutil
+import time
+import warnings
+
+import torch
+
+from .base_runner import BaseRunner
+from .builder import RUNNERS
+from ..parallel import DataContainer  # used by EpochBasedRunner_video below
+from ..utils import save_checkpoint, is_list_of, symlink, get_host_info
+
+
+@RUNNERS.register_module()
+class EpochBasedRunner(BaseRunner):
+    """Epoch-based Runner.
+
+    This runner trains models epoch by epoch.
+ """ + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + outputs = self.batch_processor( + self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + outputs = self.model(data_batch, return_loss=train_mode, **kwargs) + else: + outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs + + def train(self, data_loader, **kwargs): + self.model.train() + self.mode = 'train' + self.data_loader = data_loader + self._max_iters = self._max_epochs * len(self.data_loader) + self.call_hook('before_train_epoch') + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(self.data_loader): + self._inner_iter = i + self.call_hook('before_train_iter') + self.run_iter(data_batch, train_mode=True, **kwargs) + self.call_hook('after_train_iter') + self._iter += 1 + + self.call_hook('after_train_epoch') + self._epoch += 1 + + @torch.no_grad() + def val(self, data_loader, **kwargs): + self.model.eval() + self.mode = 'val' + self.data_loader = data_loader + self.call_hook('before_val_epoch') + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(self.data_loader): + self._inner_iter = i + self.call_hook('before_val_iter') + self.run_iter(data_batch, train_mode=False) + self.call_hook('after_val_iter') + + + self.call_hook('after_val_epoch') + + def run(self, data_loaders, workflow, max_epochs=None, **kwargs): + """Start running. + + Args: + data_loaders (list[:obj:`DataLoader`]): Dataloaders for training + and validation. + workflow (list[tuple]): A list of (phase, epochs) to specify the + running order and epochs. E.g, [('train', 2), ('val', 1)] means + running 2 epochs for training and 1 epoch for validation, + iteratively. 
+ """ + assert isinstance(data_loaders, list) + assert is_list_of(workflow, tuple) + assert len(data_loaders) == len(workflow) + if max_epochs is not None: + warnings.warn( + 'setting max_epochs in run is deprecated, ' + 'please set max_epochs in runner_config', DeprecationWarning) + self._max_epochs = max_epochs + + assert self._max_epochs is not None, ( + 'max_epochs must be specified during instantiation') + + for i, flow in enumerate(workflow): + mode, epochs = flow + if mode == 'train': + self._max_iters = self._max_epochs * len(data_loaders[i]) + break + + work_dir = self.work_dir if self.work_dir is not None else 'NONE' + self.logger.info('Start running, host: %s, work_dir: %s', + get_host_info(), work_dir) + self.logger.info('Hooks will be executed in the following order:\n%s', + self.get_hook_info()) + self.logger.info('workflow: %s, max: %d epochs', workflow, + self._max_epochs) + self.call_hook('before_run') + + while self.epoch < self._max_epochs: + for i, flow in enumerate(workflow): + mode, epochs = flow + if isinstance(mode, str): # self.train() + if not hasattr(self, mode): + raise ValueError( + f'runner has no method named "{mode}" to run an ' + 'epoch') + epoch_runner = getattr(self, mode) + else: + raise TypeError( + 'mode in workflow must be a str, but got {}'.format( + type(mode))) + + for _ in range(epochs): + if mode == 'train' and self.epoch >= self._max_epochs: + break + epoch_runner(data_loaders[i], **kwargs) + + time.sleep(1) # wait for some hooks like loggers to finish + self.call_hook('after_run') + + def save_checkpoint(self, + out_dir, + filename_tmpl='epoch_{}.pth', + save_optimizer=True, + meta=None, + create_symlink=True): + """Save the checkpoint. + + Args: + out_dir (str): The directory that checkpoints are saved. + filename_tmpl (str, optional): The checkpoint filename template, + which contains a placeholder for the epoch number. + Defaults to 'epoch_{}.pth'. + save_optimizer (bool, optional): Whether to save the optimizer to + the checkpoint. Defaults to True. + meta (dict, optional): The meta information to be saved in the + checkpoint. Defaults to None. + create_symlink (bool, optional): Whether to create a symlink + "latest.pth" to point to the latest checkpoint. + Defaults to True. + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError( + f'meta should be a dict or None, but got {type(meta)}') + if self.meta is not None: + meta.update(self.meta) + # Note: meta.update(self.meta) should be done before + # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise + # there will be problems with resumed checkpoints. + # More details in https://github.com/open-mmlab/mmcv/pull/1108 + meta.update(epoch=self.epoch + 1, iter=self.iter) + + filename = filename_tmpl.format(self.epoch + 1) + filepath = osp.join(out_dir, filename) + optimizer = self.optimizer if save_optimizer else None + save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) + # in some environments, `os.symlink` is not supported, you may need to + # set `create_symlink` to False + if create_symlink: + dst_file = osp.join(out_dir, 'latest.pth') + if platform.system() != 'Windows': + symlink(filename, dst_file) + else: + shutil.copy(filepath, dst_file) + + + +@RUNNERS.register_module() +class EpochBasedRunner_video(EpochBasedRunner): + + ''' + # basic logic + + input_sequence = [a, b, c] # given a sequence of samples + + prev_bev = None + for each in input_sequcene[:-1] + prev_bev = eval_model(each, prev_bev)) # inference only. 
+ + model(input_sequcene[-1], prev_bev) # train the last sample. + ''' + + def __init__(self, + model, + eval_model=None, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'], + max_iters=None, + max_epochs=None): + super().__init__(model, + batch_processor, + optimizer, + work_dir, + logger, + meta, + max_iters, + max_epochs) + keys.append('img_metas') + self.keys = keys + self.eval_model = eval_model + self.eval_model.eval() + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + assert False + # outputs = self.batch_processor( + # self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + + num_samples = data_batch['img'].data[0].size(1) + data_list = [] + prev_bev = None + for i in range(num_samples): + data = {} + for key in self.keys: + if key not in ['img_metas', 'img', 'points']: + data[key] = data_batch[key] + else: + if key == 'img': + data['img'] = DataContainer(data=[data_batch['img'].data[0][:, i]], cpu_only=data_batch['img'].cpu_only, stack=True) + elif key == 'img_metas': + data['img_metas'] = DataContainer(data=[[each[i] for each in data_batch['img_metas'].data[0]]], cpu_only=data_batch['img_metas'].cpu_only) + else: + assert False + data_list.append(data) + with torch.no_grad(): + for i in range(num_samples-1): + if i>0: data_list[i]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + prev_bev = self.eval_model.val_step(data_list[i], self.optimizer, **kwargs) + + data_list[-1]['prev_bev'] = DataContainer(data=[prev_bev], cpu_only=False) + outputs = self.model.train_step(data_list[-1], self.optimizer, **kwargs) + else: + assert False + # outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs + diff --git a/mmcv/runner/hooks/__init__.py b/mmcv/runner/hooks/__init__.py new file mode 100644 index 0000000..4c5c3a2 --- /dev/null +++ b/mmcv/runner/hooks/__init__.py @@ -0,0 +1,9 @@ +from .evaluation import DistEvalHook, EvalHook +from .optimizer import OptimizerHook, Fp16OptimizerHook +from .sampler_seed import DistSamplerSeedHook +from .hook import HOOKS, Hook +from .lr_updater import LrUpdaterHook +from .checkpoint import CheckpointHook +from .iter_timer import IterTimerHook +from .logger import * +from .vad_hooks import * \ No newline at end of file diff --git a/mmcv/runner/hooks/checkpoint.py b/mmcv/runner/hooks/checkpoint.py new file mode 100644 index 0000000..0cf051d --- /dev/null +++ b/mmcv/runner/hooks/checkpoint.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings + +from mmcv.fileio.file_client import FileClient +from mmcv.utils import allreduce_params, master_only +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class CheckpointHook(Hook): + """Save checkpoints periodically. + + Args: + interval (int): The saving period. If ``by_epoch=True``, interval + indicates epochs, otherwise it indicates iterations. + Default: -1, which means "never". + by_epoch (bool): Saving checkpoints by epoch or by iteration. + Default: True. + save_optimizer (bool): Whether to save optimizer state_dict in the + checkpoint. It is usually used for resuming experiments. + Default: True. 
+ out_dir (str, optional): The root directory to save checkpoints. If not + specified, ``runner.work_dir`` will be used by default. If + specified, the ``out_dir`` will be the concatenation of ``out_dir`` + and the last level directory of ``runner.work_dir``. + `Changed in version 1.3.16.` + max_keep_ckpts (int, optional): The maximum checkpoints to keep. + In some cases we want only the latest few checkpoints and would + like to delete old ones to save the disk space. + Default: -1, which means unlimited. + save_last (bool, optional): Whether to force the last checkpoint to be + saved regardless of interval. Default: True. + sync_buffer (bool, optional): Whether to synchronize buffers in + different gpus. Default: False. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + `New in version 1.3.16.` + + .. warning:: + Before v1.3.16, the ``out_dir`` argument indicates the path where the + checkpoint is stored. However, since v1.3.16, ``out_dir`` indicates the + root directory and the final path to save checkpoint is the + concatenation of ``out_dir`` and the last level directory of + ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A" + and the value of ``runner.work_dir`` is "/path/of/B", then the final + path will be "/path/of/A/B". + """ + + def __init__(self, + interval=-1, + by_epoch=True, + save_optimizer=True, + out_dir=None, + max_keep_ckpts=-1, + save_last=True, + sync_buffer=False, + file_client_args=None, + **kwargs): + self.interval = interval + self.by_epoch = by_epoch + self.save_optimizer = save_optimizer + self.out_dir = out_dir + self.max_keep_ckpts = max_keep_ckpts + self.save_last = save_last + self.args = kwargs + self.sync_buffer = sync_buffer + self.file_client_args = file_client_args + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + + runner.logger.info((f'Checkpoints will be saved to {self.out_dir} by ' + f'{self.file_client.name}.')) + + # disable the create_symlink option because some file backends do not + # allow to create a symlink + if 'create_symlink' in self.args: + if self.args[ + 'create_symlink'] and not self.file_client.allow_symlink: + self.args['create_symlink'] = False + warnings.warn( + ('create_symlink is set as True by the user but is changed' + 'to be False because creating symbolic link is not ' + f'allowed in {self.file_client.name}')) + else: + self.args['create_symlink'] = self.file_client.allow_symlink + + def after_train_epoch(self, runner): + if not self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` epochs + # 2. 
reach the last epoch of training + if self.every_n_epochs( + runner, self.interval) or (self.save_last + and self.is_last_epoch(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.epoch + 1} epochs') + if self.sync_buffer: + allreduce_params(runner.model.buffers()) + self._save_checkpoint(runner) + + @master_only + def _save_checkpoint(self, runner): + """Save the current checkpoint and delete unwanted checkpoint.""" + runner.save_checkpoint( + self.out_dir, save_optimizer=self.save_optimizer, **self.args) + if runner.meta is not None: + if self.by_epoch: + cur_ckpt_filename = self.args.get( + 'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1) + else: + cur_ckpt_filename = self.args.get( + 'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1) + runner.meta.setdefault('hook_msgs', dict()) + runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path( + self.out_dir, cur_ckpt_filename) + # remove other checkpoints + if self.max_keep_ckpts > 0: + if self.by_epoch: + name = 'epoch_{}.pth' + current_ckpt = runner.epoch + 1 + else: + name = 'iter_{}.pth' + current_ckpt = runner.iter + 1 + redundant_ckpts = range( + current_ckpt - self.max_keep_ckpts * self.interval, 0, + -self.interval) + filename_tmpl = self.args.get('filename_tmpl', name) + for _step in redundant_ckpts: + ckpt_path = self.file_client.join_path( + self.out_dir, filename_tmpl.format(_step)) + if self.file_client.isfile(ckpt_path): + self.file_client.remove(ckpt_path) + else: + break + + def after_train_iter(self, runner): + if self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` iterations + # 2. reach the last iteration of training + if self.every_n_iters( + runner, self.interval) or (self.save_last + and self.is_last_iter(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.iter + 1} iterations') + if self.sync_buffer: + allreduce_params(runner.model.buffers()) + self._save_checkpoint(runner) diff --git a/mmcv/runner/hooks/evaluation.py b/mmcv/runner/hooks/evaluation.py new file mode 100644 index 0000000..b09243a --- /dev/null +++ b/mmcv/runner/hooks/evaluation.py @@ -0,0 +1,507 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from math import inf + +import torch.distributed as dist +from torch.nn.modules.batchnorm import _BatchNorm +from torch.utils.data import DataLoader + +from mmcv.fileio.file_client import FileClient +from mmcv.utils import is_seq_of +from .hook import Hook +from .logger import LoggerHook + + +class EvalHook(Hook): + """Non-Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in non-distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch. It enables + evaluation before the training starts if ``start`` <= the resuming + epoch. If None, whether to evaluate is merely decided by + ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + Default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. 
The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader, and return the test results. If ``None``, the default + test function ``mmcv.engine.single_gpu_test`` will be used. + (default: ``None``) + greater_keys (List[str] | None, optional): Metric keys that will be + inferred by 'greater' comparison rule. If ``None``, + _default_greater_keys will be used. (default: ``None``) + less_keys (List[str] | None, optional): Metric keys that will be + inferred by 'less' comparison rule. If ``None``, _default_less_keys + will be used. (default: ``None``) + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + `New in version 1.3.16.` + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + `New in version 1.3.16.` + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + + Notes: + If new arguments are added for EvalHook, tools/test.py, + tools/eval_metric.py may be affected. + """ + + # Since the key for determine greater or less is related to the downstream + # tasks, downstream repos may need to overwrite the following inner + # variable accordingly. 
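+    # For example (illustrative): a metric named 'bbox_mAP' matches the 'mAP'
+    # entry in `_default_greater_keys` and is compared with the 'greater' rule,
+    # while a key containing 'loss' falls back to the 'less' rule.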
+
+    rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y}
+    init_value_map = {'greater': -inf, 'less': inf}
+    _default_greater_keys = [
+        'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU',
+        'mAcc', 'aAcc'
+    ]
+    _default_less_keys = ['loss']
+
+    def __init__(self,
+                 dataloader,
+                 start=None,
+                 interval=1,
+                 by_epoch=True,
+                 save_best=None,
+                 rule=None,
+                 test_fn=None,
+                 greater_keys=None,
+                 less_keys=None,
+                 out_dir=None,
+                 file_client_args=None,
+                 **eval_kwargs):
+        if not isinstance(dataloader, DataLoader):
+            raise TypeError(f'dataloader must be a pytorch DataLoader, '
+                            f'but got {type(dataloader)}')
+
+        if interval <= 0:
+            raise ValueError(f'interval must be a positive number, '
+                             f'but got {interval}')
+
+        assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean'
+
+        if start is not None and start < 0:
+            raise ValueError(f'The evaluation start epoch {start} is smaller '
+                             f'than 0')
+
+        self.dataloader = dataloader
+        self.interval = interval
+        self.start = start
+        self.by_epoch = by_epoch
+
+        assert isinstance(save_best, str) or save_best is None, \
+            '``save_best`` should be a str or None ' \
+            f'rather than {type(save_best)}'
+        self.save_best = save_best
+        self.eval_kwargs = eval_kwargs
+        self.initial_flag = True
+
+        if test_fn is None:
+            raise NotImplementedError(
+                'single_gpu_test is not implemented in this codebase, '
+                'please pass test_fn explicitly')
+        else:
+            self.test_fn = test_fn
+
+        if greater_keys is None:
+            self.greater_keys = self._default_greater_keys
+        else:
+            if not isinstance(greater_keys, (list, tuple)):
+                greater_keys = (greater_keys, )
+            assert is_seq_of(greater_keys, str)
+            self.greater_keys = greater_keys
+
+        if less_keys is None:
+            self.less_keys = self._default_less_keys
+        else:
+            if not isinstance(less_keys, (list, tuple)):
+                less_keys = (less_keys, )
+            assert is_seq_of(less_keys, str)
+            self.less_keys = less_keys
+
+        if self.save_best is not None:
+            self.best_ckpt_path = None
+            self._init_rule(rule, self.save_best)
+
+        self.out_dir = out_dir
+        self.file_client_args = file_client_args
+
+    def _init_rule(self, rule, key_indicator):
+        """Initialize rule, key_indicator, comparison_func, and best score.
+
+        The following rules determine which comparison rule is used for a key
+        indicator when ``rule`` is not specified (note that the key indicator
+        matching is case-insensitive):
+        1. If the key indicator is in ``self.greater_keys``, the rule will be
+           specified as 'greater'.
+        2. Or if the key indicator is in ``self.less_keys``, the rule will be
+           specified as 'less'.
+        3. Or if any one item in ``self.greater_keys`` is a substring of the
+           key indicator, the rule will be specified as 'greater'.
+        4. Or if any one item in ``self.less_keys`` is a substring of the
+           key indicator, the rule will be specified as 'less'.
+
+        Args:
+            rule (str | None): Comparison rule for best score.
+            key_indicator (str | None): Key indicator to determine the
+                comparison rule.
+ """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + # `_lc` here means we use the lower case of keys for + # case-insensitive matching + key_indicator_lc = key_indicator.lower() + greater_keys = [key.lower() for key in self.greater_keys] + less_keys = [key.lower() for key in self.less_keys] + + if key_indicator_lc in greater_keys: + rule = 'greater' + elif key_indicator_lc in less_keys: + rule = 'less' + elif any(key in key_indicator_lc for key in greater_keys): + rule = 'greater' + elif any(key in key_indicator_lc for key in less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + (f'The best checkpoint will be saved to {self.out_dir} by ' + f'{self.file_client.name}')) + + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. Creating an empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + self.best_ckpt_path = runner.meta['hook_msgs'].get( + 'best_ckpt', None) + + def before_train_iter(self, runner): + """Evaluate the model only at the start of training by iteration.""" + if self.by_epoch or not self.initial_flag: + return + if self.start is not None and runner.iter >= self.start: + self.after_train_iter(runner) + self.initial_flag = False + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + if not (self.by_epoch and self.initial_flag): + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_flag = False + + def after_train_iter(self, runner): + """Called after every training iter to evaluate the results.""" + if not self.by_epoch and self._should_evaluate(runner): + # Because the priority of EvalHook is higher than LoggerHook, the + # training log and the evaluating log are mixed. Therefore, + # we need to dump the training log and clear it before evaluating + # log is generated. In addition, this problem will only appear in + # `IterBasedRunner` whose `self.by_epoch` is False, because + # `EpochBasedRunner` whose `self.by_epoch` is True calls + # `_do_evaluate` in `after_train_epoch` stage, and at this stage + # the training log has been printed, so it will not cause any + # problem. 
more details at + # https://github.com/open-mmlab/mmsegmentation/issues/694 + for hook in runner._hooks: + if isinstance(hook, LoggerHook): + hook.after_train_iter(runner) + runner.log_buffer.clear() + + self._do_evaluate(runner) + + def after_train_epoch(self, runner): + """Called after every training epoch to evaluate the results.""" + if self.by_epoch and self._should_evaluate(runner): + self._do_evaluate(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + results = self.test_fn(runner.model, self.dataloader) + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to save + # the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) + + def _should_evaluate(self, runner): + """Judge whether to perform evaluation. + + Here is the rule to judge whether to perform evaluation: + 1. It will not perform evaluation during the epoch/iteration interval, + which is determined by ``self.interval``. + 2. It will not perform evaluation if the start time is larger than + current time. + 3. It will not perform evaluation when current time is larger than + the start time but during epoch/iteration interval. + + Returns: + bool: The flag indicating whether to perform evaluation. + """ + if self.by_epoch: + current = runner.epoch + check_time = self.every_n_epochs + else: + current = runner.iter + check_time = self.every_n_iters + + if self.start is None: + if not check_time(runner, self.interval): + # No evaluation during the interval. + return False + elif (current + 1) < self.start: + # No evaluation if start is larger than the current time. + return False + else: + # Evaluation only at epochs/iters 3, 5, 7... + # if start==3 and interval==2 + if (current + 1 - self.start) % self.interval: + return False + return True + + def _save_ckpt(self, runner, key_score): + """Save the best checkpoint. + + It will compare the score according to the compare function, write + related information (best score, best checkpoint path) and save the + best checkpoint into ``work_dir``. + """ + if self.by_epoch: + current = f'epoch_{runner.epoch + 1}' + cur_type, cur_time = 'epoch', runner.epoch + 1 + else: + current = f'iter_{runner.iter + 1}' + cur_type, cur_time = 'iter', runner.iter + 1 + + best_score = runner.meta['hook_msgs'].get( + 'best_score', self.init_value_map[self.rule]) + if self.compare_func(key_score, best_score): + best_score = key_score + runner.meta['hook_msgs']['best_score'] = best_score + + if self.best_ckpt_path and self.file_client.isfile( + self.best_ckpt_path): + self.file_client.remove(self.best_ckpt_path) + runner.logger.info( + (f'The previous best checkpoint {self.best_ckpt_path} was ' + 'removed')) + + best_ckpt_name = f'best_{self.key_indicator}_{current}.pth' + self.best_ckpt_path = self.file_client.join_path( + self.out_dir, best_ckpt_name) + runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path + + runner.save_checkpoint( + self.out_dir, best_ckpt_name, create_symlink=False) + runner.logger.info( + f'Now best checkpoint is saved as {best_ckpt_name}.') + runner.logger.info( + f'Best {self.key_indicator} is {best_score:0.4f} ' + f'at {cur_time} {cur_type}.') + + def evaluate(self, runner, results): + """Evaluate the results. + + Args: + runner (:obj:`mmcv.Runner`): The underlined training runner. + results (list): Output results. 
+ """ + eval_res = self.dataloader.dataset.evaluate( + results, logger=runner.logger, **self.eval_kwargs) + + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + + if self.save_best is not None: + # If the performance of model is pool, the `eval_res` may be an + # empty dict and it will raise exception when `self.save_best` is + # not None. More details at + # https://github.com/open-mmlab/mmdetection/issues/6265. + if not eval_res: + warnings.warn( + 'Since `eval_res` is an empty dict, the behavior to save ' + 'the best checkpoint will be skipped in this evaluation.') + return None + + if self.key_indicator == 'auto': + # infer from eval_results + self._init_rule(self.rule, list(eval_res.keys())[0]) + return eval_res[self.key_indicator] + + return None + + +class DistEvalHook(EvalHook): + """Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch. It enables + evaluation before the training starts if ``start`` <= the resuming + epoch. If None, whether to evaluate is merely decided by + ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader in a multi-gpu manner, and return the test results. If + ``None``, the default test function ``mmcv.engine.multi_gpu_test`` + will be used. (default: ``None``) + tmpdir (str | None): Temporary directory to save the results of all + processes. Default: None. + gpu_collect (bool): Whether to use gpu or cpu to collect results. + Default: False. + broadcast_bn_buffer (bool): Whether to broadcast the + buffer(running_mean and running_var) of rank 0 to other rank + before evaluation. Default: True. + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. 
+ """ + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=None, + less_keys=None, + broadcast_bn_buffer=True, + tmpdir=None, + gpu_collect=False, + out_dir=None, + file_client_args=None, + **eval_kwargs): + + if test_fn is None: + raise 'not implement multi_gpu_test test_fn' + + super().__init__( + dataloader, + start=start, + interval=interval, + by_epoch=by_epoch, + save_best=save_best, + rule=rule, + test_fn=test_fn, + greater_keys=greater_keys, + less_keys=less_keys, + out_dir=out_dir, + file_client_args=file_client_args, + **eval_kwargs) + + self.broadcast_bn_buffer = broadcast_bn_buffer + self.tmpdir = tmpdir + self.gpu_collect = gpu_collect + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. + if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + results = self.test_fn( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to + # save the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) diff --git a/mmcv/runner/hooks/hook.py b/mmcv/runner/hooks/hook.py new file mode 100644 index 0000000..f2d1c98 --- /dev/null +++ b/mmcv/runner/hooks/hook.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmcv.utils import Registry, is_method_overridden + +HOOKS = Registry('hook') + + +class Hook: + stages = ('before_run', 'before_train_epoch', 'before_train_iter', + 'after_train_iter', 'after_train_epoch', 'before_val_epoch', + 'before_val_iter', 'after_val_iter', 'after_val_epoch', + 'after_run') + + def before_run(self, runner): + pass + + def after_run(self, runner): + pass + + def before_epoch(self, runner): + pass + + def after_epoch(self, runner): + pass + + def before_iter(self, runner): + pass + + def after_iter(self, runner): + pass + + def before_train_epoch(self, runner): + self.before_epoch(runner) + + def before_val_epoch(self, runner): + self.before_epoch(runner) + + def after_train_epoch(self, runner): + self.after_epoch(runner) + + def after_val_epoch(self, runner): + self.after_epoch(runner) + + def before_train_iter(self, runner): + self.before_iter(runner) + + def before_val_iter(self, runner): + self.before_iter(runner) + + def after_train_iter(self, runner): + self.after_iter(runner) + + def after_val_iter(self, runner): + self.after_iter(runner) + + def every_n_epochs(self, runner, n): + return (runner.epoch + 1) % n == 0 if n > 0 else False + + def every_n_inner_iters(self, runner, n): + return (runner.inner_iter + 1) % n == 0 if n > 0 else False + + def every_n_iters(self, runner, n): + return (runner.iter + 1) % n == 0 if n > 0 else False + + def end_of_epoch(self, runner): + return runner.inner_iter + 1 == len(runner.data_loader) + + def is_last_epoch(self, runner): + return runner.epoch + 1 == runner._max_epochs + + def is_last_iter(self, runner): + return runner.iter + 1 == runner._max_iters + + def get_triggered_stages(self): + trigger_stages = set() + for stage in Hook.stages: + if is_method_overridden(stage, Hook, self): + trigger_stages.add(stage) + + # some methods will be triggered in multi stages + # use this dict to map method to stages. + method_stages_map = { + 'before_epoch': ['before_train_epoch', 'before_val_epoch'], + 'after_epoch': ['after_train_epoch', 'after_val_epoch'], + 'before_iter': ['before_train_iter', 'before_val_iter'], + 'after_iter': ['after_train_iter', 'after_val_iter'], + } + + for method, map_stages in method_stages_map.items(): + if is_method_overridden(method, Hook, self): + trigger_stages.update(map_stages) + + return [stage for stage in Hook.stages if stage in trigger_stages] diff --git a/mmcv/runner/hooks/iter_timer.py b/mmcv/runner/hooks/iter_timer.py new file mode 100644 index 0000000..cfd5002 --- /dev/null +++ b/mmcv/runner/hooks/iter_timer.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
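+#
+# NOTE: `IterTimerHook` below records two scalars into the runner's log
+# buffer: `data_time` (time spent waiting on the dataloader) and `time`
+# (total wall-clock time of the iteration, including the forward/backward
+# pass). They are averaged and printed by the logger hooks, e.g. as
+# `time: 0.532, data_time: 0.047` in the text log (illustrative values).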
+import time + +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class IterTimerHook(Hook): + + def before_epoch(self, runner): + self.t = time.time() + + def before_iter(self, runner): + runner.log_buffer.update({'data_time': time.time() - self.t}) + + def after_iter(self, runner): + runner.log_buffer.update({'time': time.time() - self.t}) + self.t = time.time() diff --git a/mmcv/runner/hooks/logger/__init__.py b/mmcv/runner/hooks/logger/__init__.py new file mode 100644 index 0000000..409be48 --- /dev/null +++ b/mmcv/runner/hooks/logger/__init__.py @@ -0,0 +1,3 @@ +from .base import LoggerHook +from .text import TextLoggerHook +from .tensorboard import TensorboardLoggerHook \ No newline at end of file diff --git a/mmcv/runner/hooks/logger/base.py b/mmcv/runner/hooks/logger/base.py new file mode 100644 index 0000000..f845256 --- /dev/null +++ b/mmcv/runner/hooks/logger/base.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers +from abc import ABCMeta, abstractmethod + +import numpy as np +import torch + +from ..hook import Hook + + +class LoggerHook(Hook): + """Base class for logger hooks. + + Args: + interval (int): Logging interval (every k iterations). + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + reset_flag (bool): Whether to clear the output buffer after logging. + by_epoch (bool): Whether EpochBasedRunner is used. + """ + + __metaclass__ = ABCMeta + + def __init__(self, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True): + self.interval = interval + self.ignore_last = ignore_last + self.reset_flag = reset_flag + self.by_epoch = by_epoch + + @abstractmethod + def log(self, runner): + pass + + @staticmethod + def is_scalar(val, include_np=True, include_torch=True): + """Tell the input variable is a scalar or not. + + Args: + val: Input variable. + include_np (bool): Whether include 0-d np.ndarray as a scalar. + include_torch (bool): Whether include 0-d torch.Tensor as a scalar. + + Returns: + bool: True or False. 
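+
+        Example (illustrative; note that a 1-element 1-d tensor also counts,
+        since the torch check uses ``len(val) == 1``):
+            >>> LoggerHook.is_scalar(3.5)
+            True
+            >>> LoggerHook.is_scalar(torch.tensor([1.0]))
+            True
+            >>> LoggerHook.is_scalar('acc')
+            False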
+ """ + if isinstance(val, numbers.Number): + return True + elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: + return True + elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: + return True + else: + return False + + def get_mode(self, runner): + if runner.mode == 'train': + if 'time' in runner.log_buffer.output: + mode = 'train' + else: + mode = 'val' + elif runner.mode == 'val': + mode = 'val' + else: + raise ValueError(f"runner mode should be 'train' or 'val', " + f'but got {runner.mode}') + return mode + + def get_epoch(self, runner): + if runner.mode == 'train': + epoch = runner.epoch + 1 + elif runner.mode == 'val': + # normal val mode + # runner.epoch += 1 has been done before val workflow + epoch = runner.epoch + else: + raise ValueError(f"runner mode should be 'train' or 'val', " + f'but got {runner.mode}') + return epoch + + def get_iter(self, runner, inner_iter=False): + """Get the current training iteration step.""" + if self.by_epoch and inner_iter: + current_iter = runner.inner_iter + 1 + else: + current_iter = runner.iter + 1 + return current_iter + + def get_lr_tags(self, runner): + tags = {} + lrs = runner.current_lr() + if isinstance(lrs, dict): + for name, value in lrs.items(): + tags[f'learning_rate/{name}'] = value[0] + else: + tags['learning_rate'] = lrs[0] + return tags + + def get_momentum_tags(self, runner): + tags = {} + momentums = runner.current_momentum() + if isinstance(momentums, dict): + for name, value in momentums.items(): + tags[f'momentum/{name}'] = value[0] + else: + tags['momentum'] = momentums[0] + return tags + + def get_loggable_tags(self, + runner, + allow_scalar=True, + allow_text=False, + add_mode=True, + tags_to_skip=('time', 'data_time')): + tags = {} + for var, val in runner.log_buffer.output.items(): + if var in tags_to_skip: + continue + if self.is_scalar(val) and not allow_scalar: + continue + if isinstance(val, str) and not allow_text: + continue + if add_mode: + var = f'{self.get_mode(runner)}/{var}' + tags[var] = val + tags.update(self.get_lr_tags(runner)) + tags.update(self.get_momentum_tags(runner)) + return tags + + def before_run(self, runner): + for hook in runner.hooks[::-1]: + if isinstance(hook, LoggerHook): + hook.reset_flag = True + break + + def before_epoch(self, runner): + runner.log_buffer.clear() # clear logs of last epoch + + def after_train_iter(self, runner): + if self.by_epoch and self.every_n_inner_iters(runner, self.interval): + runner.log_buffer.average(self.interval) + elif not self.by_epoch and self.every_n_iters(runner, self.interval): + runner.log_buffer.average(self.interval) + elif self.end_of_epoch(runner) and not self.ignore_last: + # not precise but more stable + runner.log_buffer.average(self.interval) + + if runner.log_buffer.ready: + self.log(runner) + if self.reset_flag: + runner.log_buffer.clear_output() + + def after_train_epoch(self, runner): + if runner.log_buffer.ready: + self.log(runner) + if self.reset_flag: + runner.log_buffer.clear_output() + + def after_val_epoch(self, runner): + runner.log_buffer.average() + self.log(runner) + if self.reset_flag: + runner.log_buffer.clear_output() diff --git a/mmcv/runner/hooks/logger/tensorboard.py b/mmcv/runner/hooks/logger/tensorboard.py new file mode 100644 index 0000000..72b6759 --- /dev/null +++ b/mmcv/runner/hooks/logger/tensorboard.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
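+#
+# NOTE: `TensorboardLoggerHook` below is usually enabled from a config rather
+# than instantiated directly, e.g. (illustrative, assuming the usual mmcv
+# `log_config` convention):
+#
+#     log_config = dict(
+#         interval=50,
+#         hooks=[
+#             dict(type='TextLoggerHook'),
+#             dict(type='TensorboardLoggerHook'),
+#         ])
+#
+# Scalars are written to `<work_dir>/tf_logs` unless `log_dir` is given and
+# can be inspected with `tensorboard --logdir <work_dir>/tf_logs`.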
+import os.path as osp + +from mmcv.utils import TORCH_VERSION, digit_version, master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class TensorboardLoggerHook(LoggerHook): + + def __init__(self, + log_dir=None, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True): + super(TensorboardLoggerHook, self).__init__(interval, ignore_last, + reset_flag, by_epoch) + self.log_dir = log_dir + + @master_only + def before_run(self, runner): + super(TensorboardLoggerHook, self).before_run(runner) + if (digit_version(TORCH_VERSION) < digit_version('1.1')): + try: + from tensorboardX import SummaryWriter + except ImportError: + raise ImportError('Please install tensorboardX to use ' + 'TensorboardLoggerHook.') + else: + try: + from torch.utils.tensorboard import SummaryWriter + except ImportError: + raise ImportError( + 'Please run "pip install future tensorboard" to install ' + 'the dependencies to use torch.utils.tensorboard ' + '(applicable to PyTorch 1.1 or higher)') + + if self.log_dir is None: + self.log_dir = osp.join(runner.work_dir, 'tf_logs') + self.writer = SummaryWriter(self.log_dir) + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner, allow_text=True) + for tag, val in tags.items(): + if isinstance(val, str): + self.writer.add_text(tag, val, self.get_iter(runner)) + else: + self.writer.add_scalar(tag, val, self.get_iter(runner)) + + @master_only + def after_run(self, runner): + self.writer.close() diff --git a/mmcv/runner/hooks/logger/text.py b/mmcv/runner/hooks/logger/text.py new file mode 100644 index 0000000..0413cd8 --- /dev/null +++ b/mmcv/runner/hooks/logger/text.py @@ -0,0 +1,256 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import os +import os.path as osp +from collections import OrderedDict + +import torch +import torch.distributed as dist + +from mmcv.fileio.file_client import FileClient +from mmcv.utils import is_tuple_of, scandir +from mmcv.fileio.io import dump +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class TextLoggerHook(LoggerHook): + """Logger hook in text. + + In this logger hook, the information will be printed on terminal and + saved in json file. + + Args: + by_epoch (bool, optional): Whether EpochBasedRunner is used. + Default: True. + interval (int, optional): Logging interval (every k iterations). + Default: 10. + ignore_last (bool, optional): Ignore the log of last iterations in each + epoch if less than :attr:`interval`. Default: True. + reset_flag (bool, optional): Whether to clear the output buffer after + logging. Default: False. + interval_exp_name (int, optional): Logging interval for experiment + name. This feature is to help users conveniently get the experiment + information from screen or log file. Default: 1000. + out_dir (str, optional): Logs are saved in ``runner.work_dir`` default. + If ``out_dir`` is specified, logs will be copied to a new directory + which is the concatenation of ``out_dir`` and the last level + directory of ``runner.work_dir``. Default: None. + `New in version 1.3.16.` + out_suffix (str or tuple[str], optional): Those filenames ending with + ``out_suffix`` will be copied to ``out_dir``. + Default: ('.log.json', '.log', '.py'). + `New in version 1.3.16.` + keep_local (bool, optional): Whether to keep local log when + :attr:`out_dir` is specified. If False, the local log will be + removed. Default: True. 
+ `New in version 1.3.16.` + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + `New in version 1.3.16.` + """ + + def __init__(self, + by_epoch=True, + interval=10, + ignore_last=True, + reset_flag=False, + interval_exp_name=1000, + out_dir=None, + out_suffix=('.log.json', '.log', '.py'), + keep_local=True, + file_client_args=None): + super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, + by_epoch) + self.by_epoch = by_epoch + self.time_sec_tot = 0 + self.interval_exp_name = interval_exp_name + + if out_dir is None and file_client_args is not None: + raise ValueError( + 'file_client_args should be "None" when `out_dir` is not' + 'specified.') + self.out_dir = out_dir + + if not (out_dir is None or isinstance(out_dir, str) + or is_tuple_of(out_dir, str)): + raise TypeError('out_dir should be "None" or string or tuple of ' + 'string, but got {out_dir}') + self.out_suffix = out_suffix + + self.keep_local = keep_local + self.file_client_args = file_client_args + if self.out_dir is not None: + self.file_client = FileClient.infer_client(file_client_args, + self.out_dir) + + def before_run(self, runner): + super(TextLoggerHook, self).before_run(runner) + + if self.out_dir is not None: + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + # The final `self.out_dir` is the concatenation of `self.out_dir` + # and the last level directory of `runner.work_dir` + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + (f'Text logs will be saved to {self.out_dir} by ' + f'{self.file_client.name} after the training process.')) + + self.start_iter = runner.iter + self.json_log_path = osp.join(runner.work_dir, + f'{runner.timestamp}.log.json') + if runner.meta is not None: + self._dump_log(runner.meta, runner) + + def _get_max_memory(self, runner): + device = getattr(runner.model, 'output_device', None) + mem = torch.cuda.max_memory_allocated(device=device) + mem_mb = torch.tensor([mem / (1024 * 1024)], + dtype=torch.int, + device=device) + if runner.world_size > 1: + dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) + return mem_mb.item() + + def _log_info(self, log_dict, runner): + # print exp name for users to distinguish experiments + # at every ``interval_exp_name`` iterations and the end of each epoch + if runner.meta is not None and 'exp_name' in runner.meta: + if (self.every_n_iters(runner, self.interval_exp_name)) or ( + self.by_epoch and self.end_of_epoch(runner)): + exp_info = f'Exp name: {runner.meta["exp_name"]}' + runner.logger.info(exp_info) + + if log_dict['mode'] == 'train': + if isinstance(log_dict['lr'], dict): + lr_str = [] + for k, val in log_dict['lr'].items(): + lr_str.append(f'lr_{k}: {val:.3e}') + lr_str = ' '.join(lr_str) + else: + lr_str = f'lr: {log_dict["lr"]:.3e}' + + # by epoch: Epoch [4][100/1000] + # by iter: Iter [100/100000] + if self.by_epoch: + log_str = f'Epoch [{log_dict["epoch"]}]' \ + f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t' + else: + log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t' + log_str += f'{lr_str}, ' + + if 'time' in log_dict.keys(): + self.time_sec_tot += (log_dict['time'] * self.interval) + time_sec_avg = self.time_sec_tot / ( + runner.iter - self.start_iter + 1) + eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) + eta_str = str(datetime.timedelta(seconds=int(eta_sec))) + log_str += f'eta: 
{eta_str}, ' + log_str += f'time: {log_dict["time"]:.3f}, ' \ + f'data_time: {log_dict["data_time"]:.3f}, ' + # statistic memory + if torch.cuda.is_available(): + log_str += f'memory: {log_dict["memory"]}, ' + else: + # val/test time + # here 1000 is the length of the val dataloader + # by epoch: Epoch[val] [4][1000] + # by iter: Iter[val] [1000] + if self.by_epoch: + log_str = f'Epoch({log_dict["mode"]}) ' \ + f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t' + else: + log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' + + log_items = [] + for name, val in log_dict.items(): + # TODO: resolve this hack + # these items have been in log_str + if name in [ + 'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time', + 'memory', 'epoch' + ]: + continue + if isinstance(val, float): + val = f'{val:.4f}' + log_items.append(f'{name}: {val}') + log_str += ', '.join(log_items) + + runner.logger.info(log_str) + + def _dump_log(self, log_dict, runner): + # dump log in json format + json_log = OrderedDict() + for k, v in log_dict.items(): + json_log[k] = self._round_float(v) + # only append log at last line + if runner.rank == 0: + with open(self.json_log_path, 'a+') as f: + dump(json_log, f, file_format='json') + f.write('\n') + + def _round_float(self, items): + if isinstance(items, list): + return [self._round_float(item) for item in items] + elif isinstance(items, float): + return round(items, 5) + else: + return items + + def log(self, runner): + if 'eval_iter_num' in runner.log_buffer.output: + # this doesn't modify runner.iter and is regardless of by_epoch + cur_iter = runner.log_buffer.output.pop('eval_iter_num') + else: + cur_iter = self.get_iter(runner, inner_iter=True) + + log_dict = OrderedDict( + mode=self.get_mode(runner), + epoch=self.get_epoch(runner), + iter=cur_iter) + + # only record lr of the first param group + cur_lr = runner.current_lr() + if isinstance(cur_lr, list): + log_dict['lr'] = cur_lr[0] + else: + assert isinstance(cur_lr, dict) + log_dict['lr'] = {} + for k, lr_ in cur_lr.items(): + assert isinstance(lr_, list) + log_dict['lr'].update({k: lr_[0]}) + + if 'time' in runner.log_buffer.output: + # statistic memory + if torch.cuda.is_available(): + log_dict['memory'] = self._get_max_memory(runner) + + log_dict = dict(log_dict, **runner.log_buffer.output) + + self._log_info(log_dict, runner) + self._dump_log(log_dict, runner) + return log_dict + + def after_run(self, runner): + # copy or upload logs to self.out_dir + if self.out_dir is not None: + for filename in scandir(runner.work_dir, self.out_suffix, True): + local_filepath = osp.join(runner.work_dir, filename) + out_filepath = self.file_client.join_path( + self.out_dir, filename) + with open(local_filepath, 'r') as f: + self.file_client.put_text(f.read(), out_filepath) + + runner.logger.info( + (f'The file {local_filepath} has been uploaded to ' + f'{out_filepath}.')) + + if not self.keep_local: + os.remove(local_filepath) + runner.logger.info( + (f'{local_filepath} was removed due to the ' + '`self.keep_local=False`')) diff --git a/mmcv/runner/hooks/lr_updater.py b/mmcv/runner/hooks/lr_updater.py new file mode 100644 index 0000000..a750548 --- /dev/null +++ b/mmcv/runner/hooks/lr_updater.py @@ -0,0 +1,670 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers +from math import cos, pi + +from mmcv.utils import is_list_of +from .hook import HOOKS, Hook + + +class LrUpdaterHook(Hook): + """LR Scheduler in MMCV. + + Args: + by_epoch (bool): LR changes epoch by epoch + warmup (string): Type of warmup used. 
It can be None(use no warmup), + 'constant', 'linear' or 'exp' + warmup_iters (int): The number of iterations or epochs that warmup + lasts + warmup_ratio (float): LR used at the beginning of warmup equals to + warmup_ratio * initial_lr + warmup_by_epoch (bool): When warmup_by_epoch == True, warmup_iters + means the number of epochs that warmup lasts, otherwise means the + number of iteration that warmup lasts + """ + + def __init__(self, + by_epoch=True, + warmup=None, + warmup_iters=0, + warmup_ratio=0.1, + warmup_by_epoch=False): + # validate the "warmup" argument + if warmup is not None: + if warmup not in ['constant', 'linear', 'exp']: + raise ValueError( + f'"{warmup}" is not a supported type for warming up, valid' + ' types are "constant" and "linear"') + if warmup is not None: + assert warmup_iters > 0, \ + '"warmup_iters" must be a positive integer' + assert 0 < warmup_ratio <= 1.0, \ + '"warmup_ratio" must be in range (0,1]' + + self.by_epoch = by_epoch + self.warmup = warmup + self.warmup_iters = warmup_iters + self.warmup_ratio = warmup_ratio + self.warmup_by_epoch = warmup_by_epoch + + if self.warmup_by_epoch: + self.warmup_epochs = self.warmup_iters + self.warmup_iters = None + else: + self.warmup_epochs = None + + self.base_lr = [] # initial lr for all param groups + self.regular_lr = [] # expected lr if no warming up is performed + + def _set_lr(self, runner, lr_groups): + if isinstance(runner.optimizer, dict): + for k, optim in runner.optimizer.items(): + for param_group, lr in zip(optim.param_groups, lr_groups[k]): + param_group['lr'] = lr + else: + for param_group, lr in zip(runner.optimizer.param_groups, + lr_groups): + param_group['lr'] = lr + + def get_lr(self, runner, base_lr): + raise NotImplementedError + + def get_regular_lr(self, runner): + if isinstance(runner.optimizer, dict): + lr_groups = {} + for k in runner.optimizer.keys(): + _lr_group = [ + self.get_lr(runner, _base_lr) + for _base_lr in self.base_lr[k] + ] + lr_groups.update({k: _lr_group}) + + return lr_groups + else: + return [self.get_lr(runner, _base_lr) for _base_lr in self.base_lr] + + def get_warmup_lr(self, cur_iters): + + def _get_warmup_lr(cur_iters, regular_lr): + if self.warmup == 'constant': + warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr] + elif self.warmup == 'linear': + k = (1 - cur_iters / self.warmup_iters) * (1 - + self.warmup_ratio) + warmup_lr = [_lr * (1 - k) for _lr in regular_lr] + elif self.warmup == 'exp': + k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) + warmup_lr = [_lr * k for _lr in regular_lr] + return warmup_lr + + if isinstance(self.regular_lr, dict): + lr_groups = {} + for key, regular_lr in self.regular_lr.items(): + lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr) + return lr_groups + else: + return _get_warmup_lr(cur_iters, self.regular_lr) + + def before_run(self, runner): + # NOTE: when resuming from a checkpoint, if 'initial_lr' is not saved, + # it will be set according to the optimizer params + if isinstance(runner.optimizer, dict): + self.base_lr = {} + for k, optim in runner.optimizer.items(): + for group in optim.param_groups: + group.setdefault('initial_lr', group['lr']) + _base_lr = [ + group['initial_lr'] for group in optim.param_groups + ] + self.base_lr.update({k: _base_lr}) + else: + for group in runner.optimizer.param_groups: + group.setdefault('initial_lr', group['lr']) + self.base_lr = [ + group['initial_lr'] for group in runner.optimizer.param_groups + ] + + def before_train_epoch(self, runner): + if self.warmup_iters 
is None: + epoch_len = len(runner.data_loader) + self.warmup_iters = self.warmup_epochs * epoch_len + + if not self.by_epoch: + return + + self.regular_lr = self.get_regular_lr(runner) + self._set_lr(runner, self.regular_lr) + + def before_train_iter(self, runner): + cur_iter = runner.iter + if not self.by_epoch: + self.regular_lr = self.get_regular_lr(runner) + if self.warmup is None or cur_iter >= self.warmup_iters: + self._set_lr(runner, self.regular_lr) + else: + warmup_lr = self.get_warmup_lr(cur_iter) + self._set_lr(runner, warmup_lr) + elif self.by_epoch: + if self.warmup is None or cur_iter > self.warmup_iters: + return + elif cur_iter == self.warmup_iters: + self._set_lr(runner, self.regular_lr) + else: + warmup_lr = self.get_warmup_lr(cur_iter) + self._set_lr(runner, warmup_lr) + + +@HOOKS.register_module() +class FixedLrUpdaterHook(LrUpdaterHook): + + def __init__(self, **kwargs): + super(FixedLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + return base_lr + + +@HOOKS.register_module() +class StepLrUpdaterHook(LrUpdaterHook): + """Step LR scheduler with min_lr clipping. + + Args: + step (int | list[int]): Step to decay the LR. If an int value is given, + regard it as the decay interval. If a list is given, decay LR at + these steps. + gamma (float, optional): Decay LR ratio. Default: 0.1. + min_lr (float, optional): Minimum LR value to keep. If LR after decay + is lower than `min_lr`, it will be clipped to this value. If None + is given, we don't perform lr clipping. Default: None. + """ + + def __init__(self, step, gamma=0.1, min_lr=None, **kwargs): + if isinstance(step, list): + assert is_list_of(step, int) + assert all([s > 0 for s in step]) + elif isinstance(step, int): + assert step > 0 + else: + raise TypeError('"step" must be a list or integer') + self.step = step + self.gamma = gamma + self.min_lr = min_lr + super(StepLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + progress = runner.epoch if self.by_epoch else runner.iter + + # calculate exponential term + if isinstance(self.step, int): + exp = progress // self.step + else: + exp = len(self.step) + for i, s in enumerate(self.step): + if progress < s: + exp = i + break + + lr = base_lr * (self.gamma**exp) + if self.min_lr is not None: + # clip to a minimum value + lr = max(lr, self.min_lr) + return lr + + +@HOOKS.register_module() +class ExpLrUpdaterHook(LrUpdaterHook): + + def __init__(self, gamma, **kwargs): + self.gamma = gamma + super(ExpLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + progress = runner.epoch if self.by_epoch else runner.iter + return base_lr * self.gamma**progress + + +@HOOKS.register_module() +class PolyLrUpdaterHook(LrUpdaterHook): + + def __init__(self, power=1., min_lr=0., **kwargs): + self.power = power + self.min_lr = min_lr + super(PolyLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + if self.by_epoch: + progress = runner.epoch + max_progress = runner.max_epochs + else: + progress = runner.iter + max_progress = runner.max_iters + coeff = (1 - progress / max_progress)**self.power + return (base_lr - self.min_lr) * coeff + self.min_lr + + +@HOOKS.register_module() +class InvLrUpdaterHook(LrUpdaterHook): + + def __init__(self, gamma, power=1., **kwargs): + self.gamma = gamma + self.power = power + super(InvLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + progress = runner.epoch if self.by_epoch else runner.iter + return base_lr * (1 + 
self.gamma * progress)**(-self.power) + + +@HOOKS.register_module() +class CosineAnnealingLrUpdaterHook(LrUpdaterHook): + + def __init__(self, min_lr=None, min_lr_ratio=None, **kwargs): + assert (min_lr is None) ^ (min_lr_ratio is None) + self.min_lr = min_lr + self.min_lr_ratio = min_lr_ratio + super(CosineAnnealingLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + if self.by_epoch: + progress = runner.epoch + max_progress = runner.max_epochs + else: + progress = runner.iter + max_progress = runner.max_iters + + if self.min_lr_ratio is not None: + target_lr = base_lr * self.min_lr_ratio + else: + target_lr = self.min_lr + return annealing_cos(base_lr, target_lr, progress / max_progress) + + +@HOOKS.register_module() +class FlatCosineAnnealingLrUpdaterHook(LrUpdaterHook): + """Flat + Cosine lr schedule. + + Modified from https://github.com/fastai/fastai/blob/master/fastai/callback/schedule.py#L128 # noqa: E501 + + Args: + start_percent (float): When to start annealing the learning rate + after the percentage of the total training steps. + The value should be in range [0, 1). + Default: 0.75 + min_lr (float, optional): The minimum lr. Default: None. + min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. + Either `min_lr` or `min_lr_ratio` should be specified. + Default: None. + """ + + def __init__(self, + start_percent=0.75, + min_lr=None, + min_lr_ratio=None, + **kwargs): + assert (min_lr is None) ^ (min_lr_ratio is None) + if start_percent < 0 or start_percent > 1 or not isinstance( + start_percent, float): + raise ValueError( + 'expected float between 0 and 1 start_percent, but ' + f'got {start_percent}') + self.start_percent = start_percent + self.min_lr = min_lr + self.min_lr_ratio = min_lr_ratio + super(FlatCosineAnnealingLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + if self.by_epoch: + start = round(runner.max_epochs * self.start_percent) + progress = runner.epoch - start + max_progress = runner.max_epochs - start + else: + start = round(runner.max_iters * self.start_percent) + progress = runner.iter - start + max_progress = runner.max_iters - start + + if self.min_lr_ratio is not None: + target_lr = base_lr * self.min_lr_ratio + else: + target_lr = self.min_lr + + if progress < 0: + return base_lr + else: + return annealing_cos(base_lr, target_lr, progress / max_progress) + + +@HOOKS.register_module() +class CosineRestartLrUpdaterHook(LrUpdaterHook): + """Cosine annealing with restarts learning rate scheme. + + Args: + periods (list[int]): Periods for each cosine anneling cycle. + restart_weights (list[float], optional): Restart weights at each + restart iteration. Default: [1]. + min_lr (float, optional): The minimum lr. Default: None. + min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. + Either `min_lr` or `min_lr_ratio` should be specified. + Default: None. + """ + + def __init__(self, + periods, + restart_weights=[1], + min_lr=None, + min_lr_ratio=None, + **kwargs): + assert (min_lr is None) ^ (min_lr_ratio is None) + self.periods = periods + self.min_lr = min_lr + self.min_lr_ratio = min_lr_ratio + self.restart_weights = restart_weights + assert (len(self.periods) == len(self.restart_weights) + ), 'periods and restart_weights should have the same length.' 
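+        # Illustrative example: periods=[10000, 20000] with
+        # restart_weights=[1, 0.5] runs a full-strength cosine cycle for the
+        # first 10000 iters and then restarts at roughly half the base lr for
+        # the next 20000 iters; `cumulative_periods` below becomes
+        # [10000, 30000].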
+ super(CosineRestartLrUpdaterHook, self).__init__(**kwargs) + + self.cumulative_periods = [ + sum(self.periods[0:i + 1]) for i in range(0, len(self.periods)) + ] + + def get_lr(self, runner, base_lr): + if self.by_epoch: + progress = runner.epoch + else: + progress = runner.iter + + if self.min_lr_ratio is not None: + target_lr = base_lr * self.min_lr_ratio + else: + target_lr = self.min_lr + + idx = get_position_from_periods(progress, self.cumulative_periods) + current_weight = self.restart_weights[idx] + nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1] + current_periods = self.periods[idx] + + alpha = min((progress - nearest_restart) / current_periods, 1) + return annealing_cos(base_lr, target_lr, alpha, current_weight) + + +def get_position_from_periods(iteration, cumulative_periods): + """Get the position from a period list. + + It will return the index of the right-closest number in the period list. + For example, the cumulative_periods = [100, 200, 300, 400], + if iteration == 50, return 0; + if iteration == 210, return 2; + if iteration == 300, return 3. + + Args: + iteration (int): Current iteration. + cumulative_periods (list[int]): Cumulative period list. + + Returns: + int: The position of the right-closest number in the period list. + """ + for i, period in enumerate(cumulative_periods): + if iteration < period: + return i + raise ValueError(f'Current iteration {iteration} exceeds ' + f'cumulative_periods {cumulative_periods}') + + +@HOOKS.register_module() +class CyclicLrUpdaterHook(LrUpdaterHook): + """Cyclic LR Scheduler. + + Implement the cyclical learning rate policy (CLR) described in + https://arxiv.org/pdf/1506.01186.pdf + + Different from the original paper, we use cosine annealing rather than + triangular policy inside a cycle. This improves the performance in the + 3D detection area. + + Args: + by_epoch (bool): Whether to update LR by epoch. + target_ratio (tuple[float]): Relative ratio of the highest LR and the + lowest LR to the initial LR. + cyclic_times (int): Number of cycles during training + step_ratio_up (float): The ratio of the increasing process of LR in + the total cycle. + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: 'cos' for cosine annealing, + 'linear' for linear annealing. Default: 'cos'. 
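+
+    Example (illustrative config snippet, assuming the usual mmcv
+    ``lr_config``/``policy`` convention is kept in this merged copy):
+        >>> lr_config = dict(
+        ...     policy='cyclic',
+        ...     target_ratio=(10, 1e-4),
+        ...     cyclic_times=1,
+        ...     step_ratio_up=0.4)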
+ """ + + def __init__(self, + by_epoch=False, + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, + anneal_strategy='cos', + **kwargs): + if isinstance(target_ratio, float): + target_ratio = (target_ratio, target_ratio / 1e5) + elif isinstance(target_ratio, tuple): + target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ + if len(target_ratio) == 1 else target_ratio + else: + raise ValueError('target_ratio should be either float ' + f'or tuple, got {type(target_ratio)}') + + assert len(target_ratio) == 2, \ + '"target_ratio" must be list or tuple of two floats' + assert 0 <= step_ratio_up < 1.0, \ + '"step_ratio_up" must be in range [0,1)' + + self.target_ratio = target_ratio + self.cyclic_times = cyclic_times + self.step_ratio_up = step_ratio_up + self.lr_phases = [] # init lr_phases + # validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError('anneal_strategy must be one of "cos" or ' + f'"linear", instead got {anneal_strategy}') + elif anneal_strategy == 'cos': + self.anneal_func = annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = annealing_linear + + assert not by_epoch, \ + 'currently only support "by_epoch" = False' + super(CyclicLrUpdaterHook, self).__init__(by_epoch, **kwargs) + + def before_run(self, runner): + super(CyclicLrUpdaterHook, self).before_run(runner) + # initiate lr_phases + # total lr_phases are separated as up and down + max_iter_per_phase = runner.max_iters // self.cyclic_times + iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) + self.lr_phases.append( + [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) + self.lr_phases.append([ + iter_up_phase, max_iter_per_phase, max_iter_per_phase, + self.target_ratio[0], self.target_ratio[1] + ]) + + def get_lr(self, runner, base_lr): + curr_iter = runner.iter + for (start_iter, end_iter, max_iter_per_phase, start_ratio, + end_ratio) in self.lr_phases: + curr_iter %= max_iter_per_phase + if start_iter <= curr_iter < end_iter: + progress = curr_iter - start_iter + return self.anneal_func(base_lr * start_ratio, + base_lr * end_ratio, + progress / (end_iter - start_iter)) + + +@HOOKS.register_module() +class OneCycleLrUpdaterHook(LrUpdaterHook): + """One Cycle LR Scheduler. + + The 1cycle learning rate policy changes the learning rate after every + batch. The one cycle learning rate policy is described in + https://arxiv.org/pdf/1708.07120.pdf + + Args: + max_lr (float or list): Upper learning rate boundaries in the cycle + for each parameter group. + total_steps (int, optional): The total number of steps in the cycle. + Note that if a value is not provided here, it will be the max_iter + of runner. Default: None. + pct_start (float): The percentage of the cycle (in number of steps) + spent increasing the learning rate. + Default: 0.3 + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: 'cos' for cosine annealing, + 'linear' for linear annealing. + Default: 'cos' + div_factor (float): Determines the initial learning rate via + initial_lr = max_lr/div_factor + Default: 25 + final_div_factor (float): Determines the minimum learning rate via + min_lr = initial_lr/final_div_factor + Default: 1e4 + three_phase (bool): If three_phase is True, use a third phase of the + schedule to annihilate the learning rate according to + final_div_factor instead of modifying the second phase (the first + two phases will be symmetrical about the step indicated by + pct_start). 
+ Default: False + """ + + def __init__(self, + max_lr, + total_steps=None, + pct_start=0.3, + anneal_strategy='cos', + div_factor=25, + final_div_factor=1e4, + three_phase=False, + **kwargs): + # validate by_epoch, currently only support by_epoch = False + if 'by_epoch' not in kwargs: + kwargs['by_epoch'] = False + else: + assert not kwargs['by_epoch'], \ + 'currently only support "by_epoch" = False' + if not isinstance(max_lr, (numbers.Number, list, dict)): + raise ValueError('the type of max_lr must be the one of list or ' + f'dict, but got {type(max_lr)}') + self._max_lr = max_lr + if total_steps is not None: + if not isinstance(total_steps, int): + raise ValueError('the type of total_steps must be int, but' + f'got {type(total_steps)}') + self.total_steps = total_steps + # validate pct_start + if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): + raise ValueError('expected float between 0 and 1 pct_start, but ' + f'got {pct_start}') + self.pct_start = pct_start + # validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError('anneal_strategy must be one of "cos" or ' + f'"linear", instead got {anneal_strategy}') + elif anneal_strategy == 'cos': + self.anneal_func = annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = annealing_linear + self.div_factor = div_factor + self.final_div_factor = final_div_factor + self.three_phase = three_phase + self.lr_phases = [] # init lr_phases + super(OneCycleLrUpdaterHook, self).__init__(**kwargs) + + def before_run(self, runner): + if hasattr(self, 'total_steps'): + total_steps = self.total_steps + else: + total_steps = runner.max_iters + if total_steps < runner.max_iters: + raise ValueError( + 'The total steps must be greater than or equal to max ' + f'iterations {runner.max_iters} of runner, but total steps ' + f'is {total_steps}.') + + if isinstance(runner.optimizer, dict): + self.base_lr = {} + for k, optim in runner.optimizer.items(): + _max_lr = format_param(k, optim, self._max_lr) + self.base_lr[k] = [lr / self.div_factor for lr in _max_lr] + for group, lr in zip(optim.param_groups, self.base_lr[k]): + group.setdefault('initial_lr', lr) + else: + k = type(runner.optimizer).__name__ + _max_lr = format_param(k, runner.optimizer, self._max_lr) + self.base_lr = [lr / self.div_factor for lr in _max_lr] + for group, lr in zip(runner.optimizer.param_groups, self.base_lr): + group.setdefault('initial_lr', lr) + + if self.three_phase: + self.lr_phases.append( + [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) + self.lr_phases.append([ + float(2 * self.pct_start * total_steps) - 2, self.div_factor, 1 + ]) + self.lr_phases.append( + [total_steps - 1, 1, 1 / self.final_div_factor]) + else: + self.lr_phases.append( + [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) + self.lr_phases.append( + [total_steps - 1, self.div_factor, 1 / self.final_div_factor]) + + def get_lr(self, runner, base_lr): + curr_iter = runner.iter + start_iter = 0 + for i, (end_iter, start_lr, end_lr) in enumerate(self.lr_phases): + if curr_iter <= end_iter: + pct = (curr_iter - start_iter) / (end_iter - start_iter) + lr = self.anneal_func(base_lr * start_lr, base_lr * end_lr, + pct) + break + start_iter = end_iter + return lr + + +def annealing_cos(start, end, factor, weight=1): + """Calculate annealing cos learning rate. + + Cosine anneal from `weight * start + (1 - weight) * end` to `end` as + percentage goes from 0.0 to 1.0. 
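+    Concretely, the returned value is
+    ``end + 0.5 * weight * (start - end) * (cos(pi * factor) + 1)``.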
+ + Args: + start (float): The starting learning rate of the cosine annealing. + end (float): The ending learing rate of the cosine annealing. + factor (float): The coefficient of `pi` when calculating the current + percentage. Range from 0.0 to 1.0. + weight (float, optional): The combination factor of `start` and `end` + when calculating the actual starting learning rate. Default to 1. + """ + cos_out = cos(pi * factor) + 1 + return end + 0.5 * weight * (start - end) * cos_out + + +def annealing_linear(start, end, factor): + """Calculate annealing linear learning rate. + + Linear anneal from `start` to `end` as percentage goes from 0.0 to 1.0. + + Args: + start (float): The starting learning rate of the linear annealing. + end (float): The ending learing rate of the linear annealing. + factor (float): The coefficient of `pi` when calculating the current + percentage. Range from 0.0 to 1.0. + """ + return start + (end - start) * factor + + +def format_param(name, optim, param): + if isinstance(param, numbers.Number): + return [param] * len(optim.param_groups) + elif isinstance(param, (list, tuple)): # multi param groups + if len(param) != len(optim.param_groups): + raise ValueError(f'expected {len(optim.param_groups)} ' + f'values for {name}, got {len(param)}') + return param + else: # multi optimizers + if name not in param: + raise KeyError(f'{name} is not found in {param.keys()}') + return param[name] diff --git a/mmcv/runner/hooks/optimizer.py b/mmcv/runner/hooks/optimizer.py new file mode 100644 index 0000000..af1aa00 --- /dev/null +++ b/mmcv/runner/hooks/optimizer.py @@ -0,0 +1,506 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import defaultdict +from itertools import chain + +from torch.nn.utils import clip_grad + +from torch.nn.modules.batchnorm import _BatchNorm +from mmcv.utils import LossScaler, wrap_fp16_model, TORCH_VERSION, digit_version, allreduce_grads +from .hook import HOOKS, Hook + +try: + # If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported + # and used; otherwise, auto fp16 will adopt mmcv's implementation. + from torch.cuda.amp import GradScaler +except ImportError: + pass + + +@HOOKS.register_module() +class OptimizerHook(Hook): + + def __init__(self, grad_clip=None): + self.grad_clip = grad_clip + + def clip_grads(self, params): + params = list( + filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + return clip_grad.clip_grad_norm_(params, **self.grad_clip) + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + runner.outputs['loss'].backward() + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + runner.optimizer.step() + + +@HOOKS.register_module() +class GradientCumulativeOptimizerHook(OptimizerHook): + """Optimizer Hook implements multi-iters gradient cumulating. + + Args: + cumulative_iters (int, optional): Num of gradient cumulative iters. + The optimizer will step every `cumulative_iters` iters. + Defaults to 1. + + Examples: + >>> # Use cumulative_iters to simulate a large batch size + >>> # It is helpful when the hardware cannot handle a large batch size. 
+ >>> loader = DataLoader(data, batch_size=64) + >>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4) + >>> # almost equals to + >>> loader = DataLoader(data, batch_size=256) + >>> optim_hook = OptimizerHook() + """ + + def __init__(self, cumulative_iters=1, **kwargs): + super(GradientCumulativeOptimizerHook, self).__init__(**kwargs) + + assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \ + f'cumulative_iters only accepts positive int, but got ' \ + f'{type(cumulative_iters)} instead.' + + self.cumulative_iters = cumulative_iters + self.divisible_iters = 0 + self.remainder_iters = 0 + self.initialized = False + + def has_batch_norm(self, module): + if isinstance(module, _BatchNorm): + return True + for m in module.children(): + if self.has_batch_norm(m): + return True + return False + + def _init(self, runner): + if runner.iter % self.cumulative_iters != 0: + runner.logger.warning( + 'Resume iter number is not divisible by cumulative_iters in ' + 'GradientCumulativeOptimizerHook, which means the gradient of ' + 'some iters is lost and the result may be influenced slightly.' + ) + + if self.has_batch_norm(runner.model) and self.cumulative_iters > 1: + runner.logger.warning( + 'GradientCumulativeOptimizerHook may slightly decrease ' + 'performance if the model has BatchNorm layers.') + + residual_iters = runner.max_iters - runner.iter + + self.divisible_iters = ( + residual_iters // self.cumulative_iters * self.cumulative_iters) + self.remainder_iters = residual_iters - self.divisible_iters + + self.initialized = True + + def after_train_iter(self, runner): + if not self.initialized: + self._init(runner) + + if runner.iter < self.divisible_iters: + loss_factor = self.cumulative_iters + else: + loss_factor = self.remainder_iters + loss = runner.outputs['loss'] + loss = loss / loss_factor + loss.backward() + + if (self.every_n_iters(runner, self.cumulative_iters) + or self.is_last_iter(runner)): + + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + runner.optimizer.step() + runner.optimizer.zero_grad() + + +if (digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + + @HOOKS.register_module() + class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook (using PyTorch's implementation). + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, + to take care of the optimization procedure. + + Args: + loss_scale (float | str | dict): Scale factor configuration. + If loss_scale is a float, static loss scaling will be used with + the specified scale. If loss_scale is a string, it must be + 'dynamic', then dynamic loss scaling will be used. + It can also be a dict containing arguments of GradScalar. + Defaults to 512. For Pytorch >= 1.6, mmcv uses official + implementation of GradScaler. If you use a dict version of + loss_scale to create GradScaler, please refer to: + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler + for the parameters. + + Examples: + >>> loss_scale = dict( + ... init_scale=65536.0, + ... growth_factor=2.0, + ... backoff_factor=0.5, + ... growth_interval=2000 + ... 
) + >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.distributed = distributed + self._scale_update_param = None + if loss_scale == 'dynamic': + self.loss_scaler = GradScaler() + elif isinstance(loss_scale, float): + self._scale_update_param = loss_scale + self.loss_scaler = GradScaler(init_scale=loss_scale) + elif isinstance(loss_scale, dict): + self.loss_scaler = GradScaler(**loss_scale) + else: + raise ValueError('loss_scale must be of type float, dict, or ' + f'"dynamic", got {loss_scale}') + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training.""" + # wrap model mode to fp16 + wrap_fp16_model(runner.model) + # resume from state dict + if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: + scaler_state_dict = runner.meta['fp16']['loss_scaler'] + self.loss_scaler.load_state_dict(scaler_state_dict) + + def copy_grads_to_fp32(self, fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, + fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new( + fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + def copy_params_to_fp16(self, fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), + fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. For + dynamic loss scaling, please refer to + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler. + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients. + 3. Unscale the optimizer’s gradient tensors. + 4. Call optimizer.step() and update scale factor. + 5. Save loss_scaler state_dict for resume purpose. + """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + + self.loss_scaler.scale(runner.outputs['loss']).backward() + self.loss_scaler.unscale_(runner.optimizer) + # grad clip + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + # backward and update scaler + self.loss_scaler.step(runner.optimizer) + self.loss_scaler.update(self._scale_update_param) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + @HOOKS.register_module() + class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, + Fp16OptimizerHook): + """Fp16 optimizer Hook (using PyTorch's implementation) implements + multi-iters gradient cumulating. + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, + to take care of the optimization procedure. 
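+
+        Example (illustrative; assumes the usual mmcv ``optimizer_config``
+        convention where a ``type`` key selects a registered optimizer hook):
+            >>> optimizer_config = dict(
+            ...     type='GradientCumulativeFp16OptimizerHook',
+            ...     loss_scale=512.,
+            ...     cumulative_iters=4,
+            ...     grad_clip=dict(max_norm=35, norm_type=2))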
+ """ + + def __init__(self, *args, **kwargs): + super(GradientCumulativeFp16OptimizerHook, + self).__init__(*args, **kwargs) + + def after_train_iter(self, runner): + if not self.initialized: + self._init(runner) + + if runner.iter < self.divisible_iters: + loss_factor = self.cumulative_iters + else: + loss_factor = self.remainder_iters + loss = runner.outputs['loss'] + loss = loss / loss_factor + + self.loss_scaler.scale(loss).backward() + + if (self.every_n_iters(runner, self.cumulative_iters) + or self.is_last_iter(runner)): + + # copy fp16 grads in the model to fp32 params in the optimizer + self.loss_scaler.unscale_(runner.optimizer) + + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update( + {'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + + # backward and update scaler + self.loss_scaler.step(runner.optimizer) + self.loss_scaler.update(self._scale_update_param) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + # clear grads + runner.model.zero_grad() + runner.optimizer.zero_grad() + +else: + + @HOOKS.register_module() + class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook (mmcv's implementation). + + The steps of fp16 optimizer is as follows. + 1. Scale the loss value. + 2. BP in the fp16 model. + 2. Copy gradients from fp16 model to fp32 weights. + 3. Update fp32 weights. + 4. Copy updated parameters from fp32 weights to fp16 model. + + Refer to https://arxiv.org/abs/1710.03740 for more details. + + Args: + loss_scale (float | str | dict): Scale factor configuration. + If loss_scale is a float, static loss scaling will be used with + the specified scale. If loss_scale is a string, it must be + 'dynamic', then dynamic loss scaling will be used. + It can also be a dict containing arguments of LossScaler. + Defaults to 512. + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.distributed = distributed + if loss_scale == 'dynamic': + self.loss_scaler = LossScaler(mode='dynamic') + elif isinstance(loss_scale, float): + self.loss_scaler = LossScaler( + init_scale=loss_scale, mode='static') + elif isinstance(loss_scale, dict): + self.loss_scaler = LossScaler(**loss_scale) + else: + raise ValueError('loss_scale must be of type float, dict, or ' + f'"dynamic", got {loss_scale}') + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training. + + 1. Make a master copy of fp32 weights for optimization. + 2. Convert the main model from fp32 to fp16. 
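+
+            A master copy of the weights is kept in fp32 so that small
+            updates are not lost to fp16 rounding when the optimizer steps;
+            gradients are copied into this fp32 copy, unscaled, applied
+            there, and then copied back into the fp16 model.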
+ """ + # keep a copy of fp32 weights + old_groups = runner.optimizer.param_groups + runner.optimizer.param_groups = copy.deepcopy( + runner.optimizer.param_groups) + state = defaultdict(dict) + p_map = { + old_p: p + for old_p, p in zip( + chain(*(g['params'] for g in old_groups)), + chain(*(g['params'] + for g in runner.optimizer.param_groups))) + } + for k, v in runner.optimizer.state.items(): + state[p_map[k]] = v + runner.optimizer.state = state + # convert model to fp16 + wrap_fp16_model(runner.model) + # resume from state dict + if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: + scaler_state_dict = runner.meta['fp16']['loss_scaler'] + self.loss_scaler.load_state_dict(scaler_state_dict) + + def copy_grads_to_fp32(self, fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, + fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new( + fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + def copy_params_to_fp16(self, fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), + fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. For + dynamic loss scaling, please refer `loss_scalar.py` + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients (fp16). + 3. Copy gradients from the model to the fp32 weight copy. + 4. Scale the gradients back and update the fp32 weight copy. + 5. Copy back the params from fp32 weight copy to the fp16 model. + 6. Save loss_scaler state_dict for resume purpose. 
+ """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + # scale the loss value + scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale + scaled_loss.backward() + # copy fp16 grads in the model to fp32 params in the optimizer + + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, + self.bucket_size_mb) + + has_overflow = self.loss_scaler.has_overflow(fp32_weights) + # if has overflow, skip this iteration + if not has_overflow: + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scaler.loss_scale) + if self.grad_clip is not None: + grad_norm = self.clip_grads(fp32_weights) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update( + {'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + self.loss_scaler.update_scale(has_overflow) + if has_overflow: + runner.logger.warning('Check overflow, downscale loss scale ' + f'to {self.loss_scaler.cur_scale}') + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + @HOOKS.register_module() + class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, + Fp16OptimizerHook): + """Fp16 optimizer Hook (using mmcv implementation) implements multi- + iters gradient cumulating.""" + + def __init__(self, *args, **kwargs): + super(GradientCumulativeFp16OptimizerHook, + self).__init__(*args, **kwargs) + + def after_train_iter(self, runner): + if not self.initialized: + self._init(runner) + + if runner.iter < self.divisible_iters: + loss_factor = self.cumulative_iters + else: + loss_factor = self.remainder_iters + + loss = runner.outputs['loss'] + loss = loss / loss_factor + + # scale the loss value + scaled_loss = loss * self.loss_scaler.loss_scale + scaled_loss.backward() + + if (self.every_n_iters(runner, self.cumulative_iters) + or self.is_last_iter(runner)): + + # copy fp16 grads in the model to fp32 params in the optimizer + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, + self.bucket_size_mb) + + has_overflow = self.loss_scaler.has_overflow(fp32_weights) + # if has overflow, skip this iteration + if not has_overflow: + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scaler.loss_scale) + if self.grad_clip is not None: + grad_norm = self.clip_grads(fp32_weights) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update( + {'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + else: + runner.logger.warning( + 'Check overflow, downscale loss scale ' + f'to {self.loss_scaler.cur_scale}') + + self.loss_scaler.update_scale(has_overflow) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = 
self.loss_scaler.state_dict() + + # clear grads + runner.model.zero_grad() + runner.optimizer.zero_grad() diff --git a/mmcv/runner/hooks/sampler_seed.py b/mmcv/runner/hooks/sampler_seed.py new file mode 100644 index 0000000..ee0dc6b --- /dev/null +++ b/mmcv/runner/hooks/sampler_seed.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class DistSamplerSeedHook(Hook): + """Data-loading sampler for distributed training. + + When distributed training, it is only useful in conjunction with + :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same + purpose with :obj:`IterLoader`. + """ + + def before_epoch(self, runner): + if hasattr(runner.data_loader.sampler, 'set_epoch'): + # in case the data loader uses `SequentialSampler` in Pytorch + runner.data_loader.sampler.set_epoch(runner.epoch) + elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): + # batch sampler in pytorch warps the sampler as its attributes. + runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) diff --git a/mmcv/runner/hooks/vad_hooks.py b/mmcv/runner/hooks/vad_hooks.py new file mode 100644 index 0000000..56a4b05 --- /dev/null +++ b/mmcv/runner/hooks/vad_hooks.py @@ -0,0 +1,17 @@ +from mmcv.runner.hooks.hook import HOOKS, Hook +from mmcv.parallel import is_module_wrapper + + + + +@HOOKS.register_module() +class CustomSetEpochInfoHook(Hook): + """Set runner's epoch information to the model.""" + + def before_train_epoch(self, runner): + epoch = runner.epoch + model = runner.model + if is_module_wrapper(model): + model = model.module + model.set_epoch(epoch) + diff --git a/mmcv/structures/__init__.py b/mmcv/structures/__init__.py new file mode 100644 index 0000000..a0e31b2 --- /dev/null +++ b/mmcv/structures/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from .boxes import BoxMode, Boxes +from .instances import Instances +# from .keypoints import Keypoints, heatmaps_to_keypoints +from .masks import ROIMasks +# from .masks import BitMasks, PolygonMasks, polygons_to_bitmask, ROIMasks +# from .rotated_boxes import RotatedBoxes +# from .rotated_boxes import pairwise_iou as pairwise_iou_rotated diff --git a/mmcv/structures/boxes.py b/mmcv/structures/boxes.py new file mode 100644 index 0000000..fd396f6 --- /dev/null +++ b/mmcv/structures/boxes.py @@ -0,0 +1,425 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import math +import numpy as np +from enum import IntEnum, unique +from typing import List, Tuple, Union +import torch +from torch import device + +_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray] + + +@unique +class BoxMode(IntEnum): + """ + Enum of different ways to represent a box. + """ + + XYXY_ABS = 0 + """ + (x0, y0, x1, y1) in absolute floating points coordinates. + The coordinates in range [0, width or height]. + """ + XYWH_ABS = 1 + """ + (x0, y0, w, h) in absolute floating points coordinates. + """ + XYXY_REL = 2 + """ + Not yet supported! + (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image. + """ + XYWH_REL = 3 + """ + Not yet supported! + (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image. + """ + XYWHA_ABS = 4 + """ + (xc, yc, w, h, a) in absolute floating points coordinates. + (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw. 
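+    For example, ``(10, 10, 4, 2, 30)`` (illustrative) describes a 4x2 box
+    centered at ``(10, 10)`` and rotated 30 degrees counter-clockwise.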
+ """ + + @staticmethod + def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType: + """ + Args: + box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5 + from_mode, to_mode (BoxMode) + + Returns: + The converted box of the same type. + """ + if from_mode == to_mode: + return box + + original_type = type(box) + is_numpy = isinstance(box, np.ndarray) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) == 4 or len(box) == 5, ( + "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor," + " where k == 4 or 5" + ) + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + else: + arr = box.clone() + + assert to_mode not in [BoxMode.XYXY_REL, BoxMode.XYWH_REL] and from_mode not in [ + BoxMode.XYXY_REL, + BoxMode.XYWH_REL, + ], "Relative mode not yet supported!" + + if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS: + assert ( + arr.shape[-1] == 5 + ), "The last dimension of input shape must be 5 for XYWHA format" + original_dtype = arr.dtype + arr = arr.double() + + w = arr[:, 2] + h = arr[:, 3] + a = arr[:, 4] + c = torch.abs(torch.cos(a * math.pi / 180.0)) + s = torch.abs(torch.sin(a * math.pi / 180.0)) + # This basically computes the horizontal bounding rectangle of the rotated box + new_w = c * w + s * h + new_h = c * h + s * w + + # convert center to top-left corner + arr[:, 0] -= new_w / 2.0 + arr[:, 1] -= new_h / 2.0 + # bottom-right corner + arr[:, 2] = arr[:, 0] + new_w + arr[:, 3] = arr[:, 1] + new_h + + arr = arr[:, :4].to(dtype=original_dtype) + elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS: + original_dtype = arr.dtype + arr = arr.double() + arr[:, 0] += arr[:, 2] / 2.0 + arr[:, 1] += arr[:, 3] / 2.0 + angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype) + arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype) + else: + if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS: + arr[:, 2] += arr[:, 0] + arr[:, 3] += arr[:, 1] + elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS: + arr[:, 2] -= arr[:, 0] + arr[:, 3] -= arr[:, 1] + else: + raise NotImplementedError( + "Conversion from BoxMode {} to {} is not supported yet".format( + from_mode, to_mode + ) + ) + + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + else: + return arr + + +class Boxes: + """ + This structure stores a list of boxes as a Nx4 torch.Tensor. + It supports some common methods about boxes + (`area`, `clip`, `nonempty`, etc), + and also behaves like a Tensor + (support indexing, `to(device)`, `.device`, and iteration over all boxes) + + Attributes: + tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2). + """ + + def __init__(self, tensor: torch.Tensor): + """ + Args: + tensor (Tensor[float]): a Nx4 matrix. Each row is (x1, y1, x2, y2). + """ + if not isinstance(tensor, torch.Tensor): + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=torch.device("cpu")) + else: + tensor = tensor.to(torch.float32) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does not depend on + # the inputs (and consequently confuses jit) + tensor = tensor.reshape((-1, 4)).to(dtype=torch.float32) + assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size() + + self.tensor = tensor + + def clone(self) -> "Boxes": + """ + Clone the Boxes. 
+ + Returns: + Boxes + """ + return Boxes(self.tensor.clone()) + + def to(self, device: torch.device): + # Boxes are assumed float32 and does not support to(dtype) + return Boxes(self.tensor.to(device=device)) + + def area(self) -> torch.Tensor: + """ + Computes the area of all the boxes. + + Returns: + torch.Tensor: a vector with areas of each box. + """ + box = self.tensor + area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1]) + return area + + def clip(self, box_size: Tuple[int, int]) -> None: + """ + Clip (in place) the boxes by limiting x coordinates to the range [0, width] + and y coordinates to the range [0, height]. + + Args: + box_size (height, width): The clipping box's size. + """ + assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!" + h, w = box_size + x1 = self.tensor[:, 0].clamp(min=0, max=w) + y1 = self.tensor[:, 1].clamp(min=0, max=h) + x2 = self.tensor[:, 2].clamp(min=0, max=w) + y2 = self.tensor[:, 3].clamp(min=0, max=h) + self.tensor = torch.stack((x1, y1, x2, y2), dim=-1) + + def nonempty(self, threshold: float = 0.0) -> torch.Tensor: + """ + Find boxes that are non-empty. + A box is considered empty, if either of its side is no larger than threshold. + + Returns: + Tensor: + a binary vector which represents whether each box is empty + (False) or non-empty (True). + """ + box = self.tensor + widths = box[:, 2] - box[:, 0] + heights = box[:, 3] - box[:, 1] + keep = (widths > threshold) & (heights > threshold) + return keep + + def __getitem__(self, item) -> "Boxes": + """ + Args: + item: int, slice, or a BoolTensor + + Returns: + Boxes: Create a new :class:`Boxes` by indexing. + + The following usage are allowed: + + 1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box. + 2. `new_boxes = boxes[2:10]`: return a slice of boxes. + 3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor + with `length = len(boxes)`. Nonzero elements in the vector will be selected. + + Note that the returned Boxes might share storage with this Boxes, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return Boxes(self.tensor[item].view(1, -1)) + b = self.tensor[item] + assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item) + return Boxes(b) + + def __len__(self) -> int: + return self.tensor.shape[0] + + def __repr__(self) -> str: + return "Boxes(" + str(self.tensor) + ")" + + def inside_box(self, box_size: Tuple[int, int], boundary_threshold: int = 0) -> torch.Tensor: + """ + Args: + box_size (height, width): Size of the reference box. + boundary_threshold (int): Boxes that extend beyond the reference box + boundary by more than boundary_threshold are considered "outside". + + Returns: + a binary vector, indicating whether each box is inside the reference box. + """ + height, width = box_size + inds_inside = ( + (self.tensor[..., 0] >= -boundary_threshold) + & (self.tensor[..., 1] >= -boundary_threshold) + & (self.tensor[..., 2] < width + boundary_threshold) + & (self.tensor[..., 3] < height + boundary_threshold) + ) + return inds_inside + + def get_centers(self) -> torch.Tensor: + """ + Returns: + The box centers in a Nx2 array of (x, y). 
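The basic `Boxes` operations defined here (`area`, `clip`, `nonempty`, `get_centers`) can be exercised with a few hand-checkable boxes; same import assumption as above, numbers made up:

```python
import torch
from mmcv.structures.boxes import Boxes  # path as laid out in this diff

boxes = Boxes(torch.tensor([
    [10.0, 10.0, 50.0, 30.0],   # 40 x 20 box
    [-5.0, -5.0, 20.0, 120.0],  # sticks out of a 100 x 100 image
    [60.0, 60.0, 60.0, 70.0],   # zero-width box
]))

print(boxes.area())            # tensor([ 800., 3125.,    0.])
boxes.clip((100, 100))         # (height, width): clamp coordinates into the image
print(boxes.tensor[1])         # tensor([  0.,   0.,  20., 100.])
print(boxes.nonempty())        # tensor([ True,  True, False])
print(boxes.get_centers()[0])  # tensor([30., 20.])
```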
+ """ + return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2 + + def scale(self, scale_x: float, scale_y: float) -> None: + """ + Scale the box with horizontal and vertical scaling factors + """ + self.tensor[:, 0::2] *= scale_x + self.tensor[:, 1::2] *= scale_y + + @classmethod + def cat(cls, boxes_list: List["Boxes"]) -> "Boxes": + """ + Concatenates a list of Boxes into a single Boxes + + Arguments: + boxes_list (list[Boxes]) + + Returns: + Boxes: the concatenated Boxes + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all([isinstance(box, Boxes) for box in boxes_list]) + + # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input + cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0)) + return cat_boxes + + @property + def device(self) -> device: + return self.tensor.device + + # type "Iterator[torch.Tensor]", yield, and iter() not supported by torchscript + # https://github.com/pytorch/pytorch/issues/18627 + @torch.jit.unused + def __iter__(self): + """ + Yield a box as a Tensor of shape (4,) at a time. + """ + yield from self.tensor + + +def pairwise_intersection(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: + """ + Given two lists of boxes of size N and M, + compute the intersection area between __all__ N x M pairs of boxes. + The box order must be (xmin, ymin, xmax, ymax) + + Args: + boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. + + Returns: + Tensor: intersection, sized [N,M]. + """ + boxes1, boxes2 = boxes1.tensor, boxes2.tensor + width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max( + boxes1[:, None, :2], boxes2[:, :2] + ) # [N,M,2] + + width_height.clamp_(min=0) # [N,M,2] + intersection = width_height.prod(dim=2) # [N,M] + return intersection + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: + """ + Given two lists of boxes of size N and M, compute the IoU + (intersection over union) between **all** N x M pairs of boxes. + The box order must be (xmin, ymin, xmax, ymax). + + Args: + boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. + + Returns: + Tensor: IoU, sized [N,M]. + """ + area1 = boxes1.area() # [N] + area2 = boxes2.area() # [M] + inter = pairwise_intersection(boxes1, boxes2) + + # handle empty boxes + iou = torch.where( + inter > 0, + inter / (area1[:, None] + area2 - inter), + torch.zeros(1, dtype=inter.dtype, device=inter.device), + ) + return iou + + +def pairwise_ioa(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: + """ + Similar to :func:`pariwise_iou` but compute the IoA (intersection over boxes2 area). + + Args: + boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. + + Returns: + Tensor: IoA, sized [N,M]. + """ + area2 = boxes2.area() # [M] + inter = pairwise_intersection(boxes1, boxes2) + + # handle empty boxes + ioa = torch.where( + inter > 0, inter / area2, torch.zeros(1, dtype=inter.dtype, device=inter.device) + ) + return ioa + + +def pairwise_point_box_distance(points: torch.Tensor, boxes: Boxes): + """ + Pairwise distance between N points and M boxes. The distance between a + point and a box is represented by the distance from the point to 4 edges + of the box. Distances are all positive when the point is inside the box. + + Args: + points: Nx2 coordinates. 
Each row is (x, y) + boxes: M boxes + + Returns: + Tensor: distances of size (N, M, 4). The 4 values are distances from + the point to the left, top, right, bottom of the box. + """ + x, y = points.unsqueeze(dim=2).unbind(dim=1) # (N, 1) + x0, y0, x1, y1 = boxes.tensor.unsqueeze(dim=0).unbind(dim=2) # (1, M) + return torch.stack([x - x0, y - y0, x1 - x, y1 - y], dim=2) + + +def matched_pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: + """ + Compute pairwise intersection over union (IOU) of two sets of matched + boxes that have the same number of boxes. + Similar to :func:`pairwise_iou`, but computes only diagonal elements of the matrix. + + Args: + boxes1 (Boxes): bounding boxes, sized [N,4]. + boxes2 (Boxes): same length as boxes1 + Returns: + Tensor: iou, sized [N]. + """ + assert len(boxes1) == len( + boxes2 + ), "boxlists should have the same" "number of entries, got {}, {}".format( + len(boxes1), len(boxes2) + ) + area1 = boxes1.area() # [N] + area2 = boxes2.area() # [N] + box1, box2 = boxes1.tensor, boxes2.tensor + lt = torch.max(box1[:, :2], box2[:, :2]) # [N,2] + rb = torch.min(box1[:, 2:], box2[:, 2:]) # [N,2] + wh = (rb - lt).clamp(min=0) # [N,2] + inter = wh[:, 0] * wh[:, 1] # [N] + iou = inter / (area1 + area2 - inter) # [N] + return iou diff --git a/mmcv/structures/image_list.py b/mmcv/structures/image_list.py new file mode 100644 index 0000000..e4243bb --- /dev/null +++ b/mmcv/structures/image_list.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from __future__ import division +from typing import Any, Dict, List, Optional, Tuple +import torch +from torch import device +from torch.nn import functional as F + +from detectron2.layers.wrappers import move_device_like, shapes_to_tensor + + +class ImageList: + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size. + The original sizes of each image is stored in `image_sizes`. + + Attributes: + image_sizes (list[tuple[int, int]]): each tuple is (h, w). + During tracing, it becomes list[Tensor] instead. + """ + + def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]): + """ + Arguments: + tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 + image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can + be smaller than (H, W) due to padding. + """ + self.tensor = tensor + self.image_sizes = image_sizes + + def __len__(self) -> int: + return len(self.image_sizes) + + def __getitem__(self, idx) -> torch.Tensor: + """ + Access the individual image in its original size. + + Args: + idx: int or slice + + Returns: + Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 + """ + size = self.image_sizes[idx] + return self.tensor[idx, ..., : size[0], : size[1]] + + @torch.jit.unused + def to(self, *args: Any, **kwargs: Any) -> "ImageList": + cast_tensor = self.tensor.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + @property + def device(self) -> device: + return self.tensor.device + + @staticmethod + def from_tensors( + tensors: List[torch.Tensor], + size_divisibility: int = 0, + pad_value: float = 0.0, + padding_constraints: Optional[Dict[str, int]] = None, + ) -> "ImageList": + """ + Args: + tensors: a tuple or list of `torch.Tensor`, each of shape (Hi, Wi) or + (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded + to the same shape with `pad_value`. 
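A usage sketch of `ImageList.from_tensors`; note that `image_list.py` as added here still pulls `move_device_like`/`shapes_to_tensor` from `detectron2.layers.wrappers`, so the snippet assumes detectron2 (or an equivalent shim) is installed. The shapes are made up:

```python
import torch
from mmcv.structures.image_list import ImageList  # path as in this diff; needs detectron2 helpers

imgs = [torch.rand(3, 480, 640), torch.rand(3, 500, 600)]

# Pad both images to a common (H, W) that is divisible by 32.
image_list = ImageList.from_tensors(imgs, size_divisibility=32)

print(image_list.tensor.shape)  # torch.Size([2, 3, 512, 640])  -> 500 rounds up to 512
print(image_list.image_sizes)   # [(480, 640), (500, 600)]
print(image_list[1].shape)      # torch.Size([3, 500, 600])  -> original, unpadded view
```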
+ size_divisibility (int): If `size_divisibility > 0`, add padding to ensure + the common height and width is divisible by `size_divisibility`. + This depends on the model and many models need a divisibility of 32. + pad_value (float): value to pad. + padding_constraints (optional[Dict]): If given, it would follow the format as + {"size_divisibility": int, "square_size": int}, where `size_divisibility` will + overwrite the above one if presented and `square_size` indicates the + square padding size if `square_size` > 0. + Returns: + an `ImageList`. + """ + assert len(tensors) > 0 + assert isinstance(tensors, (tuple, list)) + for t in tensors: + assert isinstance(t, torch.Tensor), type(t) + assert t.shape[:-2] == tensors[0].shape[:-2], t.shape + + image_sizes = [(im.shape[-2], im.shape[-1]) for im in tensors] + image_sizes_tensor = [shapes_to_tensor(x) for x in image_sizes] + max_size = torch.stack(image_sizes_tensor).max(0).values + + if padding_constraints is not None: + square_size = padding_constraints.get("square_size", 0) + if square_size > 0: + # pad to square. + max_size[0] = max_size[1] = square_size + if "size_divisibility" in padding_constraints: + size_divisibility = padding_constraints["size_divisibility"] + if size_divisibility > 1: + stride = size_divisibility + # the last two dims are H,W, both subject to divisibility requirement + max_size = (max_size + (stride - 1)).div(stride, rounding_mode="floor") * stride + + # handle weirdness of scripting and tracing ... + if torch.jit.is_scripting(): + max_size: List[int] = max_size.to(dtype=torch.long).tolist() + else: + if torch.jit.is_tracing(): + image_sizes = image_sizes_tensor + + if len(tensors) == 1: + # This seems slightly (2%) faster. + # TODO: check whether it's faster for multiple images as well + image_size = image_sizes[0] + padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]] + batched_imgs = F.pad(tensors[0], padding_size, value=pad_value).unsqueeze_(0) + else: + # max_size can be a tensor in tracing mode, therefore convert to list + batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) + device = ( + None if torch.jit.is_scripting() else ("cpu" if torch.jit.is_tracing() else None) + ) + batched_imgs = tensors[0].new_full(batch_shape, pad_value, device=device) + batched_imgs = move_device_like(batched_imgs, tensors[0]) + for i, img in enumerate(tensors): + # Use `batched_imgs` directly instead of `img, pad_img = zip(tensors, batched_imgs)` + # Tracing mode cannot capture `copy_()` of temporary locals + batched_imgs[i, ..., : img.shape[-2], : img.shape[-1]].copy_(img) + + return ImageList(batched_imgs.contiguous(), image_sizes) diff --git a/mmcv/structures/instances.py b/mmcv/structures/instances.py new file mode 100644 index 0000000..c9579bc --- /dev/null +++ b/mmcv/structures/instances.py @@ -0,0 +1,194 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import itertools +import warnings +from typing import Any, Dict, List, Tuple, Union +import torch + + +class Instances: + """ + This class represents a list of instances in an image. + It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields". + All fields must have the same ``__len__`` which is the number of instances. + + All other (non-field) attributes of this class are considered private: + they must start with '_' and are not modifiable by a user. + + Some basic usage: + + 1. Set/get/check a field: + + .. code-block:: python + + instances.gt_boxes = Boxes(...) 
+ print(instances.pred_masks) # a tensor of shape (N, H, W) + print('gt_masks' in instances) + + 2. ``len(instances)`` returns the number of instances + 3. Indexing: ``instances[indices]`` will apply the indexing on all the fields + and returns a new :class:`Instances`. + Typically, ``indices`` is a integer vector of indices, + or a binary mask of length ``num_instances`` + + .. code-block:: python + + category_3_detections = instances[instances.pred_classes == 3] + confident_detections = instances[instances.scores > 0.9] + """ + + def __init__(self, image_size: Tuple[int, int], **kwargs: Any): + """ + Args: + image_size (height, width): the spatial size of the image. + kwargs: fields to add to this `Instances`. + """ + self._image_size = image_size + self._fields: Dict[str, Any] = {} + for k, v in kwargs.items(): + self.set(k, v) + + @property + def image_size(self) -> Tuple[int, int]: + """ + Returns: + tuple: height, width + """ + return self._image_size + + def __setattr__(self, name: str, val: Any) -> None: + if name.startswith("_"): + super().__setattr__(name, val) + else: + self.set(name, val) + + def __getattr__(self, name: str) -> Any: + if name == "_fields" or name not in self._fields: + raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) + return self._fields[name] + + def set(self, name: str, value: Any) -> None: + """ + Set the field named `name` to `value`. + The length of `value` must be the number of instances, + and must agree with other existing fields in this object. + """ + with warnings.catch_warnings(record=True): + data_len = len(value) + if len(self._fields): + assert ( + len(self) == data_len + ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) + self._fields[name] = value + + def has(self, name: str) -> bool: + """ + Returns: + bool: whether the field called `name` exists. + """ + return name in self._fields + + def remove(self, name: str) -> None: + """ + Remove the field called `name`. + """ + del self._fields[name] + + def get(self, name: str) -> Any: + """ + Returns the field called `name`. + """ + return self._fields[name] + + def get_fields(self) -> Dict[str, Any]: + """ + Returns: + dict: a dict which maps names (str) to data of the fields + + Modifying the returned dict will modify this instance. + """ + return self._fields + + # Tensor-like methods + def to(self, *args: Any, **kwargs: Any) -> "Instances": + """ + Returns: + Instances: all fields are called with a `to(device)`, if the field has this method. + """ + ret = Instances(self._image_size) + for k, v in self._fields.items(): + if hasattr(v, "to"): + v = v.to(*args, **kwargs) + ret.set(k, v) + return ret + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances": + """ + Args: + item: an index-like object and will be used to index all the fields. + + Returns: + If `item` is a string, return the data in the corresponding field. + Otherwise, returns an `Instances` where all fields are indexed by `item`. 
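A minimal sketch of how `Instances` is typically used, assuming the import paths from this diff; the fields and numbers are made up:

```python
import torch
from mmcv.structures.instances import Instances  # path as in this diff
from mmcv.structures.boxes import Boxes

# All fields must share the same length (the number of instances).
inst = Instances((480, 640))
inst.pred_boxes = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0],
                                      [5.0, 5.0, 20.0, 20.0],
                                      [1.0, 1.0, 2.0, 2.0]]))
inst.scores = torch.tensor([0.9, 0.3, 0.8])
inst.labels = torch.tensor([1, 2, 1])

print(len(inst))            # 3
print(inst.has('scores'))   # True

# Boolean indexing is applied to every field at once.
confident = inst[inst.scores > 0.5]
print(len(confident), confident.labels)  # 2 tensor([1, 1])
```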
+ """ + if type(item) == int: + if item >= len(self) or item < -len(self): + raise IndexError("Instances index out of range!") + else: + item = slice(item, None, len(self)) + + ret = Instances(self._image_size) + for k, v in self._fields.items(): + ret.set(k, v[item]) + return ret + + def __len__(self) -> int: + for v in self._fields.values(): + # use __len__ because len() has to be int and is not friendly to tracing + return v.__len__() + raise NotImplementedError("Empty Instances does not support __len__!") + + def __iter__(self): + raise NotImplementedError("`Instances` object is not iterable!") + + @staticmethod + def cat(instance_lists: List["Instances"]) -> "Instances": + """ + Args: + instance_lists (list[Instances]) + + Returns: + Instances + """ + assert all(isinstance(i, Instances) for i in instance_lists) + assert len(instance_lists) > 0 + if len(instance_lists) == 1: + return instance_lists[0] + + image_size = instance_lists[0].image_size + if not isinstance(image_size, torch.Tensor): # could be a tensor in tracing + for i in instance_lists[1:]: + assert i.image_size == image_size + ret = Instances(image_size) + for k in instance_lists[0]._fields.keys(): + values = [i.get(k) for i in instance_lists] + v0 = values[0] + if isinstance(v0, torch.Tensor): + values = torch.cat(values, dim=0) + elif isinstance(v0, list): + values = list(itertools.chain(*values)) + elif hasattr(type(v0), "cat"): + values = type(v0).cat(values) + else: + raise ValueError("Unsupported type {} for concatenation".format(type(v0))) + ret.set(k, values) + return ret + + def __str__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self)) + s += "image_height={}, ".format(self._image_size[0]) + s += "image_width={}, ".format(self._image_size[1]) + s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items()))) + return s + + __repr__ = __str__ diff --git a/mmcv/structures/keypoints.py b/mmcv/structures/keypoints.py new file mode 100644 index 0000000..b93ebed --- /dev/null +++ b/mmcv/structures/keypoints.py @@ -0,0 +1,235 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import numpy as np +from typing import Any, List, Tuple, Union +import torch +from torch.nn import functional as F + + +class Keypoints: + """ + Stores keypoint **annotation** data. GT Instances have a `gt_keypoints` property + containing the x,y location and visibility flag of each keypoint. This tensor has shape + (N, K, 3) where N is the number of instances and K is the number of keypoints per instance. + + The visibility flag follows the COCO format and must be one of three integers: + + * v=0: not labeled (in which case x=y=0) + * v=1: labeled but not visible + * v=2: labeled and visible + """ + + def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]): + """ + Arguments: + keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint. + The shape should be (N, K, 3) where N is the number of + instances, and K is the number of keypoints per instance. 
+ """ + device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu") + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) + assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape + self.tensor = keypoints + + def __len__(self) -> int: + return self.tensor.size(0) + + def to(self, *args: Any, **kwargs: Any) -> "Keypoints": + return type(self)(self.tensor.to(*args, **kwargs)) + + @property + def device(self) -> torch.device: + return self.tensor.device + + def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor: + """ + Convert keypoint annotations to a heatmap of one-hot labels for training, + as described in :paper:`Mask R-CNN`. + + Arguments: + boxes: Nx4 tensor, the boxes to draw the keypoints to + + Returns: + heatmaps: + A tensor of shape (N, K), each element is integer spatial label + in the range [0, heatmap_size**2 - 1] for each keypoint in the input. + valid: + A tensor of shape (N, K) containing whether each keypoint is in the roi or not. + """ + return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size) + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints": + """ + Create a new `Keypoints` by indexing on this `Keypoints`. + + The following usage are allowed: + + 1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance. + 2. `new_kpts = kpts[2:10]`: return a slice of key points. + 3. `new_kpts = kpts[vector]`, where vector is a torch.ByteTensor + with `length = len(kpts)`. Nonzero elements in the vector will be selected. + + Note that the returned Keypoints might share storage with this Keypoints, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return Keypoints([self.tensor[item]]) + return Keypoints(self.tensor[item]) + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.tensor)) + return s + + @staticmethod + def cat(keypoints_list: List["Keypoints"]) -> "Keypoints": + """ + Concatenates a list of Keypoints into a single Keypoints + + Arguments: + keypoints_list (list[Keypoints]) + + Returns: + Keypoints: the concatenated Keypoints + """ + assert isinstance(keypoints_list, (list, tuple)) + assert len(keypoints_list) > 0 + assert all(isinstance(keypoints, Keypoints) for keypoints in keypoints_list) + + cat_kpts = type(keypoints_list[0])( + torch.cat([kpts.tensor for kpts in keypoints_list], dim=0) + ) + return cat_kpts + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def _keypoints_to_heatmap( + keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space. + + Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the + closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the + continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"): + d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate. + + Arguments: + keypoints: tensor of keypoint locations in of shape (N, K, 3). + rois: Nx4 tensor of rois in xyxy format + heatmap_size: integer side length of square heatmap. + + Returns: + heatmaps: A tensor of shape (N, K) containing an integer spatial label + in the range [0, heatmap_size**2 - 1] for each keypoint in the input. 
+ valid: A tensor of shape (N, K) containing whether each keypoint is in + the roi or not. + """ + + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = keypoints[..., 0] + y = keypoints[..., 1] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) + vis = keypoints[..., 2] > 0 + valid = (valid_loc & vis).long() + + lin_ind = y * heatmap_size + x + heatmaps = lin_ind * valid + + return heatmaps, valid + + +@torch.jit.script_if_tracing +def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor: + """ + Extract predicted keypoint locations from heatmaps. + + Args: + maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for + each ROI and each keypoint. + rois (Tensor): (#ROIs, 4). The box of each ROI. + + Returns: + Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to + (x, y, logit, score) for each keypoint. + + When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate, + we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from + Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate. 
+ """ + + offset_x = rois[:, 0] + offset_y = rois[:, 1] + + widths = (rois[:, 2] - rois[:, 0]).clamp(min=1) + heights = (rois[:, 3] - rois[:, 1]).clamp(min=1) + widths_ceil = widths.ceil() + heights_ceil = heights.ceil() + + num_rois, num_keypoints = maps.shape[:2] + xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4) + + width_corrections = widths / widths_ceil + height_corrections = heights / heights_ceil + + keypoints_idx = torch.arange(num_keypoints, device=maps.device) + + for i in range(num_rois): + outsize = (int(heights_ceil[i]), int(widths_ceil[i])) + roi_map = F.interpolate(maps[[i]], size=outsize, mode="bicubic", align_corners=False) + + # Although semantically equivalent, `reshape` is used instead of `squeeze` due + # to limitation during ONNX export of `squeeze` in scripting mode + roi_map = roi_map.reshape(roi_map.shape[1:]) # keypoints x H x W + + # softmax over the spatial region + max_score, _ = roi_map.view(num_keypoints, -1).max(1) + max_score = max_score.view(num_keypoints, 1, 1) + tmp_full_resolution = (roi_map - max_score).exp_() + tmp_pool_resolution = (maps[i] - max_score).exp_() + # Produce scores over the region H x W, but normalize with POOL_H x POOL_W, + # so that the scores of objects of different absolute sizes will be more comparable + roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True) + + w = roi_map.shape[2] + pos = roi_map.view(num_keypoints, -1).argmax(1) + + x_int = pos % w + y_int = (pos - x_int) // w + + assert ( + roi_map_scores[keypoints_idx, y_int, x_int] + == roi_map_scores.view(num_keypoints, -1).max(1)[0] + ).all() + + x = (x_int.float() + 0.5) * width_corrections[i] + y = (y_int.float() + 0.5) * height_corrections[i] + + xy_preds[i, :, 0] = x + offset_x[i] + xy_preds[i, :, 1] = y + offset_y[i] + xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int] + xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int] + + return xy_preds diff --git a/mmcv/structures/masks.py b/mmcv/structures/masks.py new file mode 100644 index 0000000..0db389f --- /dev/null +++ b/mmcv/structures/masks.py @@ -0,0 +1,534 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import copy +import itertools +import numpy as np +from typing import Any, Iterator, List, Union +import pycocotools.mask as mask_util +import torch +from torch import device + +from mmcv.layers.roi_align import ROIAlign +from mmcv.utils import retry_if_cuda_oom + +from .boxes import Boxes + + +def polygon_area(x, y): + # Using the shoelace formula + # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + +def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray: + """ + Args: + polygons (list[ndarray]): each array has shape (Nx2,) + height, width (int) + + Returns: + ndarray: a bool mask of shape (height, width) + """ + if len(polygons) == 0: + # COCOAPI does not support empty polygons + return np.zeros((height, width)).astype(bool) + rles = mask_util.frPyObjects(polygons, height, width) + rle = mask_util.merge(rles) + return mask_util.decode(rle).astype(bool) + + +def rasterize_polygons_within_box( + polygons: List[np.ndarray], box: np.ndarray, mask_size: int +) -> torch.Tensor: + """ + Rasterize the polygons into a mask image and + crop the mask content in the given box. + The cropped mask is resized to (mask_size, mask_size). 
+ + This function is used when generating training targets for mask head in Mask R-CNN. + Given original ground-truth masks for an image, new ground-truth mask + training targets in the size of `mask_size x mask_size` + must be provided for each predicted box. This function will be called to + produce such targets. + + Args: + polygons (list[ndarray[float]]): a list of polygons, which represents an instance. + box: 4-element numpy array + mask_size (int): + + Returns: + Tensor: BoolTensor of shape (mask_size, mask_size) + """ + # 1. Shift the polygons w.r.t the boxes + w, h = box[2] - box[0], box[3] - box[1] + + polygons = copy.deepcopy(polygons) + for p in polygons: + p[0::2] = p[0::2] - box[0] + p[1::2] = p[1::2] - box[1] + + # 2. Rescale the polygons to the new box size + # max() to avoid division by small number + ratio_h = mask_size / max(h, 0.1) + ratio_w = mask_size / max(w, 0.1) + + if ratio_h == ratio_w: + for p in polygons: + p *= ratio_h + else: + for p in polygons: + p[0::2] *= ratio_w + p[1::2] *= ratio_h + + # 3. Rasterize the polygons with coco api + mask = polygons_to_bitmask(polygons, mask_size, mask_size) + mask = torch.from_numpy(mask) + return mask + + +class BitMasks: + """ + This class stores the segmentation masks for all objects in one image, in + the form of bitmaps. + + Attributes: + tensor: bool Tensor of N,H,W, representing N instances in the image. + """ + + def __init__(self, tensor: Union[torch.Tensor, np.ndarray]): + """ + Args: + tensor: bool Tensor of N,H,W, representing N instances in the image. + """ + if isinstance(tensor, torch.Tensor): + tensor = tensor.to(torch.bool) + else: + tensor = torch.as_tensor(tensor, dtype=torch.bool, device=torch.device("cpu")) + assert tensor.dim() == 3, tensor.size() + self.image_size = tensor.shape[1:] + self.tensor = tensor + + @torch.jit.unused + def to(self, *args: Any, **kwargs: Any) -> "BitMasks": + return BitMasks(self.tensor.to(*args, **kwargs)) + + @property + def device(self) -> torch.device: + return self.tensor.device + + @torch.jit.unused + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks": + """ + Returns: + BitMasks: Create a new :class:`BitMasks` by indexing. + + The following usage are allowed: + + 1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask. + 2. `new_masks = masks[2:10]`: return a slice of masks. + 3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor + with `length = len(masks)`. Nonzero elements in the vector will be selected. + + Note that the returned object might share storage with this object, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return BitMasks(self.tensor[item].unsqueeze(0)) + m = self.tensor[item] + assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format( + item, m.shape + ) + return BitMasks(m) + + @torch.jit.unused + def __iter__(self) -> torch.Tensor: + yield from self.tensor + + @torch.jit.unused + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.tensor)) + return s + + def __len__(self) -> int: + return self.tensor.shape[0] + + def nonempty(self) -> torch.Tensor: + """ + Find masks that are non-empty. + + Returns: + Tensor: a BoolTensor which represents + whether each mask is empty (False) or non-empty (True). 
+ """ + return self.tensor.flatten(1).any(dim=1) + + @staticmethod + def from_polygon_masks( + polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int + ) -> "BitMasks": + """ + Args: + polygon_masks (list[list[ndarray]] or PolygonMasks) + height, width (int) + """ + if isinstance(polygon_masks, PolygonMasks): + polygon_masks = polygon_masks.polygons + masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks] + if len(masks): + return BitMasks(torch.stack([torch.from_numpy(x) for x in masks])) + else: + return BitMasks(torch.empty(0, height, width, dtype=torch.bool)) + + @staticmethod + def from_roi_masks(roi_masks: "ROIMasks", height: int, width: int) -> "BitMasks": + """ + Args: + roi_masks: + height, width (int): + """ + return roi_masks.to_bitmasks(height, width) + + def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor: + """ + Crop each bitmask by the given box, and resize results to (mask_size, mask_size). + This can be used to prepare training targets for Mask R-CNN. + It has less reconstruction error compared to rasterization with polygons. + However we observe no difference in accuracy, + but BitMasks requires more memory to store all the masks. + + Args: + boxes (Tensor): Nx4 tensor storing the boxes for each mask + mask_size (int): the size of the rasterized mask. + + Returns: + Tensor: + A bool tensor of shape (N, mask_size, mask_size), where + N is the number of predicted boxes for this image. + """ + assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self)) + device = self.tensor.device + + batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None] + rois = torch.cat([batch_inds, boxes], dim=1) # Nx5 + + bit_masks = self.tensor.to(dtype=torch.float32) + rois = rois.to(device=device) + output = ( + ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True) + .forward(bit_masks[:, None, :, :], rois) + .squeeze(1) + ) + output = output >= 0.5 + return output + + def get_bounding_boxes(self) -> Boxes: + """ + Returns: + Boxes: tight bounding boxes around bitmasks. + If a mask is empty, it's bounding box will be all zero. + """ + boxes = torch.zeros(self.tensor.shape[0], 4, dtype=torch.float32) + x_any = torch.any(self.tensor, dim=1) + y_any = torch.any(self.tensor, dim=2) + for idx in range(self.tensor.shape[0]): + x = torch.where(x_any[idx, :])[0] + y = torch.where(y_any[idx, :])[0] + if len(x) > 0 and len(y) > 0: + boxes[idx, :] = torch.as_tensor( + [x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=torch.float32 + ) + return Boxes(boxes) + + @staticmethod + def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks": + """ + Concatenates a list of BitMasks into a single BitMasks + + Arguments: + bitmasks_list (list[BitMasks]) + + Returns: + BitMasks: the concatenated BitMasks + """ + assert isinstance(bitmasks_list, (list, tuple)) + assert len(bitmasks_list) > 0 + assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list) + + cat_bitmasks = type(bitmasks_list[0])(torch.cat([bm.tensor for bm in bitmasks_list], dim=0)) + return cat_bitmasks + + +class PolygonMasks: + """ + This class stores the segmentation masks for all objects in one image, in the form of polygons. + + Attributes: + polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon. 
+ """ + + def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]): + """ + Arguments: + polygons (list[list[np.ndarray]]): The first + level of the list correspond to individual instances, + the second level to all the polygons that compose the + instance, and the third level to the polygon coordinates. + The third level array should have the format of + [x0, y0, x1, y1, ..., xn, yn] (n >= 3). + """ + if not isinstance(polygons, list): + raise ValueError( + "Cannot create PolygonMasks: Expect a list of list of polygons per image. " + "Got '{}' instead.".format(type(polygons)) + ) + + def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray: + # Use float64 for higher precision, because why not? + # Always put polygons on CPU (self.to is a no-op) since they + # are supposed to be small tensors. + # May need to change this assumption if GPU placement becomes useful + if isinstance(t, torch.Tensor): + t = t.cpu().numpy() + return np.asarray(t).astype("float64") + + def process_polygons( + polygons_per_instance: List[Union[torch.Tensor, np.ndarray]] + ) -> List[np.ndarray]: + if not isinstance(polygons_per_instance, list): + raise ValueError( + "Cannot create polygons: Expect a list of polygons per instance. " + "Got '{}' instead.".format(type(polygons_per_instance)) + ) + # transform each polygon to a numpy array + polygons_per_instance = [_make_array(p) for p in polygons_per_instance] + for polygon in polygons_per_instance: + if len(polygon) % 2 != 0 or len(polygon) < 6: + raise ValueError(f"Cannot create a polygon from {len(polygon)} coordinates.") + return polygons_per_instance + + self.polygons: List[List[np.ndarray]] = [ + process_polygons(polygons_per_instance) for polygons_per_instance in polygons + ] + + def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks": + return self + + @property + def device(self) -> torch.device: + return torch.device("cpu") + + def get_bounding_boxes(self) -> Boxes: + """ + Returns: + Boxes: tight bounding boxes around polygon masks. + """ + boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32) + for idx, polygons_per_instance in enumerate(self.polygons): + minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32) + maxxy = torch.zeros(2, dtype=torch.float32) + for polygon in polygons_per_instance: + coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32) + minxy = torch.min(minxy, torch.min(coords, dim=0).values) + maxxy = torch.max(maxxy, torch.max(coords, dim=0).values) + boxes[idx, :2] = minxy + boxes[idx, 2:] = maxxy + return Boxes(boxes) + + def nonempty(self) -> torch.Tensor: + """ + Find masks that are non-empty. + + Returns: + Tensor: + a BoolTensor which represents whether each mask is empty (False) or not (True). + """ + keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons] + return torch.from_numpy(np.asarray(keep, dtype=bool)) + + def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks": + """ + Support indexing over the instances and return a `PolygonMasks` object. + `item` can be: + + 1. An integer. It will return an object with only one instance. + 2. A slice. It will return an object with the selected instances. + 3. A list[int]. It will return an object with the selected instances, + correpsonding to the indices in the list. + 4. A vector mask of type BoolTensor, whose length is num_instances. + It will return an object with the instances whose mask is nonzero. 
+ """ + if isinstance(item, int): + selected_polygons = [self.polygons[item]] + elif isinstance(item, slice): + selected_polygons = self.polygons[item] + elif isinstance(item, list): + selected_polygons = [self.polygons[i] for i in item] + elif isinstance(item, torch.Tensor): + # Polygons is a list, so we have to move the indices back to CPU. + if item.dtype == torch.bool: + assert item.dim() == 1, item.shape + item = item.nonzero().squeeze(1).cpu().numpy().tolist() + elif item.dtype in [torch.int32, torch.int64]: + item = item.cpu().numpy().tolist() + else: + raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype)) + selected_polygons = [self.polygons[i] for i in item] + return PolygonMasks(selected_polygons) + + def __iter__(self) -> Iterator[List[np.ndarray]]: + """ + Yields: + list[ndarray]: the polygons for one instance. + Each Tensor is a float64 vector representing a polygon. + """ + return iter(self.polygons) + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.polygons)) + return s + + def __len__(self) -> int: + return len(self.polygons) + + def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor: + """ + Crop each mask by the given box, and resize results to (mask_size, mask_size). + This can be used to prepare training targets for Mask R-CNN. + + Args: + boxes (Tensor): Nx4 tensor storing the boxes for each mask + mask_size (int): the size of the rasterized mask. + + Returns: + Tensor: A bool tensor of shape (N, mask_size, mask_size), where + N is the number of predicted boxes for this image. + """ + assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self)) + + device = boxes.device + # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise + # (several small tensors for representing a single instance mask) + boxes = boxes.to(torch.device("cpu")) + + results = [ + rasterize_polygons_within_box(poly, box.numpy(), mask_size) + for poly, box in zip(self.polygons, boxes) + ] + """ + poly: list[list[float]], the polygons for one instance + box: a tensor of shape (4,) + """ + if len(results) == 0: + return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device) + return torch.stack(results, dim=0).to(device=device) + + def area(self): + """ + Computes area of the mask. + Only works with Polygons, using the shoelace formula: + https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + + Returns: + Tensor: a vector, area for each instance + """ + + area = [] + for polygons_per_instance in self.polygons: + area_per_instance = 0 + for p in polygons_per_instance: + area_per_instance += polygon_area(p[0::2], p[1::2]) + area.append(area_per_instance) + + return torch.tensor(area) + + @staticmethod + def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks": + """ + Concatenates a list of PolygonMasks into a single PolygonMasks + + Arguments: + polymasks_list (list[PolygonMasks]) + + Returns: + PolygonMasks: the concatenated PolygonMasks + """ + assert isinstance(polymasks_list, (list, tuple)) + assert len(polymasks_list) > 0 + assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list) + + cat_polymasks = type(polymasks_list[0])( + list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list)) + ) + return cat_polymasks + + +class ROIMasks: + """ + Represent masks by N smaller masks defined in some ROIs. 
Once ROI boxes are given, + full-image bitmask can be obtained by "pasting" the mask on the region defined + by the corresponding ROI box. + """ + + def __init__(self, tensor: torch.Tensor): + """ + Args: + tensor: (N, M, M) mask tensor that defines the mask within each ROI. + """ + if tensor.dim() != 3: + raise ValueError("ROIMasks must take a masks of 3 dimension.") + self.tensor = tensor + + def to(self, device: torch.device) -> "ROIMasks": + return ROIMasks(self.tensor.to(device)) + + @property + def device(self) -> device: + return self.tensor.device + + def __len__(self): + return self.tensor.shape[0] + + def __getitem__(self, item) -> "ROIMasks": + """ + Returns: + ROIMasks: Create a new :class:`ROIMasks` by indexing. + + The following usage are allowed: + + 1. `new_masks = masks[2:10]`: return a slice of masks. + 2. `new_masks = masks[vector]`, where vector is a torch.BoolTensor + with `length = len(masks)`. Nonzero elements in the vector will be selected. + + Note that the returned object might share storage with this object, + subject to Pytorch's indexing semantics. + """ + t = self.tensor[item] + if t.dim() != 3: + raise ValueError( + f"Indexing on ROIMasks with {item} returns a tensor with shape {t.shape}!" + ) + return ROIMasks(t) + + @torch.jit.unused + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.tensor)) + return s + + @torch.jit.unused + def to_bitmasks(self, boxes: torch.Tensor, height, width, threshold=0.5): + """ + Args: see documentation of :func:`paste_masks_in_image`. + """ + from detectron2.layers.mask_ops import paste_masks_in_image, _paste_masks_tensor_shape + + if torch.jit.is_tracing(): + if isinstance(height, torch.Tensor): + paste_func = _paste_masks_tensor_shape + else: + paste_func = paste_masks_in_image + else: + paste_func = retry_if_cuda_oom(paste_masks_in_image) + bitmasks = paste_func(self.tensor, boxes.tensor, (height, width), threshold=threshold) + return BitMasks(bitmasks) diff --git a/mmcv/structures/rotated_boxes.py b/mmcv/structures/rotated_boxes.py new file mode 100644 index 0000000..c842b99 --- /dev/null +++ b/mmcv/structures/rotated_boxes.py @@ -0,0 +1,505 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import math +from typing import List, Tuple +import torch + +from detectron2.layers.rotated_boxes import pairwise_iou_rotated + +from .boxes import Boxes + + +class RotatedBoxes(Boxes): + """ + This structure stores a list of rotated boxes as a Nx5 torch.Tensor. + It supports some common methods about boxes + (`area`, `clip`, `nonempty`, etc), + and also behaves like a Tensor + (support indexing, `to(device)`, `.device`, and iteration over all boxes) + """ + + def __init__(self, tensor: torch.Tensor): + """ + Args: + tensor (Tensor[float]): a Nx5 matrix. Each row is + (x_center, y_center, width, height, angle), + in which angle is represented in degrees. + While there's no strict range restriction for it, + the recommended principal range is between [-180, 180) degrees. + + Assume we have a horizontal box B = (x_center, y_center, width, height), + where width is along the x-axis and height is along the y-axis. + The rotated box B_rot (x_center, y_center, width, height, angle) + can be seen as: + + 1. When angle == 0: + B_rot == B + 2. When angle > 0: + B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW; + 3. When angle < 0: + B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW. 
+ + Mathematically, since the right-handed coordinate system for image space + is (y, x), where y is top->down and x is left->right, the 4 vertices of the + rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from + the vertices of the horizontal rectangle :math:`(y_i, x_i)` (i = 1, 2, 3, 4) + in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians, + :math:`(y_c, x_c)` is the center of the rectangle): + + .. math:: + + yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c, + + xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c, + + which is the standard rigid-body rotation transformation. + + Intuitively, the angle is + (1) the rotation angle from y-axis in image space + to the height vector (top->down in the box's local coordinate system) + of the box in CCW, and + (2) the rotation angle from x-axis in image space + to the width vector (left->right in the box's local coordinate system) + of the box in CCW. + + More intuitively, consider the following horizontal box ABCD represented + in (x1, y1, x2, y2): (3, 2, 7, 4), + covering the [3, 7] x [2, 4] region of the continuous coordinate system + which looks like this: + + .. code:: none + + O--------> x + | + | A---B + | | | + | D---C + | + v y + + Note that each capital letter represents one 0-dimensional geometric point + instead of a 'square pixel' here. + + In the example above, using (x, y) to represent a point we have: + + .. math:: + + O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4) + + We name vector AB = vector DC as the width vector in box's local coordinate system, and + vector AD = vector BC as the height vector in box's local coordinate system. Initially, + when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis + in the image space, respectively. + + For better illustration, we denote the center of the box as E, + + .. code:: none + + O--------> x + | + | A---B + | | E | + | D---C + | + v y + + where the center E = ((3+7)/2, (2+4)/2) = (5, 3). + + Also, + + .. math:: + + width = |AB| = |CD| = 7 - 3 = 4, + height = |AD| = |BC| = 4 - 2 = 2. + + Therefore, the corresponding representation for the same shape in rotated box in + (x_center, y_center, width, height, angle) format is: + + (5, 3, 4, 2, 0), + + Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees + CCW (counter-clockwise) by definition. It looks like this: + + .. code:: none + + O--------> x + | B-C + | | | + | |E| + | | | + | A-D + v y + + The center E is still located at the same point (5, 3), while the vertices + ABCD are rotated by 90 degrees CCW with regard to E: + A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5) + + Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to + vector AD or vector BC (the top->down height vector in box's local coordinate system), + or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right + width vector in box's local coordinate system). + + .. math:: + + width = |AB| = |CD| = 5 - 1 = 4, + height = |AD| = |BC| = 6 - 4 = 2. + + Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise) + by definition? It looks like this: + + .. code:: none + + O--------> x + | D-A + | | | + | |E| + | | | + | C-B + v y + + The center E is still located at the same point (5, 3), while the vertices + ABCD are rotated by 90 degrees CW with regard to E: + A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1) + + .. 
math:: + + width = |AB| = |CD| = 5 - 1 = 4, + height = |AD| = |BC| = 6 - 4 = 2. + + This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU + will be 1. However, these two will generate different RoI Pooling results and + should not be treated as an identical box. + + On the other hand, it's easy to see that (X, Y, W, H, A) is identical to + (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be + identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is + equivalent to rotating the same shape 90 degrees CW. + + We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180): + + .. code:: none + + O--------> x + | + | C---D + | | E | + | B---A + | + v y + + .. math:: + + A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2), + + width = |AB| = |CD| = 7 - 3 = 4, + height = |AD| = |BC| = 4 - 2 = 2. + + Finally, this is a very inaccurate (heavily quantized) illustration of + how (5, 3, 4, 2, 60) looks like in case anyone wonders: + + .. code:: none + + O--------> x + | B\ + | / C + | /E / + | A / + | `D + v y + + It's still a rectangle with center of (5, 3), width of 4 and height of 2, + but its angle (and thus orientation) is somewhere between + (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90). + """ + device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does not depend on + # the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size() + + self.tensor = tensor + + def clone(self) -> "RotatedBoxes": + """ + Clone the RotatedBoxes. + + Returns: + RotatedBoxes + """ + return RotatedBoxes(self.tensor.clone()) + + def to(self, device: torch.device): + # Boxes are assumed float32 and does not support to(dtype) + return RotatedBoxes(self.tensor.to(device=device)) + + def area(self) -> torch.Tensor: + """ + Computes the area of all the boxes. + + Returns: + torch.Tensor: a vector with areas of each box. + """ + box = self.tensor + area = box[:, 2] * box[:, 3] + return area + + # Avoid in-place operations so that we can torchscript; NOTE: this creates a new tensor + def normalize_angles(self) -> None: + """ + Restrict angles to the range of [-180, 180) degrees + """ + angle_tensor = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0 + self.tensor = torch.cat((self.tensor[:, :4], angle_tensor[:, None]), dim=1) + + def clip(self, box_size: Tuple[int, int], clip_angle_threshold: float = 1.0) -> None: + """ + Clip (in place) the boxes by limiting x coordinates to the range [0, width] + and y coordinates to the range [0, height]. + + For RRPN: + Only clip boxes that are almost horizontal with a tolerance of + clip_angle_threshold to maintain backward compatibility. + + Rotated boxes beyond this threshold are not clipped for two reasons: + + 1. There are potentially multiple ways to clip a rotated box to make it + fit within the image. + 2. It's tricky to make the entire rectangular box fit within the image + and still be able to not leave out pixels of interest. + + Therefore we rely on ops like RoIAlignRotated to safely handle this. + + Args: + box_size (height, width): The clipping box's size. + clip_angle_threshold: + Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees), + we do the clipping as horizontal boxes. 
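A small usage sketch of `RotatedBoxes`; note that `rotated_boxes.py` as added here imports `pairwise_iou_rotated` from detectron2 at module level, so the snippet assumes detectron2 is installed even though the methods shown only need torch. The numbers are made up:

```python
import torch
from mmcv.structures.rotated_boxes import RotatedBoxes  # path as in this diff; needs detectron2

# (x_center, y_center, width, height, angle in degrees, CCW-positive)
boxes = RotatedBoxes(torch.tensor([[5.0, 3.0, 4.0, 2.0, 90.0],
                                   [5.0, 3.0, 4.0, 2.0, 270.0]]))

print(boxes.area())        # tensor([8., 8.])  -> w * h, independent of the angle
boxes.normalize_angles()   # map angles into [-180, 180)
print(boxes.tensor[:, 4])  # tensor([ 90., -90.])  -> 270 deg CCW is the same as 90 deg CW
```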
+ """ + h, w = box_size + + # normalize angles to be within (-180, 180] degrees + self.normalize_angles() + + idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0] + + # convert to (x1, y1, x2, y2) + x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0 + y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0 + x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0 + y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0 + + # clip + x1.clamp_(min=0, max=w) + y1.clamp_(min=0, max=h) + x2.clamp_(min=0, max=w) + y2.clamp_(min=0, max=h) + + # convert back to (xc, yc, w, h) + self.tensor[idx, 0] = (x1 + x2) / 2.0 + self.tensor[idx, 1] = (y1 + y2) / 2.0 + # make sure widths and heights do not increase due to numerical errors + self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1) + self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1) + + def nonempty(self, threshold: float = 0.0) -> torch.Tensor: + """ + Find boxes that are non-empty. + A box is considered empty, if either of its side is no larger than threshold. + + Returns: + Tensor: a binary vector which represents + whether each box is empty (False) or non-empty (True). + """ + box = self.tensor + widths = box[:, 2] + heights = box[:, 3] + keep = (widths > threshold) & (heights > threshold) + return keep + + def __getitem__(self, item) -> "RotatedBoxes": + """ + Returns: + RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing. + + The following usage are allowed: + + 1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box. + 2. `new_boxes = boxes[2:10]`: return a slice of boxes. + 3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor + with `length = len(boxes)`. Nonzero elements in the vector will be selected. + + Note that the returned RotatedBoxes might share storage with this RotatedBoxes, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return RotatedBoxes(self.tensor[item].view(1, -1)) + b = self.tensor[item] + assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format( + item + ) + return RotatedBoxes(b) + + def __len__(self) -> int: + return self.tensor.shape[0] + + def __repr__(self) -> str: + return "RotatedBoxes(" + str(self.tensor) + ")" + + def inside_box(self, box_size: Tuple[int, int], boundary_threshold: int = 0) -> torch.Tensor: + """ + Args: + box_size (height, width): Size of the reference box covering + [0, width] x [0, height] + boundary_threshold (int): Boxes that extend beyond the reference box + boundary by more than boundary_threshold are considered "outside". + + For RRPN, it might not be necessary to call this function since it's common + for rotated box to extend to outside of the image boundaries + (the clip function only clips the near-horizontal boxes) + + Returns: + a binary vector, indicating whether each box is inside the reference box. 
+ """ + height, width = box_size + + cnt_x = self.tensor[..., 0] + cnt_y = self.tensor[..., 1] + half_w = self.tensor[..., 2] / 2.0 + half_h = self.tensor[..., 3] / 2.0 + a = self.tensor[..., 4] + c = torch.abs(torch.cos(a * math.pi / 180.0)) + s = torch.abs(torch.sin(a * math.pi / 180.0)) + # This basically computes the horizontal bounding rectangle of the rotated box + max_rect_dx = c * half_w + s * half_h + max_rect_dy = c * half_h + s * half_w + + inds_inside = ( + (cnt_x - max_rect_dx >= -boundary_threshold) + & (cnt_y - max_rect_dy >= -boundary_threshold) + & (cnt_x + max_rect_dx < width + boundary_threshold) + & (cnt_y + max_rect_dy < height + boundary_threshold) + ) + + return inds_inside + + def get_centers(self) -> torch.Tensor: + """ + Returns: + The box centers in a Nx2 array of (x, y). + """ + return self.tensor[:, :2] + + def scale(self, scale_x: float, scale_y: float) -> None: + """ + Scale the rotated box with horizontal and vertical scaling factors + Note: when scale_factor_x != scale_factor_y, + the rotated box does not preserve the rectangular shape when the angle + is not a multiple of 90 degrees under resize transformation. + Instead, the shape is a parallelogram (that has skew) + Here we make an approximation by fitting a rotated rectangle to the parallelogram. + """ + self.tensor[:, 0] *= scale_x + self.tensor[:, 1] *= scale_y + theta = self.tensor[:, 4] * math.pi / 180.0 + c = torch.cos(theta) + s = torch.sin(theta) + + # In image space, y is top->down and x is left->right + # Consider the local coordintate system for the rotated box, + # where the box center is located at (0, 0), and the four vertices ABCD are + # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2) + # the midpoint of the left edge AD of the rotated box E is: + # E = (A+D)/2 = (-w / 2, 0) + # the midpoint of the top edge AB of the rotated box F is: + # F(0, -h / 2) + # To get the old coordinates in the global system, apply the rotation transformation + # (Note: the right-handed coordinate system for image space is yOx): + # (old_x, old_y) = (s * y + c * x, c * y - s * x) + # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2) + # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2) + # After applying the scaling factor (sfx, sfy): + # E(new) = (-sfx * c * w / 2, sfy * s * w / 2) + # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2) + # The new width after scaling tranformation becomes: + + # w(new) = |E(new) - O| * 2 + # = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2 + # = sqrt[(sfx * c)^2 + (sfy * s)^2] * w + # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2] + # + # For example, + # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x; + # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y + self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2) + + # h(new) = |F(new) - O| * 2 + # = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2 + # = sqrt[(sfx * s)^2 + (sfy * c)^2] * h + # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2] + # + # For example, + # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y; + # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x + self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2) + + # The angle is the rotation angle from y-axis in image space to the height + # vector (top->down in the box's local coordinate system) of the box in CCW. 
+ # + # angle(new) = angle_yOx(O - F(new)) + # = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) ) + # = atan2(sfx * s * h / 2, sfy * c * h / 2) + # = atan2(sfx * s, sfy * c) + # + # For example, + # when sfx == sfy, angle(new) == atan2(s, c) == angle(old) + self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi + + @classmethod + def cat(cls, boxes_list: List["RotatedBoxes"]) -> "RotatedBoxes": + """ + Concatenates a list of RotatedBoxes into a single RotatedBoxes + + Arguments: + boxes_list (list[RotatedBoxes]) + + Returns: + RotatedBoxes: the concatenated RotatedBoxes + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all([isinstance(box, RotatedBoxes) for box in boxes_list]) + + # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input + cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0)) + return cat_boxes + + @property + def device(self) -> torch.device: + return self.tensor.device + + @torch.jit.unused + def __iter__(self): + """ + Yield a box as a Tensor of shape (5,) at a time. + """ + yield from self.tensor + + +def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> None: + """ + Given two lists of rotated boxes of size N and M, + compute the IoU (intersection over union) + between **all** N x M pairs of boxes. + The box order must be (x_center, y_center, width, height, angle). + + Args: + boxes1, boxes2 (RotatedBoxes): + two `RotatedBoxes`. Contains N & M rotated boxes, respectively. + + Returns: + Tensor: IoU, sized [N,M]. + """ + + return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor) diff --git a/mmcv/utils/__init__.py b/mmcv/utils/__init__.py new file mode 100644 index 0000000..5992858 --- /dev/null +++ b/mmcv/utils/__init__.py @@ -0,0 +1,29 @@ +# flake8: noqa +# Copyright (c) OpenMMLab. All rights reserved. +from .config import Config, ConfigDict, DictAction +from .misc import (check_prerequisites, concat_list, deprecated_api_warning, + has_method, import_modules_from_strings, is_list_of, + is_method_overridden, is_seq_of, is_str, is_tuple_of, + iter_cast, list_cast, requires_executable, requires_package, + slice_list, to_1tuple, to_2tuple, to_3tuple, to_4tuple, + to_ntuple, tuple_cast) +from .path import (check_file_exist, fopen, is_filepath, mkdir_or_exist, + scandir, symlink) +from .progressbar import (ProgressBar, track_iter_progress, + track_parallel_progress, track_progress) +from .timer import Timer, TimerError, check_time +from .version_utils import digit_version, get_git_hash +import torch +from .logging import get_logger, print_log +from .registry import Registry, build_from_cfg +from .hub import load_url +from .logging import get_logger, print_log +from .logger import get_root_logger +from .collect_env import collect_env +from .runner_utils import * +from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model, TORCH_VERSION +from .checkpoint import load_checkpoint, save_checkpoint +from .log_buffer import LogBuffer +from .priority import Priority, get_priority +from .memory import retry_if_cuda_oom +from .visual import convert_color, save_tensor \ No newline at end of file diff --git a/mmcv/utils/bricks.py b/mmcv/utils/bricks.py new file mode 100644 index 0000000..fd45881 --- /dev/null +++ b/mmcv/utils/bricks.py @@ -0,0 +1,20 @@ +import functools +import time +from collections import defaultdict +import torch +time_maps = defaultdict(lambda :0.) +count_maps = defaultdict(lambda :0.) 
+def run_time(name): + def middle(fn): + def wrapper(*args, **kwargs): + torch.cuda.synchronize() + start = time.time() + res = fn(*args, **kwargs) + torch.cuda.synchronize() + time_maps['%s : %s'%(name, fn.__name__) ] += time.time()-start + count_maps['%s : %s'%(name, fn.__name__) ] +=1 + print("%s : %s takes up %f "% (name, fn.__name__,time_maps['%s : %s'%(name, fn.__name__) ] /count_maps['%s : %s'%(name, fn.__name__) ] )) + return res + return wrapper + return middle + \ No newline at end of file diff --git a/mmcv/utils/checkpoint.py b/mmcv/utils/checkpoint.py new file mode 100644 index 0000000..5b76d0b --- /dev/null +++ b/mmcv/utils/checkpoint.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os +import os.path as osp +import pkgutil +import re +import time +import warnings +from collections import OrderedDict +import torch +from torch.optim import Optimizer +from .logging import print_log +from .runner_utils import get_dist_info +from ..parallel import is_module_wrapper +from mmcv.fileio.file_client import FileClient + +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Default: strip + the prefix 'module.' by [(r'^module\\.', '')]. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + print_log( + f'load checkpoint from path: {filename}', logger) + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + if is_module_wrapper(model): + model = model.module + missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict) + # ignore "num_batches_tracked" of BN layers + err_msg = [] + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + return checkpoint + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. 
+ + Returns: + OrderedDict: Model weights on GPU. + """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + # Keep metadata in state_dict + state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) + return state_dict_cpu + +def save_checkpoint(model, + filename, + optimizer=None, + meta=None, + file_client_args=None): + """Save checkpoint to file. + + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + `New in version 1.3.16.` + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError(f'meta must be a dict or None, but got {type(meta)}') + meta.update(time=time.asctime()) + + if is_module_wrapper(model): + model = model.module + + if hasattr(model, 'CLASSES') and model.CLASSES is not None: + # save class name to the meta + meta.update(CLASSES=model.CLASSES) + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(model.state_dict()), + } + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint['optimizer'] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint['optimizer'] = {} + for name, optim in optimizer.items(): + checkpoint['optimizer'][name] = optim.state_dict() + + file_client = FileClient.infer_client(file_client_args, filename) + with io.BytesIO() as f: + torch.save(checkpoint, f) + file_client.put(f.getvalue(), filename) diff --git a/mmcv/utils/collect_env.py b/mmcv/utils/collect_env.py new file mode 100644 index 0000000..25bea53 --- /dev/null +++ b/mmcv/utils/collect_env.py @@ -0,0 +1,13 @@ +from mmcv.utils import get_git_hash +from mmcv import __version__ + +def collect_env(): + """Collect the information of the running environments.""" + env_info = {} + env_info['MMCV'] = __version__ + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print(f'{name}: {val}') diff --git a/mmcv/utils/config.py b/mmcv/utils/config.py new file mode 100644 index 0000000..3c7fae7 --- /dev/null +++ b/mmcv/utils/config.py @@ -0,0 +1,687 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
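# A hedged usage sketch for the checkpoint helpers added above (hypothetical
# path and model; relies only on the save_checkpoint / load_checkpoint
# signatures shown in this patch):
#
#   import torch.nn as nn
#   from mmcv.utils import load_checkpoint, save_checkpoint
#
#   model = nn.Linear(4, 2)
#   save_checkpoint(model, '/tmp/demo.pth', meta=dict(epoch=1))
#   ckpt = load_checkpoint(model, '/tmp/demo.pth', map_location='cpu')
#   assert 'state_dict' in ckpt and ckpt['meta']['epoch'] == 1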
+import ast +import copy +import os +import os.path as osp +import platform +import shutil +import sys +import tempfile +import uuid +import warnings +from argparse import Action, ArgumentParser +from collections import abc +from importlib import import_module +from mmcv.fileio.io import load, dump + +from addict import Dict +from yapf.yapflib.yapf_api import FormatCode + +from .misc import import_modules_from_strings +from .path import check_file_exist + +if platform.system() == 'Windows': + import regex as re +else: + import re + +BASE_KEY = '_base_' +DELETE_KEY = '_delete_' +DEPRECATION_KEY = '_deprecation_' +RESERVED_KEYS = ['filename', 'text', 'pretty_text'] + + +class ConfigDict(Dict): + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + try: + value = super(ConfigDict, self).__getattr__(name) + except KeyError: + ex = AttributeError(f"'{self.__class__.__name__}' object has no " + f"attribute '{name}'") + except Exception as e: + ex = e + else: + return value + raise ex + + +def add_args(parser, cfg, prefix=''): + for k, v in cfg.items(): + if isinstance(v, str): + parser.add_argument('--' + prefix + k) + elif isinstance(v, int): + parser.add_argument('--' + prefix + k, type=int) + elif isinstance(v, float): + parser.add_argument('--' + prefix + k, type=float) + elif isinstance(v, bool): + parser.add_argument('--' + prefix + k, action='store_true') + elif isinstance(v, dict): + add_args(parser, v, prefix + k + '.') + elif isinstance(v, abc.Iterable): + parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') + else: + print(f'cannot parse key {prefix + k} of type {type(v)}') + return parser + + +class Config: + """A facility for config and config files. + + It supports common file formats as configs: python/json/yaml. The interface + is the same as a dict object and also allows access config values as + attributes. 
+ + Example: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/kchen/projects/mmcv/tests/data/config/a.py" + >>> cfg.item4 + 'test' + >>> cfg + "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" + """ + + @staticmethod + def _validate_py_syntax(filename): + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + content = f.read() + try: + ast.parse(content) + except SyntaxError as e: + raise SyntaxError('There are syntax errors in config ' + f'file {filename}: {e}') + + @staticmethod + def _substitute_predefined_vars(filename, temp_config_name): + file_dirname = osp.dirname(filename) + file_basename = osp.basename(filename) + file_basename_no_extension = osp.splitext(file_basename)[0] + file_extname = osp.splitext(filename)[1] + support_templates = dict( + fileDirname=file_dirname, + fileBasename=file_basename, + fileBasenameNoExtension=file_basename_no_extension, + fileExtname=file_extname) + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + for key, value in support_templates.items(): + regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' + value = value.replace('\\', '/') + config_file = re.sub(regexp, value, config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + + @staticmethod + def _pre_substitute_base_vars(filename, temp_config_name): + """Substitute base variable placehoders to string, so that parsing + would work.""" + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + base_var_dict = {} + regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}' + base_vars = set(re.findall(regexp, config_file)) + for base_var in base_vars: + randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' + base_var_dict[randstr] = base_var + regexp = r'\{\{\s*' + BASE_KEY + r'\.' 
+ base_var + r'\s*\}\}' + config_file = re.sub(regexp, f'"{randstr}"', config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + return base_var_dict + + @staticmethod + def _substitute_base_vars(cfg, base_var_dict, base_cfg): + """Substitute variable strings to their actual values.""" + cfg = copy.deepcopy(cfg) + + if isinstance(cfg, dict): + for k, v in cfg.items(): + if isinstance(v, str) and v in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[v].split('.'): + new_v = new_v[new_k] + cfg[k] = new_v + elif isinstance(v, (list, tuple, dict)): + cfg[k] = Config._substitute_base_vars( + v, base_var_dict, base_cfg) + elif isinstance(cfg, tuple): + cfg = tuple( + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg) + elif isinstance(cfg, list): + cfg = [ + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg + ] + elif isinstance(cfg, str) and cfg in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[cfg].split('.'): + new_v = new_v[new_k] + cfg = new_v + + return cfg + + @staticmethod + def _file2dict(filename, use_predefined_variables=True): + filename = osp.abspath(osp.expanduser(filename)) + check_file_exist(filename) + fileExtname = osp.splitext(filename)[1] + if fileExtname not in ['.py', '.json', '.yaml', '.yml']: + raise IOError('Only py/yml/yaml/json type are supported now!') + + with tempfile.TemporaryDirectory() as temp_config_dir: + temp_config_file = tempfile.NamedTemporaryFile( + dir=temp_config_dir, suffix=fileExtname) + if platform.system() == 'Windows': + temp_config_file.close() + temp_config_name = osp.basename(temp_config_file.name) + # Substitute predefined variables + if use_predefined_variables: + Config._substitute_predefined_vars(filename, + temp_config_file.name) + else: + shutil.copyfile(filename, temp_config_file.name) + # Substitute base variables from placeholders to strings + base_var_dict = Config._pre_substitute_base_vars( + temp_config_file.name, temp_config_file.name) + + if filename.endswith('.py'): + temp_module_name = osp.splitext(temp_config_name)[0] + sys.path.insert(0, temp_config_dir) + Config._validate_py_syntax(filename) + mod = import_module(temp_module_name) + sys.path.pop(0) + cfg_dict = { + name: value + for name, value in mod.__dict__.items() + if not name.startswith('__') + } + # delete imported module + del sys.modules[temp_module_name] + elif filename.endswith(('.yml', '.yaml', '.json')): + cfg_dict = load(temp_config_file.name) + # close temp file + temp_config_file.close() + + # check deprecation information + if DEPRECATION_KEY in cfg_dict: + deprecation_info = cfg_dict.pop(DEPRECATION_KEY) + warning_msg = f'The config file {filename} will be deprecated ' \ + 'in the future.' + if 'expected' in deprecation_info: + warning_msg += f' Please use {deprecation_info["expected"]} ' \ + 'instead.' 
+ if 'reference' in deprecation_info: + warning_msg += ' More information can be found at ' \ + f'{deprecation_info["reference"]}' + warnings.warn(warning_msg) + + cfg_text = filename + '\n' + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + cfg_text += f.read() + + if BASE_KEY in cfg_dict: + cfg_dir = osp.dirname(filename) + base_filename = cfg_dict.pop(BASE_KEY) + base_filename = base_filename if isinstance( + base_filename, list) else [base_filename] + + cfg_dict_list = list() + cfg_text_list = list() + for f in base_filename: + _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) + cfg_dict_list.append(_cfg_dict) + cfg_text_list.append(_cfg_text) + + base_cfg_dict = dict() + for c in cfg_dict_list: + duplicate_keys = base_cfg_dict.keys() & c.keys() + if len(duplicate_keys) > 0: + raise KeyError('Duplicate key is not allowed among bases. ' + f'Duplicate keys: {duplicate_keys}') + base_cfg_dict.update(c) + + # Substitute base variables from strings to their actual values + cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict, + base_cfg_dict) + + base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict) + cfg_dict = base_cfg_dict + + # merge cfg_text + cfg_text_list.append(cfg_text) + cfg_text = '\n'.join(cfg_text_list) + + return cfg_dict, cfg_text + + @staticmethod + def _merge_a_into_b(a, b, allow_list_keys=False): + """merge dict ``a`` into dict ``b`` (non-inplace). + + Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid + in-place modifications. + + Args: + a (dict): The source dict to be merged into ``b``. + b (dict): The origin dict to be fetch keys from ``a``. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in source ``a`` and will replace the element of the + corresponding index in b if b is a list. Default: False. + + Returns: + dict: The modified dict of ``b`` using ``a``. + + Examples: + # Normally merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # Delete b first and merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # b is a list + >>> Config._merge_a_into_b( + ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) + [{'a': 2}, {'b': 2}] + """ + b = b.copy() + for k, v in a.items(): + if allow_list_keys and k.isdigit() and isinstance(b, list): + k = int(k) + if len(b) <= k: + raise KeyError(f'Index {k} exceeds the length of list {b}') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + elif isinstance(v, + dict) and k in b and not v.pop(DELETE_KEY, False): + allowed_types = (dict, list) if allow_list_keys else dict + if not isinstance(b[k], allowed_types): + raise TypeError( + f'{k}={v} in child config cannot inherit from base ' + f'because {k} is a dict in the child config but is of ' + f'type {type(b[k])} in base config. 
You may set ' + f'`{DELETE_KEY}=True` to ignore the base config') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + else: + b[k] = v + return b + + @staticmethod + def fromfile(filename, + use_predefined_variables=True, + import_custom_modules=True): + cfg_dict, cfg_text = Config._file2dict(filename, + use_predefined_variables) + if import_custom_modules and cfg_dict.get('custom_imports', None): + import_modules_from_strings(**cfg_dict['custom_imports']) + return Config(cfg_dict, cfg_text=cfg_text, filename=filename) + + @staticmethod + def fromstring(cfg_str, file_format): + """Generate config from config str. + + Args: + cfg_str (str): Config str. + file_format (str): Config file format corresponding to the + config str. Only py/yml/yaml/json type are supported now! + + Returns: + obj:`Config`: Config obj. + """ + if file_format not in ['.py', '.json', '.yaml', '.yml']: + raise IOError('Only py/yml/yaml/json type are supported now!') + if file_format != '.py' and 'dict(' in cfg_str: + # check if users specify a wrong suffix for python + warnings.warn( + 'Please check "file_format", the file format may be .py') + with tempfile.NamedTemporaryFile( + 'w', encoding='utf-8', suffix=file_format, + delete=False) as temp_file: + temp_file.write(cfg_str) + # on windows, previous implementation cause error + # see PR 1077 for details + cfg = Config.fromfile(temp_file.name) + os.remove(temp_file.name) + return cfg + + @staticmethod + def auto_argparser(description=None): + """Generate argparser from config file automatically (experimental)""" + partial_parser = ArgumentParser(description=description) + partial_parser.add_argument('config', help='config file path') + cfg_file = partial_parser.parse_known_args()[0].config + cfg = Config.fromfile(cfg_file) + parser = ArgumentParser(description=description) + parser.add_argument('config', help='config file path') + add_args(parser, cfg) + return parser, cfg + + def __init__(self, cfg_dict=None, cfg_text=None, filename=None): + if cfg_dict is None: + cfg_dict = dict() + elif not isinstance(cfg_dict, dict): + raise TypeError('cfg_dict must be a dict, but ' + f'got {type(cfg_dict)}') + for key in cfg_dict: + if key in RESERVED_KEYS: + raise KeyError(f'{key} is reserved for config file') + + super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) + super(Config, self).__setattr__('_filename', filename) + if cfg_text: + text = cfg_text + elif filename: + with open(filename, 'r') as f: + text = f.read() + else: + text = '' + super(Config, self).__setattr__('_text', text) + + @property + def filename(self): + return self._filename + + @property + def text(self): + return self._text + + @property + def pretty_text(self): + + indent = 4 + + def _indent(s_, num_spaces): + s = s_.split('\n') + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(num_spaces * ' ') + line for line in s] + s = '\n'.join(s) + s = first + '\n' + s + return s + + def _format_basic_types(k, v, use_mapping=False): + if isinstance(v, str): + v_str = f"'{v}'" + else: + v_str = str(v) + + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + + return attr_str + + def _format_list(k, v, use_mapping=False): + # check if all items in the list are dict + if all(isinstance(_, dict) for _ in v): + v_str = '[\n' + v_str += '\n'.join( + f'dict({_indent(_format_dict(v_), indent)}),' + for v_ in v).rstrip(',') + if use_mapping: + k_str = f"'{k}'" if 
isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + ']' + else: + attr_str = _format_basic_types(k, v, use_mapping) + return attr_str + + def _contain_invalid_identifier(dict_str): + contain_invalid_identifier = False + for key_name in dict_str: + contain_invalid_identifier |= \ + (not str(key_name).isidentifier()) + return contain_invalid_identifier + + def _format_dict(input_dict, outest_level=False): + r = '' + s = [] + + use_mapping = _contain_invalid_identifier(input_dict) + if use_mapping: + r += '{' + for idx, (k, v) in enumerate(input_dict.items()): + is_last = idx >= len(input_dict) - 1 + end = '' if outest_level or is_last else ',' + if isinstance(v, dict): + v_str = '\n' + _format_dict(v) + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: dict({v_str}' + else: + attr_str = f'{str(k)}=dict({v_str}' + attr_str = _indent(attr_str, indent) + ')' + end + elif isinstance(v, list): + attr_str = _format_list(k, v, use_mapping) + end + else: + attr_str = _format_basic_types(k, v, use_mapping) + end + + s.append(attr_str) + r += '\n'.join(s) + if use_mapping: + r += '}' + return r + + cfg_dict = self._cfg_dict.to_dict() + text = _format_dict(cfg_dict, outest_level=True) + # copied from setup.cfg + yapf_style = dict( + based_on_style='pep8', + blank_line_before_nested_class_or_def=True, + split_before_expression_after_opening_paren=True) + text, _ = FormatCode(text, style_config=yapf_style, verify=True) + + return text + + def __repr__(self): + return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' + + def __len__(self): + return len(self._cfg_dict) + + def __getattr__(self, name): + return getattr(self._cfg_dict, name) + + def __getitem__(self, name): + return self._cfg_dict.__getitem__(name) + + def __setattr__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setattr__(name, value) + + def __setitem__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setitem__(name, value) + + def __iter__(self): + return iter(self._cfg_dict) + + def __getstate__(self): + return (self._cfg_dict, self._filename, self._text) + + def __setstate__(self, state): + _cfg_dict, _filename, _text = state + super(Config, self).__setattr__('_cfg_dict', _cfg_dict) + super(Config, self).__setattr__('_filename', _filename) + super(Config, self).__setattr__('_text', _text) + + def dump(self, file=None): + cfg_dict = super(Config, self).__getattribute__('_cfg_dict').to_dict() + if self.filename.endswith('.py'): + if file is None: + return self.pretty_text + else: + with open(file, 'w', encoding='utf-8') as f: + f.write(self.pretty_text) + else: + if file is None: + file_format = self.filename.split('.')[-1] + return dump(cfg_dict, file_format=file_format) + else: + dump(cfg_dict, file) + + def merge_from_dict(self, options, allow_list_keys=True): + """Merge list into cfg_dict. + + Merge the dict parsed by MultipleKVAction into this cfg. + + Examples: + >>> options = {'model.backbone.depth': 50, + ... 'model.backbone.with_cp':True} + >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet')))) + >>> cfg.merge_from_dict(options) + >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + >>> assert cfg_dict == dict( + ... model=dict(backbone=dict(depth=50, with_cp=True))) + + # Merge list element + >>> cfg = Config(dict(pipeline=[ + ... 
dict(type='LoadImage'), dict(type='LoadAnnotations')])) + >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')}) + >>> cfg.merge_from_dict(options, allow_list_keys=True) + >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + >>> assert cfg_dict == dict(pipeline=[ + ... dict(type='SelfLoadImage'), dict(type='LoadAnnotations')]) + + Args: + options (dict): dict of configs to merge from. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in ``options`` and will replace the element of the + corresponding index in the config if the config is a list. + Default: True. + """ + option_cfg_dict = {} + for full_key, v in options.items(): + d = option_cfg_dict + key_list = full_key.split('.') + for subkey in key_list[:-1]: + d.setdefault(subkey, ConfigDict()) + d = d[subkey] + subkey = key_list[-1] + d[subkey] = v + + cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + super(Config, self).__setattr__( + '_cfg_dict', + Config._merge_a_into_b( + option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) + + +class DictAction(Action): + """ + argparse action to split an argument into KEY=VALUE form + on the first = and append to a dictionary. List options can + be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit + brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build + list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' + """ + + @staticmethod + def _parse_int_float_bool(val): + try: + return int(val) + except ValueError: + pass + try: + return float(val) + except ValueError: + pass + if val.lower() in ['true', 'false']: + return True if val.lower() == 'true' else False + return val + + @staticmethod + def _parse_iterable(val): + """Parse iterable values in the string. + + All elements inside '()' or '[]' are treated as iterable values. + + Args: + val (str): Value string. + + Returns: + list | tuple: The expanded list or tuple from the string. + + Examples: + >>> DictAction._parse_iterable('1,2,3') + [1, 2, 3] + >>> DictAction._parse_iterable('[a, b, c]') + ['a', 'b', 'c'] + >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') + [(1, 2, 3), ['a', 'b'], 'c'] + """ + + def find_next_comma(string): + """Find the position of next comma in the string. + + If no ',' is found in the string, return the string length. All + chars inside '()' and '[]' are treated as one element and thus ',' + inside these brackets are ignored. + """ + assert (string.count('(') == string.count(')')) and ( + string.count('[') == string.count(']')), \ + f'Imbalanced brackets exist in {string}' + end = len(string) + for idx, char in enumerate(string): + pre = string[:idx] + # The string before this ',' is balanced + if ((char == ',') and (pre.count('(') == pre.count(')')) + and (pre.count('[') == pre.count(']'))): + end = idx + break + return end + + # Strip ' and " characters and replace whitespace. 
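# find_next_comma above only treats a ',' as a separator once every '(' and
# '[' seen so far has been closed; e.g. in "(1,2),[3,4]" the first top-level
# comma is the one at index 5. The loop below uses that index to peel off one
# balanced element at a time and recurse into it.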
+ val = val.strip('\'\"').replace(' ', '') + is_tuple = False + if val.startswith('(') and val.endswith(')'): + is_tuple = True + val = val[1:-1] + elif val.startswith('[') and val.endswith(']'): + val = val[1:-1] + elif ',' not in val: + # val is a single value + return DictAction._parse_int_float_bool(val) + + values = [] + while len(val) > 0: + comma_idx = find_next_comma(val) + element = DictAction._parse_iterable(val[:comma_idx]) + values.append(element) + val = val[comma_idx + 1:] + if is_tuple: + values = tuple(values) + return values + + def __call__(self, parser, namespace, values, option_string=None): + options = {} + for kv in values: + key, val = kv.split('=', maxsplit=1) + options[key] = self._parse_iterable(val) + setattr(namespace, self.dest, options) diff --git a/mmcv/utils/contextmanagers.py b/mmcv/utils/contextmanagers.py new file mode 100644 index 0000000..38a6392 --- /dev/null +++ b/mmcv/utils/contextmanagers.py @@ -0,0 +1,121 @@ +import asyncio +import contextlib +import logging +import os +import time +from typing import List + +import torch + +logger = logging.getLogger(__name__) + +DEBUG_COMPLETED_TIME = bool(os.environ.get('DEBUG_COMPLETED_TIME', False)) + + +@contextlib.asynccontextmanager +async def completed(trace_name='', + name='', + sleep_interval=0.05, + streams: List[torch.cuda.Stream] = None): + """Async context manager that waits for work to complete on given CUDA + streams.""" + if not torch.cuda.is_available(): + yield + return + + stream_before_context_switch = torch.cuda.current_stream() + if not streams: + streams = [stream_before_context_switch] + else: + streams = [s if s else stream_before_context_switch for s in streams] + + end_events = [ + torch.cuda.Event(enable_timing=DEBUG_COMPLETED_TIME) for _ in streams + ] + + if DEBUG_COMPLETED_TIME: + start = torch.cuda.Event(enable_timing=True) + stream_before_context_switch.record_event(start) + + cpu_start = time.monotonic() + logger.debug('%s %s starting, streams: %s', trace_name, name, streams) + grad_enabled_before = torch.is_grad_enabled() + try: + yield + finally: + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_end = time.monotonic() + for i, stream in enumerate(streams): + event = end_events[i] + stream.record_event(event) + + grad_enabled_after = torch.is_grad_enabled() + + # observed change of torch.is_grad_enabled() during concurrent run of + # async_test_bboxes code + assert (grad_enabled_before == grad_enabled_after + ), 'Unexpected is_grad_enabled() value change' + + are_done = [e.query() for e in end_events] + logger.debug('%s %s completed: %s streams: %s', trace_name, name, + are_done, streams) + with torch.cuda.stream(stream_before_context_switch): + while not all(are_done): + await asyncio.sleep(sleep_interval) + are_done = [e.query() for e in end_events] + logger.debug( + '%s %s completed: %s streams: %s', + trace_name, + name, + are_done, + streams, + ) + + current_stream = torch.cuda.current_stream() + assert current_stream == stream_before_context_switch + + if DEBUG_COMPLETED_TIME: + cpu_time = (cpu_end - cpu_start) * 1000 + stream_times_ms = '' + for i, stream in enumerate(streams): + elapsed_time = start.elapsed_time(end_events[i]) + stream_times_ms += f' {stream} {elapsed_time:.2f} ms' + logger.info('%s %s %.2f ms %s', trace_name, name, cpu_time, + stream_times_ms) + + +@contextlib.asynccontextmanager +async def concurrent(streamqueue: asyncio.Queue, + trace_name='concurrent', + 
name='stream'): + """Run code concurrently in different streams. + + :param streamqueue: asyncio.Queue instance. + + Queue tasks define the pool of streams used for concurrent execution. + """ + if not torch.cuda.is_available(): + yield + return + + initial_stream = torch.cuda.current_stream() + + with torch.cuda.stream(initial_stream): + stream = await streamqueue.get() + assert isinstance(stream, torch.cuda.Stream) + + try: + with torch.cuda.stream(stream): + logger.debug('%s %s is starting, stream: %s', trace_name, name, + stream) + yield + current = torch.cuda.current_stream() + assert current == stream + logger.debug('%s %s has finished, stream: %s', trace_name, + name, stream) + finally: + streamqueue.task_done() + streamqueue.put_nowait(stream) diff --git a/mmcv/utils/ext_loader.py b/mmcv/utils/ext_loader.py new file mode 100644 index 0000000..971d3d9 --- /dev/null +++ b/mmcv/utils/ext_loader.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import importlib +import os +import pkgutil +import warnings +from collections import namedtuple + +import torch + +def load_ext(name, funcs): + ext = importlib.import_module('mmcv.' + name) + for fun in funcs: + assert hasattr(ext, fun), f'{fun} miss in module {name}' + return ext + +def check_ops_exist(): + ext_loader = pkgutil.find_loader('mmcv._ext') + return ext_loader is not None diff --git a/mmcv/utils/fp16_utils.py b/mmcv/utils/fp16_utils.py new file mode 100644 index 0000000..5c620ff --- /dev/null +++ b/mmcv/utils/fp16_utils.py @@ -0,0 +1,407 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings +from collections import abc +from inspect import getfullargspec + +import numpy as np +import torch +import torch.nn as nn +TORCH_VERSION = torch.__version__ +from .version_utils import digit_version +from .runner_utils import allreduce_grads as _allreduce_grads + +try: + # If PyTorch version >= 1.6.0, torch.cuda.amp.autocast would be imported + # and used; otherwise, auto fp16 will adopt mmcv's implementation. + # Note that when PyTorch >= 1.6.0, we still cast tensor types to fp16 + # manually, so the behavior may not be consistent with real amp. + from torch.cuda.amp import autocast +except ImportError: + pass + + +def cast_tensor_type(inputs, src_type, dst_type): + """Recursively convert Tensor in inputs from src_type to dst_type. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype): Source type.. + dst_type (torch.dtype): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. + """ + if isinstance(inputs, nn.Module): + return inputs + elif isinstance(inputs, torch.Tensor): + return inputs.to(dst_type) + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type, dst_type) for item in inputs) + else: + return inputs + + +def auto_fp16(apply_to=None, out_fp32=False): + """Decorator to enable fp16 training automatically. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If inputs arguments are fp32 tensors, they will + be converted to fp16 automatically. Arguments other than fp32 tensors are + ignored. 
If you are using PyTorch >= 1.6, torch.cuda.amp is used as the + backend, otherwise, original mmcv implementation will be adopted. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp32 (bool): Whether to convert the output back to fp32. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp16 + >>> @auto_fp16() + >>> def forward(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp16 + >>> @auto_fp16(apply_to=('pred', )) + >>> def do_something(self, pred, others): + >>> pass + """ + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + if (digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + with autocast(enabled=True): + output = old_func(*new_args, **new_kwargs) + else: + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. If you are using PyTorch >= 1.6, + torch.cuda.amp is used as the backend, otherwise, original mmcv + implementation will be adopted. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. 
+ + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp32 + >>> @force_fp32() + >>> def loss(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp32 + >>> @force_fp32(apply_to=('pred', )) + >>> def post_process(self, pred, others): + >>> pass + """ + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@force_fp32 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.half, torch.float)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = dict() + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.half, torch.float) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + if (digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + with autocast(enabled=False): + output = old_func(*new_args, **new_kwargs) + else: + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp16: + output = cast_tensor_type(output, torch.float, torch.half) + return output + + return new_func + + return force_fp32_wrapper + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + warnings.warning( + '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be ' + 'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads') + _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb) + + +def wrap_fp16_model(model): + """Wrap the FP32 model to FP16. + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the + backend, otherwise, original mmcv implementation will be adopted. + + For PyTorch >= 1.6, this function will + 1. Set fp16 flag inside the model to True. + + Otherwise: + 1. Convert FP32 model to FP16. + 2. Remain some necessary layers to be FP32, e.g., normalization layers. + 3. Set `fp16_enabled` flag inside the model to True. + + Args: + model (nn.Module): Model in FP32. + """ + if (digit_version(TORCH_VERSION) < digit_version('1.6.0')): + # convert model to fp16 + model.half() + # patch the normalization layers to make it work in fp32 mode + patch_norm_fp32(model) + # set `fp16_enabled` flag + for m in model.modules(): + if hasattr(m, 'fp16_enabled'): + m.fp16_enabled = True + + +def patch_norm_fp32(module): + """Recursively convert normalization layers from FP16 to FP32. + + Args: + module (nn.Module): The modules to be converted in FP16. + + Returns: + nn.Module: The converted module, the normalization layers have been + converted to FP32. 
+ """ + if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)): + module.float() + if isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3': + module.forward = patch_forward_method(module.forward, torch.half, + torch.float) + for child in module.children(): + patch_norm_fp32(child) + return module + + +def patch_forward_method(func, src_type, dst_type, convert_output=True): + """Patch the forward method of a module. + + Args: + func (callable): The original forward method. + src_type (torch.dtype): Type of input arguments to be converted from. + dst_type (torch.dtype): Type of input arguments to be converted to. + convert_output (bool): Whether to convert the output back to src_type. + + Returns: + callable: The patched forward method. + """ + + def new_forward(*args, **kwargs): + output = func(*cast_tensor_type(args, src_type, dst_type), + **cast_tensor_type(kwargs, src_type, dst_type)) + if convert_output: + output = cast_tensor_type(output, dst_type, src_type) + return output + + return new_forward + + +class LossScaler: + """Class that manages loss scaling in mixed precision training which + supports both dynamic or static mode. + + The implementation refers to + https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py. + Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling. + It's important to understand how :class:`LossScaler` operates. + Loss scaling is designed to combat the problem of underflowing + gradients encountered at long times when training fp16 networks. + Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. + If overflowing gradients are encountered, :class:`FP16_Optimizer` then + skips the update step for this particular iteration/minibatch, + and :class:`LossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients + detected,:class:`LossScaler` increases the loss scale once more. + In this way :class:`LossScaler` attempts to "ride the edge" of always + using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float): Initial loss scale value, default: 2**32. + scale_factor (float): Factor used when adjusting the loss scale. + Default: 2. + mode (str): Loss scaling mode. 'dynamic' or 'static' + scale_window (int): Number of consecutive iterations without an + overflow to wait before increasing the loss scale. Default: 1000. 
+ """ + + def __init__(self, + init_scale=2**32, + mode='dynamic', + scale_factor=2., + scale_window=1000): + self.cur_scale = init_scale + self.cur_iter = 0 + assert mode in ('dynamic', + 'static'), 'mode can only be dynamic or static' + self.mode = mode + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + + def has_overflow(self, params): + """Check if params contain overflow.""" + if self.mode != 'dynamic': + return False + for p in params: + if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data): + return True + return False + + def _has_inf_or_nan(x): + """Check if params contain NaN.""" + try: + cpu_sum = float(x.float().sum()) + except RuntimeError as instance: + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') \ + or cpu_sum != cpu_sum: + return True + return False + + def update_scale(self, overflow): + """update the current loss scale value when overflow happens.""" + if self.mode != 'dynamic': + return + if overflow: + self.cur_scale = max(self.cur_scale / self.scale_factor, 1) + self.last_overflow_iter = self.cur_iter + else: + if (self.cur_iter - self.last_overflow_iter) % \ + self.scale_window == 0: + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + def state_dict(self): + """Returns the state of the scaler as a :class:`dict`.""" + return dict( + cur_scale=self.cur_scale, + cur_iter=self.cur_iter, + mode=self.mode, + last_overflow_iter=self.last_overflow_iter, + scale_factor=self.scale_factor, + scale_window=self.scale_window) + + def load_state_dict(self, state_dict): + """Loads the loss_scaler state dict. + + Args: + state_dict (dict): scaler state. + """ + self.cur_scale = state_dict['cur_scale'] + self.cur_iter = state_dict['cur_iter'] + self.mode = state_dict['mode'] + self.last_overflow_iter = state_dict['last_overflow_iter'] + self.scale_factor = state_dict['scale_factor'] + self.scale_window = state_dict['scale_window'] + + @property + def loss_scale(self): + return self.cur_scale diff --git a/mmcv/utils/grid_mask.py b/mmcv/utils/grid_mask.py new file mode 100755 index 0000000..3d04b2c --- /dev/null +++ b/mmcv/utils/grid_mask.py @@ -0,0 +1,124 @@ +import torch +import torch.nn as nn +import numpy as np +from PIL import Image +from mmcv.runner import force_fp32, auto_fp16 + +class Grid(object): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode=mode + self.st_prob = prob + self.prob = prob + + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch + + def __call__(self, img, label): + if np.random.rand() > self.prob: + return img, label + h = img.size(1) + w = img.size(2) + self.d1 = 2 + self.d2 = min(h, w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(self.d1, self.d2) + if self.ratio == 1: + self.l = np.random.randint(1, d) + else: + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = 
np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).float() + if self.mode == 1: + mask = 1-mask + + mask = mask.expand_as(img) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() + offset = (1 - mask) * offset + img = img * mask + offset + else: + img = img * mask + + return img, label + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.fp16_enable = False + def set_prob(self, epoch, max_epoch): + self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 + @auto_fp16() + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n,c,h,w = x.size() + x = x.view(-1,h,w) + hh = int(1.5*h) + ww = int(1.5*w) + d = np.random.randint(2, h) + self.l = min(max(int(d*self.ratio+0.5),1),d-1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh//d): + s = d*i + st_h + t = min(s+self.l, hh) + mask[s:t,:] *= 0 + if self.use_w: + for i in range(ww//d): + s = d*i + st_w + t = min(s+self.l, ww) + mask[:,s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] + + mask = torch.from_numpy(mask).to(x.dtype).cuda() + if self.mode == 1: + mask = 1-mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).to(x.dtype).cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n,c,h,w) \ No newline at end of file diff --git a/mmcv/utils/hub.py b/mmcv/utils/hub.py new file mode 100644 index 0000000..a2505c0 --- /dev/null +++ b/mmcv/utils/hub.py @@ -0,0 +1,128 @@ +# The 1.6 release of PyTorch switched torch.save to use a new zipfile-based +# file format. It will cause RuntimeError when a checkpoint was saved in +# torch >= 1.6.0 but loaded in torch < 1.7.0. +# More details at https://github.com/open-mmlab/mmpose/issues/904 +from .path import mkdir_or_exist +from .version_utils import digit_version +import torch + +TORCH_VERSION = torch.__version__ + +if digit_version(TORCH_VERSION) < digit_version('1.7.0'): + # Modified from https://github.com/pytorch/pytorch/blob/master/torch/hub.py + import os + import torch + import warnings + from urllib.parse import urlparse + import sys + import zipfile + from torch.hub import download_url_to_file, _get_torch_home, HASH_REGEX + + # Hub used to support automatically extracts from zipfile manually + # compressed by users. The legacy zip format expects only one file from + # torch.save() < 1.6 in the zip. We should remove this support since + # zipfile is now default zipfile format for torch.save(). + def _is_legacy_zip_format(filename): + if zipfile.is_zipfile(filename): + infolist = zipfile.ZipFile(filename).infolist() + return len(infolist) == 1 and not infolist[0].is_dir() + return False + + def _legacy_zip_load(filename, model_dir, map_location): + warnings.warn('Falling back to the old format < 1.6. This support will' + ' be deprecated in favor of default zipfile format ' + 'introduced in 1.6. 
Please redo torch.save() to save it ' + 'in the new zipfile format.') + # Note: extractall() defaults to overwrite file if exists. No need to + # clean up beforehand. We deliberately don't handle tarfile here + # since our legacy serialization format was in tar. + # E.g. resnet18-5c106cde.pth which is widely used. + with zipfile.ZipFile(filename) as f: + members = f.infolist() + if len(members) != 1: + raise RuntimeError( + 'Only one file(not dir) is allowed in the zipfile') + f.extractall(model_dir) + extraced_name = members[0].filename + extracted_file = os.path.join(model_dir, extraced_name) + return torch.load(extracted_file, map_location=map_location) + + def load_url(url, + model_dir=None, + map_location=None, + progress=True, + check_hash=False, + file_name=None): + r"""Loads the Torch serialized object at the given URL. + + If downloaded file is a zip file, it will be automatically decompressed + + If the object is already present in `model_dir`, it's deserialized and + returned. + The default value of ``model_dir`` is ``/checkpoints`` where + ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`. + + Args: + url (str): URL of the object to download + model_dir (str, optional): directory in which to save the object + map_location (optional): a function or a dict specifying how to + remap storage locations (see torch.load) + progress (bool, optional): whether or not to display a progress bar + to stderr. Default: True + check_hash(bool, optional): If True, the filename part of the URL + should follow the naming convention ``filename-.ext`` + where ```` is the first eight or more digits of the + SHA256 hash of the contents of the file. The hash is used to + ensure unique names and to verify the contents of the file. + Default: False + file_name (str, optional): name for the downloaded file. Filename + from ``url`` will be used if not set. Default: None. + + Example: + >>> url = ('https://s3.amazonaws.com/pytorch/models/resnet18-5c106' + ... 'cde.pth') + >>> state_dict = torch.hub.load_state_dict_from_url(url) + """ + # Issue warning to move data if old env is set + if os.getenv('TORCH_MODEL_ZOO'): + warnings.warn('TORCH_MODEL_ZOO is deprecated, please use env ' + 'TORCH_HOME instead') + + if model_dir is None: + torch_home = _get_torch_home() + model_dir = os.path.join(torch_home, 'checkpoints') + + mkdir_or_exist(model_dir) + + parts = urlparse(url) + filename = os.path.basename(parts.path) + if file_name is not None: + filename = file_name + cached_file = os.path.join(model_dir, filename) + if not os.path.exists(cached_file): + sys.stderr.write('Downloading: "{}" to {}\n'.format( + url, cached_file)) + hash_prefix = None + if check_hash: + r = HASH_REGEX.search(filename) # r is Optional[Match[str]] + hash_prefix = r.group(1) if r else None + download_url_to_file( + url, cached_file, hash_prefix, progress=progress) + + if _is_legacy_zip_format(cached_file): + return _legacy_zip_load(cached_file, model_dir, map_location) + + try: + return torch.load(cached_file, map_location=map_location) + except RuntimeError as error: + if digit_version(TORCH_VERSION) < digit_version('1.5.0'): + warnings.warn( + f'If the error is the same as "{cached_file} is a zip ' + 'archive (did you mean to use torch.jit.load()?)", you can' + ' upgrade your torch to 1.5.0 or higher (current torch ' + f'version is {TORCH_VERSION}). 
The error was raised ' + ' because the checkpoint was saved in torch>=1.6.0 but ' + 'loaded in torch<1.5.') + raise error +else: + from torch.utils.model_zoo import load_url # noqa: F401 diff --git a/mmcv/utils/log_buffer.py b/mmcv/utils/log_buffer.py new file mode 100644 index 0000000..d949e29 --- /dev/null +++ b/mmcv/utils/log_buffer.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict + +import numpy as np + + +class LogBuffer: + + def __init__(self): + self.val_history = OrderedDict() + self.n_history = OrderedDict() + self.output = OrderedDict() + self.ready = False + + def clear(self): + self.val_history.clear() + self.n_history.clear() + self.clear_output() + + def clear_output(self): + self.output.clear() + self.ready = False + + def update(self, vars, count=1): + assert isinstance(vars, dict) + for key, var in vars.items(): + if key not in self.val_history: + self.val_history[key] = [] + self.n_history[key] = [] + self.val_history[key].append(var) + self.n_history[key].append(count) + + def average(self, n=0): + """Average latest n values or all values.""" + assert n >= 0 + for key in self.val_history: + values = np.array(self.val_history[key][-n:]) + nums = np.array(self.n_history[key][-n:]) + avg = np.sum(values * nums) / np.sum(nums) + self.output[key] = avg + self.ready = True diff --git a/mmcv/utils/logger.py b/mmcv/utils/logger.py new file mode 100644 index 0000000..ed1de21 --- /dev/null +++ b/mmcv/utils/logger.py @@ -0,0 +1,21 @@ +import logging +from .logging import get_logger +from mmcv import __version__ + +def get_root_logger(log_file=None, log_level=logging.INFO, name = __version__): + """Get root logger. + + Args: + log_file (str, optional): File path of log. Defaults to None. + log_level (int, optional): The level of logger. + Defaults to logging.INFO. + + Returns: + :obj:`logging.Logger`: The obtained logger + """ + logger = get_logger(name=name, log_file=log_file, log_level=log_level) + + logging_filter = logging.Filter(name) + logging_filter.filter = lambda record: record.find(name) != -1 + + return logger diff --git a/mmcv/utils/logging.py b/mmcv/utils/logging.py new file mode 100644 index 0000000..4aa0e04 --- /dev/null +++ b/mmcv/utils/logging.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch.distributed as dist + +logger_initialized = {} + + +def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): + """Initialize and get a logger by name. + + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified and the process rank is 0, a FileHandler + will also be added. + + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + file_mode (str): The file mode used in opening log file. + Defaults to 'w'. + + Returns: + logging.Logger: The expected logger. + """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + # handle hierarchical names + # e.g., logger "a" is initialized, then logger "a.b" will skip the + # initialization since it is a child of "a". 
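+    # (Illustrative: get_logger('mmcv') followed by get_logger('mmcv.runner')
+    # returns the child logger as-is, so handlers are only attached once.)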
+ for logger_name in logger_initialized: + if name.startswith(logger_name): + return logger + + # handle duplicate logs to the console + # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET) + # to the root logger. As logger.propagate is True by default, this root + # level handler causes logging messages from rank>0 processes to + # unexpectedly show up on the console, creating much unwanted clutter. + # To fix this issue, we set the root logger's StreamHandler, if any, to log + # at the ERROR level. + for handler in logger.root.handlers: + if type(handler) is logging.StreamHandler: + handler.setLevel(logging.ERROR) + + stream_handler = logging.StreamHandler() + handlers = [stream_handler] + + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + else: + rank = 0 + + # only rank 0 will add a FileHandler + if rank == 0 and log_file is not None: + # Here, the default behaviour of the official logger is 'a'. Thus, we + # provide an interface to change the file mode to the default + # behaviour. + file_handler = logging.FileHandler(log_file, file_mode) + handlers.append(file_handler) + + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + for handler in handlers: + handler.setFormatter(formatter) + handler.setLevel(log_level) + logger.addHandler(handler) + + if rank == 0: + logger.setLevel(log_level) + else: + logger.setLevel(logging.ERROR) + + logger_initialized[name] = True + + return logger + + +def print_log(msg, logger=None, level=logging.INFO): + """Print a log message. + + Args: + msg (str): The message to be logged. + logger (logging.Logger | str | None): The logger to be used. + Some special loggers are: + - "silent": no message will be printed. + - other str: the logger obtained with `get_root_logger(logger)`. + - None: The `print()` method will be used to print log messages. + level (int): Logging level. Only available when `logger` is a Logger + object or "root". + """ + if logger is None: + print(msg) + elif isinstance(logger, logging.Logger): + logger.log(level, msg) + elif logger == 'silent': + pass + elif isinstance(logger, str): + _logger = get_logger(logger) + _logger.log(level, msg) + else: + raise TypeError( + 'logger should be either a logging.Logger object, str, ' + f'"silent" or None, but got {type(logger)}') diff --git a/mmcv/utils/memory.py b/mmcv/utils/memory.py new file mode 100644 index 0000000..3ecd7d7 --- /dev/null +++ b/mmcv/utils/memory.py @@ -0,0 +1,84 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +import logging +from contextlib import contextmanager +from functools import wraps +import torch + +__all__ = ["retry_if_cuda_oom"] + + +@contextmanager +def _ignore_torch_cuda_oom(): + """ + A context which ignores CUDA OOM exception from pytorch. + """ + try: + yield + except RuntimeError as e: + # NOTE: the string may change? + if "CUDA out of memory. " in str(e): + pass + else: + raise + + +def retry_if_cuda_oom(func): + """ + Makes a function retry itself after encountering + pytorch's CUDA OOM error. + It will first retry after calling `torch.cuda.empty_cache()`. + + If that still fails, it will then retry by trying to convert inputs to CPUs. + In this case, it expects the function to dispatch to CPU implementation. + The return values may become CPU tensors as well and it's user's + responsibility to convert it back to CUDA tensor if needed. 
+ + Args: + func: a stateless callable that takes tensor-like objects as arguments + + Returns: + a callable which retries `func` if OOM is encountered. + + Examples: + :: + output = retry_if_cuda_oom(some_torch_function)(input1, input2) + # output may be on CPU even if inputs are on GPU + + Note: + 1. When converting inputs to CPU, it will only look at each argument and check + if it has `.device` and `.to` for conversion. Nested structures of tensors + are not supported. + + 2. Since the function might be called more than once, it has to be + stateless. + """ + + def maybe_to_cpu(x): + try: + like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") + except AttributeError: + like_gpu_tensor = False + if like_gpu_tensor: + return x.to(device="cpu") + else: + return x + + @wraps(func) + def wrapped(*args, **kwargs): + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # Clear cache and retry + torch.cuda.empty_cache() + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # Try on CPU. This slows down the code significantly, therefore print a notice. + logger = logging.getLogger(__name__) + logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func))) + new_args = (maybe_to_cpu(x) for x in args) + new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} + return func(*new_args, **new_kwargs) + + return wrapped \ No newline at end of file diff --git a/mmcv/utils/misc.py b/mmcv/utils/misc.py new file mode 100644 index 0000000..2c58d0d --- /dev/null +++ b/mmcv/utils/misc.py @@ -0,0 +1,377 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections.abc +import functools +import itertools +import subprocess +import warnings +from collections import abc +from importlib import import_module +from inspect import getfullargspec +from itertools import repeat + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def is_str(x): + """Whether the input is an string instance. + + Note: This method is deprecated since python 2 is no longer supported. + """ + return isinstance(x, str) + + +def import_modules_from_strings(imports, allow_failed_imports=False): + """Import modules from the given list of strings. + + Args: + imports (list | str | None): The given module names to be imported. + allow_failed_imports (bool): If True, the failed imports will return + None. Otherwise, an ImportError is raise. Default: False. + + Returns: + list[module] | module | None: The imported modules. + + Examples: + >>> osp, sys = import_modules_from_strings( + ... 
['os.path', 'sys']) + >>> import os.path as osp_ + >>> import sys as sys_ + >>> assert osp == osp_ + >>> assert sys == sys_ + """ + if not imports: + return + single_import = False + if isinstance(imports, str): + single_import = True + imports = [imports] + if not isinstance(imports, list): + raise TypeError( + f'custom_imports must be a list but got type {type(imports)}') + imported = [] + for imp in imports: + if not isinstance(imp, str): + raise TypeError( + f'{imp} is of type {type(imp)} and cannot be imported.') + try: + imported_tmp = import_module(imp) + except ImportError: + if allow_failed_imports: + warnings.warn(f'{imp} failed to import and is ignored.', + UserWarning) + imported_tmp = None + else: + raise ImportError + imported.append(imported_tmp) + if single_import: + imported = imported[0] + return imported + + +def iter_cast(inputs, dst_type, return_type=None): + """Cast elements of an iterable object into some type. + + Args: + inputs (Iterable): The input object. + dst_type (type): Destination type. + return_type (type, optional): If specified, the output object will be + converted to this type, otherwise an iterator. + + Returns: + iterator or specified type: The converted object. + """ + if not isinstance(inputs, abc.Iterable): + raise TypeError('inputs must be an iterable object') + if not isinstance(dst_type, type): + raise TypeError('"dst_type" must be a valid type') + + out_iterable = map(dst_type, inputs) + + if return_type is None: + return out_iterable + else: + return return_type(out_iterable) + + +def list_cast(inputs, dst_type): + """Cast elements of an iterable object into a list of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=list) + + +def tuple_cast(inputs, dst_type): + """Cast elements of an iterable object into a tuple of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=tuple) + + +def is_seq_of(seq, expected_type, seq_type=None): + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type): Expected type of sequence items. + seq_type (type, optional): Expected sequence type. + + Returns: + bool: Whether the sequence is valid. + """ + if seq_type is None: + exp_seq_type = abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_list_of(seq, expected_type): + """Check whether it is a list of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=list) + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +def slice_list(in_list, lens): + """Slice a list into several sub lists by a list of given length. + + Args: + in_list (list): The list to be sliced. + lens(int or list): The expected length of each out list. + + Returns: + list: A list of sliced list. 
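+
+    Example (illustrative; ``lens`` may be an int or a list of lengths):
+        >>> slice_list([1, 2, 3, 4, 5, 6], [2, 4])
+        [[1, 2], [3, 4, 5, 6]]
+        >>> slice_list([1, 2, 3, 4], 2)
+        [[1, 2], [3, 4]]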
+ """ + if isinstance(lens, int): + assert len(in_list) % lens == 0 + lens = [lens] * int(len(in_list) / lens) + if not isinstance(lens, list): + raise TypeError('"indices" must be an integer or a list of integers') + elif sum(lens) != len(in_list): + raise ValueError('sum of lens and list length does not ' + f'match: {sum(lens)} != {len(in_list)}') + out_list = [] + idx = 0 + for i in range(len(lens)): + out_list.append(in_list[idx:idx + lens[i]]) + idx += lens[i] + return out_list + + +def concat_list(in_list): + """Concatenate a list of list into a single list. + + Args: + in_list (list): The list of list to be merged. + + Returns: + list: The concatenated flat list. + """ + return list(itertools.chain(*in_list)) + + +def check_prerequisites( + prerequisites, + checker, + msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' + 'found, please install them first.'): # yapf: disable + """A decorator factory to check if prerequisites are satisfied. + + Args: + prerequisites (str of list[str]): Prerequisites to be checked. + checker (callable): The checker method that returns True if a + prerequisite is meet, False otherwise. + msg_tmpl (str): The message template with two variables. + + Returns: + decorator: A specific decorator. + """ + + def wrap(func): + + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + requirements = [prerequisites] if isinstance( + prerequisites, str) else prerequisites + missing = [] + for item in requirements: + if not checker(item): + missing.append(item) + if missing: + print(msg_tmpl.format(', '.join(missing), func.__name__)) + raise RuntimeError('Prerequisites not meet.') + else: + return func(*args, **kwargs) + + return wrapped_func + + return wrap + + +def _check_py_package(package): + try: + import_module(package) + except ImportError: + return False + else: + return True + + +def _check_executable(cmd): + if subprocess.call(f'which {cmd}', shell=True) != 0: + return False + else: + return True + + +def requires_package(prerequisites): + """A decorator to check if some python packages are installed. + + Example: + >>> @requires_package('numpy') + >>> func(arg1, args): + >>> return numpy.zeros(1) + array([0.]) + >>> @requires_package(['numpy', 'non_package']) + >>> func(arg1, args): + >>> return numpy.zeros(1) + ImportError + """ + return check_prerequisites(prerequisites, checker=_check_py_package) + + +def requires_executable(prerequisites): + """A decorator to check if some executable files are installed. + + Example: + >>> @requires_executable('ffmpeg') + >>> func(arg1, args): + >>> print(1) + 1 + """ + return check_prerequisites(prerequisites, checker=_check_executable) + + +def deprecated_api_warning(name_dict, cls_name=None): + """A decorator to check if some arguments are deprecate and try to replace + deprecate src_arg_name to dst_arg_name. + + Args: + name_dict(dict): + key (str): Deprecate argument names. + val (str): Expected argument names. + + Returns: + func: New function. 
+ """ + + def api_warning_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get name of the function + func_name = old_func.__name__ + if cls_name is not None: + func_name = f'{cls_name}.{func_name}' + if args: + arg_names = args_info.args[:len(args)] + for src_arg_name, dst_arg_name in name_dict.items(): + if src_arg_name in arg_names: + warnings.warn( + f'"{src_arg_name}" is deprecated in ' + f'`{func_name}`, please use "{dst_arg_name}" ' + 'instead') + arg_names[arg_names.index(src_arg_name)] = dst_arg_name + if kwargs: + for src_arg_name, dst_arg_name in name_dict.items(): + if src_arg_name in kwargs: + + assert dst_arg_name not in kwargs, ( + f'The expected behavior is to replace ' + f'the deprecated key `{src_arg_name}` to ' + f'new key `{dst_arg_name}`, but got them ' + f'in the arguments at the same time, which ' + f'is confusing. `{src_arg_name} will be ' + f'deprecated in the future, please ' + f'use `{dst_arg_name}` instead.') + + warnings.warn( + f'"{src_arg_name}" is deprecated in ' + f'`{func_name}`, please use "{dst_arg_name}" ' + 'instead') + kwargs[dst_arg_name] = kwargs.pop(src_arg_name) + + # apply converted arguments to the decorated method + output = old_func(*args, **kwargs) + return output + + return new_func + + return api_warning_wrapper + + +def is_method_overridden(method, base_class, derived_class): + """Check if a method of base class is overridden in derived class. + + Args: + method (str): the method name to check. + base_class (type): the class of the base class. + derived_class (type | Any): the class or instance of the derived class. + """ + assert isinstance(base_class, type), \ + "base_class doesn't accept instance, Please pass class instead." + + if not isinstance(derived_class, type): + derived_class = derived_class.__class__ + + base_method = getattr(base_class, method) + derived_method = getattr(derived_class, method) + return derived_method != base_method + + +def has_method(obj: object, method: str) -> bool: + """Check whether the object has a method. + + Args: + method (str): The method name to check. + obj (object): The object to check. + + Returns: + bool: True if the object has the method else False. + """ + return hasattr(obj, method) and callable(getattr(obj, method)) diff --git a/mmcv/utils/path.py b/mmcv/utils/path.py new file mode 100644 index 0000000..7dab4b3 --- /dev/null +++ b/mmcv/utils/path.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os +import os.path as osp +from pathlib import Path + +from .misc import is_str + + +def is_filepath(x): + return is_str(x) or isinstance(x, Path) + + +def fopen(filepath, *args, **kwargs): + if is_str(filepath): + return open(filepath, *args, **kwargs) + elif isinstance(filepath, Path): + return filepath.open(*args, **kwargs) + raise ValueError('`filepath` should be a string or a Path') + + +def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): + if not osp.isfile(filename): + raise FileNotFoundError(msg_tmpl.format(filename)) + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +def symlink(src, dst, overwrite=True, **kwargs): + if os.path.lexists(dst) and overwrite: + os.remove(dst) + os.symlink(src, dst, **kwargs) + + +def scandir(dir_path, suffix=None, recursive=False, case_sensitive=True): + """Scan a directory to find the interested files. + + Args: + dir_path (str | obj:`Path`): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Default: None. + recursive (bool, optional): If set to True, recursively scan the + directory. Default: False. + case_sensitive (bool, optional) : If set to False, ignore the case of + suffix. Default: True. + + Returns: + A generator for all the interested files with relative paths. + """ + if isinstance(dir_path, (str, Path)): + dir_path = str(dir_path) + else: + raise TypeError('"dir_path" must be a string or Path object') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + if suffix is not None and not case_sensitive: + suffix = suffix.lower() if isinstance(suffix, str) else tuple( + item.lower() for item in suffix) + + root = dir_path + + def _scandir(dir_path, suffix, recursive, case_sensitive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + _rel_path = rel_path if case_sensitive else rel_path.lower() + if suffix is None or _rel_path.endswith(suffix): + yield rel_path + elif recursive and os.path.isdir(entry.path): + # scan recursively if entry.path is a directory + yield from _scandir(entry.path, suffix, recursive, + case_sensitive) + + return _scandir(dir_path, suffix, recursive, case_sensitive) + + +def find_vcs_root(path, markers=('.git', )): + """Finds the root directory (including itself) of specified markers. + + Args: + path (str): Path of directory or file. + markers (list[str], optional): List of file or directory names. + + Returns: + The directory contained one of the markers or None if not found. 
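+
+    Example (illustrative; the returned path depends on the local checkout):
+        >>> find_vcs_root(osp.dirname(__file__))
+        '/path/to/Bench2DriveZoo'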
+ """ + if osp.isfile(path): + path = osp.dirname(path) + + prev, cur = None, osp.abspath(osp.expanduser(path)) + while cur != prev: + if any(osp.exists(osp.join(cur, marker)) for marker in markers): + return cur + prev, cur = cur, osp.split(cur)[0] + return None diff --git a/mmcv/utils/position_embedding.py b/mmcv/utils/position_embedding.py new file mode 100644 index 0000000..290110f --- /dev/null +++ b/mmcv/utils/position_embedding.py @@ -0,0 +1,34 @@ +import torch +import torch.nn as nn +import math + +class RelPositionEmbedding(nn.Module): + def __init__(self, num_pos_feats=64, pos_norm=True): + super().__init__() + self.num_pos_feats = num_pos_feats + self.fc = nn.Linear(4, self.num_pos_feats,bias=False) + #nn.init.orthogonal_(self.fc.weight) + #self.fc.weight.requires_grad = False + self.pos_norm = pos_norm + if self.pos_norm: + self.norm = nn.LayerNorm(self.num_pos_feats) + def forward(self, tensor): + #mask = nesttensor.mask + B,C,H,W = tensor.shape + #print('tensor.shape', tensor.shape) + y_range = (torch.arange(H) / float(H - 1)).to(tensor.device) + #y_axis = torch.stack((y_range, 1-y_range),dim=1) + y_axis = torch.stack((torch.cos(y_range * math.pi), torch.sin(y_range * math.pi)), dim=1) + y_axis = y_axis.reshape(H, 1, 2).repeat(1, W, 1).reshape(H * W, 2) + + x_range = (torch.arange(W) / float(W - 1)).to(tensor.device) + #x_axis =torch.stack((x_range,1-x_range),dim=1) + x_axis = torch.stack((torch.cos(x_range * math.pi), torch.sin(x_range * math.pi)), dim=1) + x_axis = x_axis.reshape(1, W, 2).repeat(H, 1, 1).reshape(H * W, 2) + x_pos = torch.cat((y_axis, x_axis), dim=1) + x_pos = self.fc(x_pos) + + if self.pos_norm: + x_pos = self.norm(x_pos) + #print('xpos,', x_pos.max(),x_pos.min()) + return x_pos \ No newline at end of file diff --git a/mmcv/utils/priority.py b/mmcv/utils/priority.py new file mode 100644 index 0000000..64cc4e3 --- /dev/null +++ b/mmcv/utils/priority.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import Enum + + +class Priority(Enum): + """Hook priority levels. + + +--------------+------------+ + | Level | Value | + +==============+============+ + | HIGHEST | 0 | + +--------------+------------+ + | VERY_HIGH | 10 | + +--------------+------------+ + | HIGH | 30 | + +--------------+------------+ + | ABOVE_NORMAL | 40 | + +--------------+------------+ + | NORMAL | 50 | + +--------------+------------+ + | BELOW_NORMAL | 60 | + +--------------+------------+ + | LOW | 70 | + +--------------+------------+ + | VERY_LOW | 90 | + +--------------+------------+ + | LOWEST | 100 | + +--------------+------------+ + """ + + HIGHEST = 0 + VERY_HIGH = 10 + HIGH = 30 + ABOVE_NORMAL = 40 + NORMAL = 50 + BELOW_NORMAL = 60 + LOW = 70 + VERY_LOW = 90 + LOWEST = 100 + + +def get_priority(priority): + """Get priority value. + + Args: + priority (int or str or :obj:`Priority`): Priority. + + Returns: + int: The priority value. + """ + if isinstance(priority, int): + if priority < 0 or priority > 100: + raise ValueError('priority must be between 0 and 100') + return priority + elif isinstance(priority, Priority): + return priority.value + elif isinstance(priority, str): + return Priority[priority.upper()].value + else: + raise TypeError('priority must be an integer or Priority enum value') diff --git a/mmcv/utils/progressbar.py b/mmcv/utils/progressbar.py new file mode 100644 index 0000000..0062f67 --- /dev/null +++ b/mmcv/utils/progressbar.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import sys +from collections.abc import Iterable +from multiprocessing import Pool +from shutil import get_terminal_size + +from .timer import Timer + + +class ProgressBar: + """A progress bar which can print the progress.""" + + def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout): + self.task_num = task_num + self.bar_width = bar_width + self.completed = 0 + self.file = file + if start: + self.start() + + @property + def terminal_width(self): + width, _ = get_terminal_size() + return width + + def start(self): + if self.task_num > 0: + self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, ' + 'elapsed: 0s, ETA:') + else: + self.file.write('completed: 0, elapsed: 0s') + self.file.flush() + self.timer = Timer() + + def update(self, num_tasks=1): + assert num_tasks > 0 + self.completed += num_tasks + elapsed = self.timer.since_start() + if elapsed > 0: + fps = self.completed / elapsed + else: + fps = float('inf') + if self.task_num > 0: + percentage = self.completed / float(self.task_num) + eta = int(elapsed * (1 - percentage) / percentage + 0.5) + msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \ + f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \ + f'ETA: {eta:5}s' + + bar_width = min(self.bar_width, + int(self.terminal_width - len(msg)) + 2, + int(self.terminal_width * 0.6)) + bar_width = max(2, bar_width) + mark_width = int(bar_width * percentage) + bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width) + self.file.write(msg.format(bar_chars)) + else: + self.file.write( + f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,' + f' {fps:.1f} tasks/s') + self.file.flush() + + +def track_progress(func, tasks, bar_width=50, file=sys.stdout, **kwargs): + """Track the progress of tasks execution with a progress bar. + + Tasks are done with a simple for-loop. + + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + + Returns: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + prog_bar = ProgressBar(task_num, bar_width, file=file) + results = [] + for task in tasks: + results.append(func(task, **kwargs)) + prog_bar.update() + prog_bar.file.write('\n') + return results + + +def init_pool(process_num, initializer=None, initargs=None): + if initializer is None: + return Pool(process_num) + elif initargs is None: + return Pool(process_num, initializer) + else: + if not isinstance(initargs, tuple): + raise TypeError('"initargs" must be a tuple') + return Pool(process_num, initializer, initargs) + + +def track_parallel_progress(func, + tasks, + nproc, + initializer=None, + initargs=None, + bar_width=50, + chunksize=1, + skip_first=False, + keep_order=True, + file=sys.stdout): + """Track the progress of parallel task execution with a progress bar. + + The built-in :mod:`multiprocessing` module is used for process pools and + tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. + + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + nproc (int): Process (worker) number. 
+ initializer (None or callable): Refer to :class:`multiprocessing.Pool` + for details. + initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for + details. + chunksize (int): Refer to :class:`multiprocessing.Pool` for details. + bar_width (int): Width of progress bar. + skip_first (bool): Whether to skip the first sample for each worker + when estimating fps, since the initialization step may takes + longer. + keep_order (bool): If True, :func:`Pool.imap` is used, otherwise + :func:`Pool.imap_unordered` is used. + + Returns: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + pool = init_pool(nproc, initializer, initargs) + start = not skip_first + task_num -= nproc * chunksize * int(skip_first) + prog_bar = ProgressBar(task_num, bar_width, start, file=file) + results = [] + if keep_order: + gen = pool.imap(func, tasks, chunksize) + else: + gen = pool.imap_unordered(func, tasks, chunksize) + for result in gen: + results.append(result) + if skip_first: + if len(results) < nproc * chunksize: + continue + elif len(results) == nproc * chunksize: + prog_bar.start() + continue + prog_bar.update() + prog_bar.file.write('\n') + pool.close() + pool.join() + return results + + +def track_iter_progress(tasks, bar_width=50, file=sys.stdout): + """Track the progress of tasks iteration or enumeration with a progress + bar. + + Tasks are yielded with a simple for-loop. + + Args: + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + + Yields: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + prog_bar = ProgressBar(task_num, bar_width, file=file) + for task in tasks: + yield task + prog_bar.update() + prog_bar.file.write('\n') diff --git a/mmcv/utils/registry.py b/mmcv/utils/registry.py new file mode 100644 index 0000000..21ad671 --- /dev/null +++ b/mmcv/utils/registry.py @@ -0,0 +1,315 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import warnings +from functools import partial + +from .misc import is_seq_of + + +def build_from_cfg(cfg, registry, default_args=None): + """Build a module from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict, optional): Default initialization arguments. + + Returns: + object: The constructed object. 
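+
+    Example (illustrative; ``MODELS`` and ``ResNet`` are placeholders):
+        >>> MODELS = Registry('models')
+        >>> @MODELS.register_module()
+        >>> class ResNet:
+        >>>     pass
+        >>> resnet = build_from_cfg(dict(type='ResNet'), MODELS)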
+ """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + if default_args is None or 'type' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "type", ' + f'but got {cfg}\n{default_args}') + if not isinstance(registry, Registry): + raise TypeError('registry must be an mmcv.Registry object, ' + f'but got {type(registry)}') + if not (isinstance(default_args, dict) or default_args is None): + raise TypeError('default_args must be a dict or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + obj_type = args.pop('type') + if isinstance(obj_type, str): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError( + f'{obj_type} is not in the {registry.name} registry') + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + try: + return obj_cls(**args) + except Exception as e: + # Normal TypeError does not print class name. + raise type(e)(f'{obj_cls.__name__}: {e}') + + +class Registry: + """A registry to map strings to classes. + + Registered object could be built from registry. + Example: + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + >>> resnet = MODELS.build(dict(type='ResNet')) + + Please refer to + https://mmcv.readthedocs.io/en/latest/understand_mmcv/registry.html for + advanced usage. + + Args: + name (str): Registry name. + build_func(func, optional): Build function to construct instance from + Registry, func:`build_from_cfg` is used if neither ``parent`` or + ``build_func`` is specified. If ``parent`` is specified and + ``build_func`` is not given, ``build_func`` will be inherited + from ``parent``. Default: None. + parent (Registry, optional): Parent registry. The class registered in + children registry could be built from parent. Default: None. + scope (str, optional): The scope of registry. It is the key to search + for children registry. If not specified, scope will be the name of + the package where class is defined, e.g. mmdet, mmcls, mmseg. + Default: None. + """ + + def __init__(self, name, build_func=None, parent=None, scope=None): + self._name = name + self._module_dict = dict() + self._children = dict() + self._scope = self.infer_scope() if scope is None else scope + + # self.build_func will be set with the following priority: + # 1. build_func + # 2. parent.build_func + # 3. build_from_cfg + if build_func is None: + if parent is not None: + self.build_func = parent.build_func + else: + self.build_func = build_from_cfg + else: + self.build_func = build_func + if parent is not None: + assert isinstance(parent, Registry) + parent._add_children(self) + self.parent = parent + else: + self.parent = None + + def __len__(self): + return len(self._module_dict) + + def __contains__(self, key): + return self.get(key) is not None + + def __repr__(self): + format_str = self.__class__.__name__ + \ + f'(name={self._name}, ' \ + f'items={self._module_dict})' + return format_str + + @staticmethod + def infer_scope(): + """Infer the scope of registry. + + The name of the package where registry is defined will be returned. + + Example: + # in mmdet/models/backbone/resnet.py + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + The scope of ``ResNet`` will be ``mmdet``. 
+ + + Returns: + scope (str): The inferred scope name. + """ + # inspect.stack() trace where this function is called, the index-2 + # indicates the frame where `infer_scope()` is called + filename = inspect.getmodule(inspect.stack()[2][0]).__name__ + split_filename = filename.split('.') + return split_filename[0] + + @staticmethod + def split_scope_key(key): + """Split scope and key. + + The first scope will be split from key. + + Examples: + >>> Registry.split_scope_key('mmcv.ResNet') + 'mmdet', 'ResNet' + >>> Registry.split_scope_key('ResNet') + None, 'ResNet' + + Return: + scope (str, None): The first scope. + key (str): The remaining key. + """ + split_index = key.find('.') + if split_index != -1: + return key[:split_index], key[split_index + 1:] + else: + return None, key + + @property + def name(self): + return self._name + + @property + def scope(self): + return self._scope + + @property + def module_dict(self): + return self._module_dict + + @property + def children(self): + return self._children + + def get(self, key): + """Get the registry record. + + Args: + key (str): The class name in string format. + + Returns: + class: The corresponding class. + """ + scope, real_key = self.split_scope_key(key) + if scope is None or scope == self._scope: + # get from self + if real_key in self._module_dict: + return self._module_dict[real_key] + else: + # get from self._children + if scope in self._children: + return self._children[scope].get(real_key) + else: + # goto root + parent = self.parent + while parent.parent is not None: + parent = parent.parent + return parent.get(key) + + def build(self, *args, **kwargs): + return self.build_func(*args, **kwargs, registry=self) + + def _add_children(self, registry): + """Add children for a registry. + + The ``registry`` will be added as children based on its scope. + The parent registry could build objects from children registry. + + Example: + >>> models = Registry('models') + >>> mmdet_models = Registry('models', parent=models) + >>> @mmdet_models.register_module() + >>> class ResNet: + >>> pass + >>> resnet = models.build(dict(type='mmcv.ResNet')) + """ + + assert isinstance(registry, Registry) + assert registry.scope is not None + assert registry.scope not in self.children, \ + f'scope {registry.scope} exists in {self.name} registry' + self.children[registry.scope] = registry + + def _register_module(self, module_class, module_name=None, force=False): + if not inspect.isclass(module_class): + raise TypeError('module must be a class, ' + f'but got {type(module_class)}') + + if module_name is None: + module_name = module_class.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in self._module_dict: + raise KeyError(f'{name} is already registered ' + f'in {self.name}') + self._module_dict[name] = module_class + + def deprecated_register_module(self, cls=None, force=False): + warnings.warn( + 'The old API of register_module(module, force=False) ' + 'is deprecated and will be removed, please use the new API ' + 'register_module(name=None, force=False, module=None) instead.') + if cls is None: + return partial(self.deprecated_register_module, force=force) + self._register_module(cls, force=force) + return cls + + def register_module(self, name=None, force=False, module=None): + """Register a module. + + A record will be added to `self._module_dict`, whose key is the class + name or the specified name, and value is the class itself. 
+ It can be used as a decorator or a normal function. + + Example: + >>> backbones = Registry('backbone') + >>> @backbones.register_module() + >>> class ResNet: + >>> pass + + >>> backbones = Registry('backbone') + >>> @backbones.register_module(name='mnet') + >>> class MobileNet: + >>> pass + + >>> backbones = Registry('backbone') + >>> class ResNet: + >>> pass + >>> backbones.register_module(ResNet) + + Args: + name (str | None): The module name to be registered. If not + specified, the class name will be used. + force (bool, optional): Whether to override an existing class with + the same name. Default: False. + module (type): Module class to be registered. + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + # NOTE: This is a walkaround to be compatible with the old api, + # while it may introduce unexpected bugs. + if isinstance(name, type): + return self.deprecated_register_module(name, force=force) + + # raise the error ahead of time + if not (name is None or isinstance(name, str) or is_seq_of(name, str)): + raise TypeError( + 'name must be either of None, an instance of str or a sequence' + f' of str, but got {type(name)}') + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + self._register_module( + module_class=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(cls): + self._register_module( + module_class=cls, module_name=name, force=force) + return cls + + return _register diff --git a/mmcv/utils/runner_utils.py b/mmcv/utils/runner_utils.py new file mode 100644 index 0000000..0ec8cf3 --- /dev/null +++ b/mmcv/utils/runner_utils.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import random +import sys +import time +import warnings +from getpass import getuser +from socket import gethostname + +import numpy as np +from mmcv.utils import is_str +import functools +import os +import subprocess +from collections import OrderedDict + +import torch +import torch.multiprocessing as mp +from torch import distributed as dist +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def get_host_info(): + """Get hostname and username. + + Return empty string if exception raised, e.g. ``getpass.getuser()`` will + lead to error in docker container + """ + host = '' + try: + host = f'{getuser()}@{gethostname()}' + except Exception as e: + warnings.warn(f'Host or user not found: {str(e)}') + finally: + return host + + +def get_time_str(): + return time.strftime('%Y%m%d_%H%M%S', time.localtime()) + + +def obj_from_dict(info, parent=None, default_args=None): + """Initialize an object from dict. + + The dict must contain the key "type", which indicates the object type, it + can be either a string or type, such as "list" or ``list``. Remaining + fields are treated as the arguments for constructing the object. + + Args: + info (dict): Object types and arguments. + parent (:class:`module`): Module which may containing expected object + classes. + default_args (dict, optional): Default arguments for initializing the + object. + + Returns: + any type: Object built from the dict. 
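+
+    Example (illustrative; builds an SGD optimizer from ``torch.optim``):
+        >>> import torch
+        >>> import torch.optim as optim
+        >>> params = [torch.nn.Parameter(torch.zeros(1))]
+        >>> optimizer = obj_from_dict(
+        >>>     dict(type='SGD', lr=0.01), optim, dict(params=params))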
+ """ + assert isinstance(info, dict) and 'type' in info + assert isinstance(default_args, dict) or default_args is None + args = info.copy() + obj_type = args.pop('type') + if is_str(obj_type): + if parent is not None: + obj_type = getattr(parent, obj_type) + else: + obj_type = sys.modules[obj_type] + elif not isinstance(obj_type, type): + raise TypeError('type must be a str or valid type, but ' + f'got {type(obj_type)}') + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + return obj_type(**args) + + +def set_random_seed(seed, deterministic=False, use_rank_shift=False): + """Set random seed. + + Args: + seed (int): Seed to be used. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + rank_shift (bool): Whether to add rank number to the random seed to + have different random seed in different threads. Default: False. + """ + if use_rank_shift: + rank, _ = get_dist_info() + seed += rank + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def init_dist(launcher, backend='nccl', **kwargs): + if mp.get_start_method(allow_none=True) is None: + mp.set_start_method('spawn') + if launcher == 'pytorch': + _init_dist_pytorch(backend, **kwargs) + elif launcher == 'mpi': + _init_dist_mpi(backend, **kwargs) + elif launcher == 'slurm': + _init_dist_slurm(backend, **kwargs) + else: + raise ValueError(f'Invalid launcher type: {launcher}') + + +def _init_dist_pytorch(backend, **kwargs): + # TODO: use local_rank instead of rank % num_gpus + rank = int(os.environ['RANK']) + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(rank % num_gpus) + dist.init_process_group(backend=backend, **kwargs) + + +def _init_dist_mpi(backend, **kwargs): + # TODO: use local_rank instead of rank % num_gpus + rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(rank % num_gpus) + dist.init_process_group(backend=backend, **kwargs) + + +def _init_dist_slurm(backend, port=None): + """Initialize slurm distributed training environment. + + If argument ``port`` is not specified, then the master port will be system + environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system + environment variable, then a default port ``29500`` will be used. + + Args: + backend (str): Backend of torch.distributed. + port (int, optional): Master port. Defaults to None. 
+ """ + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(proc_id % num_gpus) + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + # specify master port + if port is not None: + os.environ['MASTER_PORT'] = str(port) + elif 'MASTER_PORT' in os.environ: + pass # use MASTER_PORT in the environment variable + else: + # 29500 is torch.distributed default port + os.environ['MASTER_PORT'] = '29500' + # use MASTER_ADDR in the environment variable if it already exists + if 'MASTER_ADDR' not in os.environ: + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) + os.environ['RANK'] = str(proc_id) + dist.init_process_group(backend=backend) + + +def get_dist_info(): + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + return rank, world_size + + +def master_only(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + + return wrapper + + +def allreduce_params(params, coalesce=True, bucket_size_mb=-1): + """Allreduce parameters. + + Args: + params (list[torch.Parameters]): List of parameters or buffers of a + model. + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + _, world_size = get_dist_info() + if world_size == 1: + return + params = [param.data for param in params] + if coalesce: + _allreduce_coalesced(params, world_size, bucket_size_mb) + else: + for tensor in params: + dist.all_reduce(tensor.div_(world_size)) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + _, world_size = get_dist_info() + if world_size == 1: + return + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) \ No newline at end of file diff --git a/mmcv/utils/timer.py b/mmcv/utils/timer.py new file mode 100644 index 0000000..66d4a78 --- /dev/null +++ b/mmcv/utils/timer.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from time import time + + +class TimerError(Exception): + + def __init__(self, message): + self.message = message + super(TimerError, self).__init__(message) + + +class Timer: + """A flexible Timer class. + + :Example: + + >>> import time + >>> import mmcv + >>> with mmcv.Timer(): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + 1.000 + >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + it takes 1.0 seconds + >>> timer = mmcv.Timer() + >>> time.sleep(0.5) + >>> print(timer.since_start()) + 0.500 + >>> time.sleep(0.5) + >>> print(timer.since_last_check()) + 0.500 + >>> print(timer.since_start()) + 1.000 + """ + + def __init__(self, start=True, print_tmpl=None): + self._is_running = False + self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}' + if start: + self.start() + + @property + def is_running(self): + """bool: indicate whether the timer is running""" + return self._is_running + + def __enter__(self): + self.start() + return self + + def __exit__(self, type, value, traceback): + print(self.print_tmpl.format(self.since_last_check())) + self._is_running = False + + def start(self): + """Start the timer.""" + if not self._is_running: + self._t_start = time() + self._is_running = True + self._t_last = time() + + def since_start(self): + """Total time since the timer is started. + + Returns (float): Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + self._t_last = time() + return self._t_last - self._t_start + + def since_last_check(self): + """Time since the last checking. + + Either :func:`since_start` or :func:`since_last_check` is a checking + operation. + + Returns (float): Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + dur = time() - self._t_last + self._t_last = time() + return dur + + +_g_timers = {} # global timers + + +def check_time(timer_id): + """Add check points in a single line. + + This method is suitable for running a task on a list of items. A timer will + be registered when the method is called for the first time. + + :Example: + + >>> import time + >>> import mmcv + >>> for i in range(1, 6): + >>> # simulate a code block + >>> time.sleep(i) + >>> mmcv.check_time('task1') + 2.000 + 3.000 + 4.000 + 5.000 + + Args: + timer_id (str): Timer identifier. + """ + if timer_id not in _g_timers: + _g_timers[timer_id] = Timer() + return 0 + else: + return _g_timers[timer_id].since_last_check() diff --git a/mmcv/utils/util_mixins.py b/mmcv/utils/util_mixins.py new file mode 100644 index 0000000..9aed015 --- /dev/null +++ b/mmcv/utils/util_mixins.py @@ -0,0 +1,104 @@ +"""This module defines the :class:`NiceRepr` mixin class, which defines a +``__repr__`` and ``__str__`` method that only depend on a custom ``__nice__`` +method, which you must define. This means you only have to overload one +function instead of two. Furthermore, if the object defines a ``__len__`` +method, then the ``__nice__`` method defaults to something sensible, otherwise +it is treated as abstract and raises ``NotImplementedError``. + +To use simply have your object inherit from :class:`NiceRepr` +(multi-inheritance should be ok). + +This code was copied from the ubelt library: https://github.com/Erotemic/ubelt + +Example: + >>> # Objects that define __nice__ have a default __str__ and __repr__ + >>> class Student(NiceRepr): + ... def __init__(self, name): + ... self.name = name + ... def __nice__(self): + ... 
return self.name + >>> s1 = Student('Alice') + >>> s2 = Student('Bob') + >>> print(f's1 = {s1}') + >>> print(f's2 = {s2}') + s1 = + s2 = + +Example: + >>> # Objects that define __len__ have a default __nice__ + >>> class Group(NiceRepr): + ... def __init__(self, data): + ... self.data = data + ... def __len__(self): + ... return len(self.data) + >>> g = Group([1, 2, 3]) + >>> print(f'g = {g}') + g = +""" +import warnings + + +class NiceRepr: + """Inherit from this class and define ``__nice__`` to "nicely" print your + objects. + + Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function + Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``. + If the inheriting class has a ``__len__``, method then the default + ``__nice__`` method will return its length. + + Example: + >>> class Foo(NiceRepr): + ... def __nice__(self): + ... return 'info' + >>> foo = Foo() + >>> assert str(foo) == '' + >>> assert repr(foo).startswith('>> class Bar(NiceRepr): + ... pass + >>> bar = Bar() + >>> import pytest + >>> with pytest.warns(None) as record: + >>> assert 'object at' in str(bar) + >>> assert 'object at' in repr(bar) + + Example: + >>> class Baz(NiceRepr): + ... def __len__(self): + ... return 5 + >>> baz = Baz() + >>> assert str(baz) == '' + """ + + def __nice__(self): + """str: a "nice" summary string describing this module""" + if hasattr(self, '__len__'): + # It is a common pattern for objects to use __len__ in __nice__ + # As a convenience we define a default __nice__ for these objects + return str(len(self)) + else: + # In all other cases force the subclass to overload __nice__ + raise NotImplementedError( + f'Define the __nice__ method for {self.__class__!r}') + + def __repr__(self): + """str: the string of the module""" + try: + nice = self.__nice__() + classname = self.__class__.__name__ + return f'<{classname}({nice}) at {hex(id(self))}>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) + + def __str__(self): + """str: the string of the module""" + try: + classname = self.__class__.__name__ + nice = self.__nice__() + return f'<{classname}({nice})>' + except NotImplementedError as ex: + warnings.warn(str(ex), category=RuntimeWarning) + return object.__repr__(self) diff --git a/mmcv/utils/version_utils.py b/mmcv/utils/version_utils.py new file mode 100644 index 0000000..a7dda06 --- /dev/null +++ b/mmcv/utils/version_utils.py @@ -0,0 +1,88 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import subprocess +import warnings + +from packaging.version import parse + +def digit_version(version_str: str, length: int = 4): + """Convert a version string into a tuple of integers. + + This method is usually used for comparing two versions. For pre-release + versions: alpha < beta < rc. + + Args: + version_str (str): The version string. + length (int): The maximum number of version levels. Default: 4. + + Returns: + tuple[int]: The version info in digits (integers). 
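+
+    Example (illustrative):
+        >>> digit_version('1.7.1')
+        (1, 7, 1, 0, 0, 0)
+        >>> digit_version('1.6.0rc1') < digit_version('1.6.0')
+        True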
+ """ + version = parse(version_str) + assert version.release, f'failed to parse version {version_str}' + release = list(version.release) + release = release[:length] + if len(release) < length: + release = release + [0] * (length - len(release)) + if version.is_prerelease: + mapping = {'a': -3, 'b': -2, 'rc': -1} + val = -4 + # version.pre can be None + if version.pre: + if version.pre[0] not in mapping: + warnings.warn(f'unknown prerelease version {version.pre[0]}, ' + 'version checking may go wrong') + else: + val = mapping[version.pre[0]] + release.extend([val, version.pre[-1]]) + else: + release.extend([val, 0]) + + elif version.is_postrelease: + release.extend([1, version.post]) + else: + release.extend([0, 0]) + return tuple(release) + + +def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen( + cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + return out + + +def get_git_hash(fallback='unknown', digits=None): + """Get the git hash of the current repo. + + Args: + fallback (str, optional): The fallback string when git hash is + unavailable. Defaults to 'unknown'. + digits (int, optional): kept digits of the hash. Defaults to None, + meaning all digits are kept. + + Returns: + str: Git commit hash. + """ + + if digits is not None and not isinstance(digits, int): + raise TypeError('digits must be None or an integer') + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + if digits is not None: + sha = sha[:digits] + except OSError: + sha = fallback + + return sha diff --git a/mmcv/utils/visual.py b/mmcv/utils/visual.py new file mode 100644 index 0000000..f9718af --- /dev/null +++ b/mmcv/utils/visual.py @@ -0,0 +1,24 @@ +import torch +from torchvision.utils import make_grid +import torchvision +import matplotlib.pyplot as plt +import cv2 + + +def convert_color(img_path): + plt.figure() + img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) + plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) + plt.close() + + +def save_tensor(tensor, path, pad_value=254.0,): + print('save_tensor', path) + tensor = tensor.to(torch.float).detach().cpu() + if tensor.type() == 'torch.BoolTensor': + tensor = tensor*255 + if len(tensor.shape) == 3: + tensor = tensor.unsqueeze(1) + tensor = make_grid(tensor, pad_value=pad_value, normalize=False).permute(1, 2, 0).numpy().copy() + torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) + convert_color(path) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e839ffe --- /dev/null +++ b/requirements.txt @@ -0,0 +1,48 @@ +cython +numba==0.48.0 # In order to speed up +addict +packaging +Pillow +matplotlib +regex;sys_platform=='win32' +pycocotools; platform_system == "Linux" +pycocotools-windows; platform_system == "Windows" +prettytable +six +terminaltables +lyft_dataset_sdk +nuscenes-devkit +scikit-image +tensorboard +cityscapesscripts +imagecorruptions +scipy +scikit-learn +open3d +networkx +ipython +opencv-python +seaborn +numpy==1.20.0 # In order to adapt numba +# metric related +einops +casadi +torchmetrics +motmetrics==1.1.3 # Fixed +trimesh +# pytest related +pytest +pytest-cov +pytest-runner +yapf==0.40.1 +flake8 +trimesh==2.35.39 +similaritymeasures +laspy==2.5.0 +lazrs==0.5.3 +py-trees==0.8.3 +simple_watchdog_timer 
+transforms3d +tabulate +ephem +dictor \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b759948 --- /dev/null +++ b/setup.py @@ -0,0 +1,224 @@ +import glob +import os +import platform +import re +from packaging.version import parse as parse_version +from setuptools import find_packages, setup +import torch +from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension + +EXT_TYPE = 'pytorch' +cmd_class = {'build_ext': BuildExtension} + +def make_cuda_ext(name, + module, + sources, + sources_cuda=[], + extra_args=[], + extra_include_path=[]): + + define_macros = [] + extra_compile_args = {'cxx': [] + extra_args} + + if torch.cuda.is_available(): + define_macros += [('WITH_CUDA', None)] + extension = CUDAExtension + extra_compile_args['nvcc'] = extra_args + [ + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', + ] + sources += sources_cuda + else: + print('Compiling {} without CUDA'.format(name)) + extension = CppExtension + + return extension( + name='{}.{}'.format(module, name), + sources=[os.path.join(*module.split('.'), p) for p in sources], + include_dirs=extra_include_path, + define_macros=define_macros, + extra_compile_args=extra_compile_args) + +def parse_requirements(fname='requirements.txt', with_version=True): + """Parse the package dependencies listed in a requirements file but strips + specific versioning information. + + Args: + fname (str): path to requirements file + with_version (bool, default=False): if True include version specs + + Returns: + List[str]: list of requirements items + + CommandLine: + python -c "import setup; print(setup.parse_requirements())" + """ + import sys + from os.path import exists + require_fpath = fname + + def parse_line(line): + """Parse information from a line in a requirements text file.""" + if line.startswith('-r '): + # Allow specifying requirements in other files + target = line.split(' ')[1] + for info in parse_require_file(target): + yield info + else: + info = {'line': line} + if line.startswith('-e '): + info['package'] = line.split('#egg=')[1] + else: + # Remove versioning from the package + pat = '(' + '|'.join(['>=', '==', '>']) + ')' + parts = re.split(pat, line, maxsplit=1) + parts = [p.strip() for p in parts] + + info['package'] = parts[0] + if len(parts) > 1: + op, rest = parts[1:] + if ';' in rest: + # Handle platform specific dependencies + # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies + version, platform_deps = map(str.strip, + rest.split(';')) + info['platform_deps'] = platform_deps + else: + version = rest # NOQA + info['version'] = (op, version) + yield info + + def parse_require_file(fpath): + with open(fpath, 'r') as f: + for line in f.readlines(): + line = line.strip() + if line and not line.startswith('#'): + for info in parse_line(line): + yield info + + def gen_packages_items(): + if exists(require_fpath): + for info in parse_require_file(require_fpath): + parts = [info['package']] + if with_version and 'version' in info: + parts.extend(info['version']) + if not sys.version.startswith('3.4'): + # apparently package_deps are broken in 3.4 + platform_deps = info.get('platform_deps') + if platform_deps is not None: + parts.append(';' + platform_deps) + item = ''.join(parts) + yield item + + packages = list(gen_packages_items()) + return packages + +def get_extensions(): + extensions = [] + + if EXT_TYPE == 'pytorch': + ext_name = 'mmcv._ext' + # 
prevent ninja from using too many resources + try: + import psutil + num_cpu = len(psutil.Process().cpu_affinity()) + cpu_use = max(4, num_cpu - 1) + except (ModuleNotFoundError, AttributeError): + cpu_use = 4 + + os.environ.setdefault('MAX_JOBS', str(cpu_use)) + define_macros = [] + + extra_compile_args = {'cxx': []} + if platform.system() != 'Windows': + if parse_version(torch.__version__) <= parse_version('1.12.1'): + extra_compile_args['cxx'] = ['-std=c++14'] + else: + extra_compile_args['cxx'] = ['-std=c++17'] + else: + if parse_version(torch.__version__) <= parse_version('1.12.1'): + extra_compile_args['cxx'] = ['/std:c++14'] + else: + extra_compile_args['cxx'] = ['/std:c++17'] + + include_dirs = [] + + if torch.cuda.is_available(): + define_macros += [('MMCV_WITH_CUDA', None)] + cuda_args = os.getenv('MMCV_CUDA_ARGS') + extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] + op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ + glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ + glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') + \ + glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cpp') + extension = CUDAExtension + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda')) + else: + print(f'Compiling {ext_name} without CUDA') + op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ + glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + extension = CppExtension + include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) + + if 'nvcc' in extra_compile_args and platform.system() != 'Windows': + if parse_version(torch.__version__) <= parse_version('1.12.1'): + extra_compile_args['nvcc'] += ['-std=c++14'] + else: + extra_compile_args['nvcc'] += ['-std=c++17'] + + ext_ops = extension( + name=ext_name, + sources=op_files, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args) + extensions.append(ext_ops) + + return extensions + +setup( + name='mmcv', + version='0.0.1', + description='OpenMMLab Computer Vision Foundation', + keywords='computer vision', + packages=[ + *find_packages(include=('mmcv', "mmcv.*")), + *find_packages(include=('adzoo', "adzoo.*")), + ], + include_package_data=True, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Topic :: Utilities', + ], + url='https://github.com/open-mmlab/mmcv', + author='MMCV Contributors', + author_email='openmmlab@gmail.com', + install_requires=parse_requirements(), + ext_modules= get_extensions() + [ + make_cuda_ext( + name='iou3d_cuda', + module='mmcv.ops.iou3d_det', + sources=[ + 'src/iou3d.cpp', + 'src/iou3d_kernel.cu', + ]), + make_cuda_ext( + name='roiaware_pool3d_ext', + module='mmcv.ops.roiaware_pool3d', + sources=[ + 'src/roiaware_pool3d.cpp', + 'src/points_in_boxes_cpu.cpp', + ], + sources_cuda=[ + 'src/roiaware_pool3d_kernel.cu', + 'src/points_in_boxes_cuda.cu', + ]), + ], + cmdclass=cmd_class, + zip_safe=False) diff --git a/team_code/pid_controller.py b/team_code/pid_controller.py new file mode 100644 index 0000000..af43e3c --- /dev/null +++ b/team_code/pid_controller.py @@ -0,0 +1,113 @@ +from collections import deque +import numpy as np + +class PID(object): + def __init__(self, K_P=1.0, K_I=0.0, K_D=0.0, n=20): + self._K_P = K_P + self._K_I = K_I + self._K_D = K_D + + self._window = deque([0 for _ in range(n)], 
maxlen=n) + self._max = 0.0 + self._min = 0.0 + + def step(self, error): + self._window.append(error) + self._max = max(self._max, abs(error)) + self._min = -abs(self._max) + + if len(self._window) >= 2: + integral = np.mean(self._window) + derivative = (self._window[-1] - self._window[-2]) + else: + integral = 0.0 + derivative = 0.0 + + return self._K_P * error + self._K_I * integral + self._K_D * derivative + + + +class PIDController(object): + + def __init__(self, turn_KP=0.75, turn_KI=0.75, turn_KD=0.3, turn_n=40, speed_KP=5.0, speed_KI=0.5,speed_KD=1.0, speed_n = 40,max_throttle=0.75, brake_speed=0.4,brake_ratio=1.1, clip_delta=0.25, aim_dist=4.0, angle_thresh=0.3, dist_thresh=10): + + self.turn_controller = PID(K_P=turn_KP, K_I=turn_KI, K_D=turn_KD, n=turn_n) + self.speed_controller = PID(K_P=speed_KP, K_I=speed_KI, K_D=speed_KD, n=speed_n) + self.max_throttle = max_throttle + self.brake_speed = brake_speed + self.brake_ratio = brake_ratio + self.clip_delta = clip_delta + self.aim_dist = aim_dist + self.angle_thresh = angle_thresh + self.dist_thresh = dist_thresh + + def control_pid(self, waypoints, speed, target): + ''' Predicts vehicle control with a PID controller. + Args: + waypoints (tensor): output of self.plan() + speed (tensor): speedometer input + ''' + + # iterate over vectors between predicted waypoints + num_pairs = len(waypoints) - 1 + best_norm = 1e5 + desired_speed = 0 + aim = waypoints[0] + for i in range(num_pairs): + # magnitude of vectors, used for speed + desired_speed += np.linalg.norm( + waypoints[i+1] - waypoints[i]) * 2.0 / num_pairs + + # norm of vector midpoints, used for steering + norm = np.linalg.norm((waypoints[i+1] + waypoints[i]) / 2.0) + if abs(self.aim_dist-best_norm) > abs(self.aim_dist-norm): + aim = waypoints[i] + best_norm = norm + + aim_last = waypoints[-1] - waypoints[-2] + + angle = np.degrees(np.pi / 2 - np.arctan2(aim[1], aim[0])) / 90 + angle_last = np.degrees(np.pi / 2 - np.arctan2(aim_last[1], aim_last[0])) / 90 + angle_target = np.degrees(np.pi / 2 - np.arctan2(target[1], target[0])) / 90 + + # choice of point to aim for steering, removing outlier predictions + # use target point if it has a smaller angle or if error is large + # predicted point otherwise + # (reduces noise in eg. 
straight roads, helps with sudden turn commands) + use_target_to_aim = np.abs(angle_target) < np.abs(angle) + use_target_to_aim = use_target_to_aim or (np.abs(angle_target-angle_last) > self.angle_thresh and target[1] < self.dist_thresh) + if use_target_to_aim: + angle_final = angle_target + else: + angle_final = angle + + steer = self.turn_controller.step(angle_final) + steer = np.clip(steer, -1.0, 1.0) + + brake = desired_speed < self.brake_speed or (speed / desired_speed) > self.brake_ratio + + delta = np.clip(desired_speed - speed, 0.0, self.clip_delta) + throttle = self.speed_controller.step(delta) + throttle = np.clip(throttle, 0.0, self.max_throttle) + throttle = throttle if not brake else 0.0 + + metadata = { + 'speed': float(speed.astype(np.float64)), + 'steer': float(steer), + 'throttle': float(throttle), + 'brake': float(brake), + 'wp_4': tuple(waypoints[3].astype(np.float64)), + 'wp_3': tuple(waypoints[2].astype(np.float64)), + 'wp_2': tuple(waypoints[1].astype(np.float64)), + 'wp_1': tuple(waypoints[0].astype(np.float64)), + 'aim': tuple(aim.astype(np.float64)), + 'target': tuple(target.astype(np.float64)), + 'desired_speed': float(desired_speed.astype(np.float64)), + 'angle': float(angle.astype(np.float64)), + 'angle_last': float(angle_last.astype(np.float64)), + 'angle_target': float(angle_target.astype(np.float64)), + 'angle_final': float(angle_final.astype(np.float64)), + 'delta': float(delta.astype(np.float64)), + } + + return steer, throttle, brake, metadata \ No newline at end of file diff --git a/team_code/planner.py b/team_code/planner.py new file mode 100644 index 0000000..ef3d4c6 --- /dev/null +++ b/team_code/planner.py @@ -0,0 +1,128 @@ +import os +from collections import deque + +import numpy as np +import math +EARTH_RADIUS_EQUA = 6378137.0 + + +DEBUG = int(os.environ.get('HAS_DISPLAY', 0)) + + +class Plotter(object): + def __init__(self, size): + self.size = size + self.clear() + self.title = str(self.size) + + def clear(self): + from PIL import Image, ImageDraw + + self.img = Image.fromarray(np.zeros((self.size, self.size, 3), dtype=np.uint8)) + self.draw = ImageDraw.Draw(self.img) + + def dot(self, pos, node, color=(255, 255, 255), r=2): + x, y = 5.5 * (pos - node) + x += self.size / 2 + y += self.size / 2 + + self.draw.ellipse((x-r, y-r, x+r, y+r), color) + + def show(self): + if not DEBUG: + return + + import cv2 + + cv2.imshow(self.title, cv2.cvtColor(np.array(self.img), cv2.COLOR_BGR2RGB)) + cv2.waitKey(1) + + +class RoutePlanner(object): + def __init__(self, min_distance, max_distance, debug_size=256, lat_ref=42.0, lon_ref=2.0): + self.route = deque() + self.min_distance = min_distance + self.max_distance = max_distance + + # self.mean = np.array([49.0, 8.0]) # for carla 9.9 + # self.scale = np.array([111324.60662786, 73032.1570362]) # for carla 9.9 + self.mean = np.array([0.0, 0.0]) # for carla 9.10 + self.scale = np.array([111324.60662786, 111319.490945]) # for carla 9.10 + + self.debug = Plotter(debug_size) + # self.lat_ref, self.lon_ref = self._get_latlon_ref() + self.lat_ref = lat_ref + self.lon_ref = lon_ref + + def set_route(self, global_plan, gps=False, global_plan_world = None): + self.route.clear() + + if global_plan_world: + for (pos, cmd), (pos_word, _ )in zip(global_plan, global_plan_world): + if gps: + pos = self.gps_to_location(np.array([pos['lat'], pos['lon']])) + # pos -= self.mean + # pos *= self.scale + else: + pos = np.array([pos.location.x, pos.location.y]) + # pos -= self.mean + + self.route.append((pos, cmd, pos_word)) + else: + for 
pos, cmd in global_plan: + if gps: + pos = self.gps_to_location(np.array([pos['lat'], pos['lon']])) + # pos -= self.mean + # pos *= self.scale + else: + pos = np.array([pos.location.x, pos.location.y]) + # pos -= self.mean + + self.route.append((pos, cmd)) + + def run_step(self, gps): + self.debug.clear() + + if len(self.route) == 1: + return self.route[0] + + to_pop = 0 + farthest_in_range = -np.inf + cumulative_distance = 0.0 + + for i in range(1, len(self.route)): + if cumulative_distance > self.max_distance: + break + + cumulative_distance += np.linalg.norm(self.route[i][0] - self.route[i-1][0]) + distance = np.linalg.norm(self.route[i][0] - gps) + + if distance <= self.min_distance and distance > farthest_in_range: + farthest_in_range = distance + to_pop = i + + r = 255 * int(distance > self.min_distance) + g = 255 * int(self.route[i][1].value == 4) + b = 255 + self.debug.dot(gps, self.route[i][0], (r, g, b)) + + for _ in range(to_pop): + if len(self.route) > 2: + self.route.popleft() + + self.debug.dot(gps, self.route[0][0], (0, 255, 0)) + self.debug.dot(gps, self.route[1][0], (255, 0, 0)) + self.debug.dot(gps, gps, (0, 0, 255)) + self.debug.show() + + return self.route[1] + + def gps_to_location(self, gps): + # gps content: numpy array: [lat, lon, alt] + lat, lon = gps + scale = math.cos(self.lat_ref * math.pi / 180.0) + my = math.log(math.tan((lat+90) * math.pi / 360.0)) * (EARTH_RADIUS_EQUA * scale) + mx = (lon * (math.pi * EARTH_RADIUS_EQUA * scale)) / 180.0 + y = scale * EARTH_RADIUS_EQUA * math.log(math.tan((90.0 + self.lat_ref) * math.pi / 360.0)) - my + x = mx - scale * self.lon_ref * math.pi * EARTH_RADIUS_EQUA / 180.0 + return np.array([x, y]) \ No newline at end of file diff --git a/team_code/uniad_b2d_agent.py b/team_code/uniad_b2d_agent.py new file mode 100644 index 0000000..efc2816 --- /dev/null +++ b/team_code/uniad_b2d_agent.py @@ -0,0 +1,433 @@ +import os +import json +import datetime +import pathlib +import time +import cv2 +import carla +from collections import deque +import math +from collections import OrderedDict +import torch +import carla +import numpy as np +from PIL import Image +from torchvision import transforms as T +from Bench2DriveZoo.team_code.pid_controller import PIDController +from Bench2DriveZoo.team_code.planner import RoutePlanner +from leaderboard.autoagents import autonomous_agent +from mmcv import Config +from mmcv.models import build_model +from mmcv.utils import (get_dist_info, init_dist, load_checkpoint,wrap_fp16_model) +from mmcv.datasets.pipelines import Compose +from mmcv.parallel.collate import collate as mm_collate_to_batch_form +from mmcv.core.bbox import get_box_type +from pyquaternion import Quaternion +from scipy.optimize import fsolve +SAVE_PATH = os.environ.get('SAVE_PATH', None) +IS_BENCH2DRIVE = os.environ.get('IS_BENCH2DRIVE', None) + +def get_entry_point(): + return 'UniadAgent' + +class UniadAgent(autonomous_agent.AutonomousAgent): + def setup(self, path_to_conf_file): + self.track = autonomous_agent.Track.SENSORS + self.steer_step = 0 + self.last_moving_status = 0 + self.last_moving_step = -1 + self.last_steers = deque() + self.pidcontroller = PIDController() + if IS_BENCH2DRIVE: + self.save_name = path_to_conf_file.split('+')[-1] + self.config_path = path_to_conf_file.split('+')[0] + else: + self.config_path = path_to_conf_file + self.save_name = '_'.join(map(lambda x: '%02d' % x, (now.month, now.day, now.hour, now.minute, now.second))) + self.step = -1 + self.wall_start = time.time() + self.initialized = False + cfg = 
Config.fromfile('Bench2DriveZoo/adzoo/uniad/configs/stage2_e2e/base_e2e_b2d.py') + cfg.model['motion_head']['anchor_info_path'] = os.path.join('Bench2DriveZoo',cfg.model['motion_head']['anchor_info_path']) + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + plugin_dir = os.path.join("Bench2DriveZoo", plugin_dir) + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + self.model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) + checkpoint = load_checkpoint(self.model, self.config_path, map_location='cpu', strict=True) + self.model.cuda() + self.model.eval() + self.inference_only_pipeline = [] + for inference_only_pipeline in cfg.inference_only_pipeline: + if inference_only_pipeline["type"] not in ['LoadMultiViewImageFromFilesInCeph']: + self.inference_only_pipeline.append(inference_only_pipeline) + self.inference_only_pipeline = Compose(self.inference_only_pipeline) + ckpt = torch.load(self.config_path) + ckpt = ckpt["state_dict"] + new_state_dict = OrderedDict() + for key, value in ckpt.items(): + new_key = key.replace("model.","") + new_state_dict[new_key] = value + self.takeover = False + self.stop_time = 0 + self.takeover_time = 0 + self.save_path = None + self._im_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])]) + self.last_steers = deque() + self.lat_ref, self.lon_ref = 42.0, 2.0 + control = carla.VehicleControl() + control.steer = 0.0 + control.throttle = 0.0 + control.brake = 0.0 + self.prev_control = control + if SAVE_PATH is not None: + now = datetime.datetime.now() + # string = pathlib.Path(os.environ['ROUTES']).stem + '_' + string = self.save_name + self.save_path = pathlib.Path(os.environ['SAVE_PATH']) / string + self.save_path.mkdir(parents=True, exist_ok=False) + (self.save_path / 'rgb_front').mkdir() + (self.save_path / 'rgb_front_right').mkdir() + (self.save_path / 'rgb_front_left').mkdir() + (self.save_path / 'rgb_back').mkdir() + (self.save_path / 'rgb_back_right').mkdir() + (self.save_path / 'rgb_back_left').mkdir() + (self.save_path / 'meta').mkdir() + (self.save_path / 'bev').mkdir() + + # write extrinsics directly + self.lidar2img = { + 'CAM_FRONT':np.array([[ 1.14251841e+03, 8.00000000e+02, 0.00000000e+00, -9.52000000e+02], + [ 0.00000000e+00, 4.50000000e+02, -1.14251841e+03, -8.09704417e+02], + [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, -1.19000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_FRONT_LEFT':np.array([[ 6.03961325e-14, 1.39475744e+03, 0.00000000e+00, -9.20539908e+02], + [-3.68618420e+02, 2.58109396e+02, -1.14251841e+03, -6.47296750e+02], + [-8.19152044e-01, 5.73576436e-01, 0.00000000e+00, -8.29094072e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_FRONT_RIGHT':np.array([[ 1.31064327e+03, -4.77035138e+02, 0.00000000e+00,-4.06010608e+02], + [ 3.68618420e+02, 2.58109396e+02, -1.14251841e+03,-6.47296750e+02], + [ 8.19152044e-01, 5.73576436e-01, 0.00000000e+00,-8.29094072e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_BACK':np.array([[-1.00000000e+00, -1.22464680e-16, 0.00000000e+00, -1.97168135e-16], + [ 0.00000000e+00, 0.00000000e+00, -1.00000000e+00, -2.40000000e-01], + [ 
1.22464680e-16, -1.00000000e+00, 0.00000000e+00, -1.61000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_BACK_LEFT':np.array([[-1.14251841e+03, 8.00000000e+02, 0.00000000e+00, -6.84385123e+02], + [-4.22861679e+02, -1.53909064e+02, -1.14251841e+03, -4.96004706e+02], + [-9.39692621e-01, -3.42020143e-01, 0.00000000e+00, -4.92889531e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + + 'CAM_BACK_RIGHT': np.array([[ 3.60989788e+02, -1.34723223e+03, 0.00000000e+00, -1.04238127e+02], + [ 4.22861679e+02, -1.53909064e+02, -1.14251841e+03, -4.96004706e+02], + [ 9.39692621e-01, -3.42020143e-01, 0.00000000e+00, -4.92889531e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) + } + self.lidar2cam = { + 'CAM_FRONT':np.array([[ 1. , 0. , 0. , 0. ], + [ 0. , 0. , -1. , -0.24], + [ 0. , 1. , 0. , -1.19], + [ 0. , 0. , 0. , 1. ]]), + 'CAM_FRONT_LEFT':np.array([[ 0.57357644, 0.81915204, 0. , -0.22517331], + [ 0. , 0. , -1. , -0.24 ], + [-0.81915204, 0.57357644, 0. , -0.82909407], + [ 0. , 0. , 0. , 1. ]]), + 'CAM_FRONT_RIGHT':np.array([[ 0.57357644, -0.81915204, 0. , 0.22517331], + [ 0. , 0. , -1. , -0.24 ], + [ 0.81915204, 0.57357644, 0. , -0.82909407], + [ 0. , 0. , 0. , 1. ]]), + 'CAM_BACK':np.array([[-1. , 0., 0., 0. ], + [ 0. , 0., -1., -0.24], + [ 0. , -1., 0., -1.61], + [ 0. , 0., 0., 1. ]]), + + 'CAM_BACK_LEFT':np.array([[-0.34202014, 0.93969262, 0. , -0.25388956], + [ 0. , 0. , -1. , -0.24 ], + [-0.93969262, -0.34202014, 0. , -0.49288953], + [ 0. , 0. , 0. , 1. ]]), + + 'CAM_BACK_RIGHT':np.array([[-0.34202014, -0.93969262, 0. , 0.25388956], + [ 0. , 0. , -1. , -0.24 ], + [ 0.93969262, -0.34202014 , 0. , -0.49288953], + [ 0. , 0. , 0. , 1. ]]) + } + self.lidar2ego = np.array([[ 0. , 1. , 0. , -0.39], + [-1. , 0. , 0. , 0. ], + [ 0. , 0. , 1. , 1.84], + [ 0. , 0. , 0. , 1. 
]]) + + topdown_extrinsics = np.array([[0.0, -0.0, -1.0, 50.0], [0.0, 1.0, -0.0, 0.0], [1.0, -0.0, 0.0, -0.0], [0.0, 0.0, 0.0, 1.0]]) + unreal2cam = np.array([[0,1,0,0], [0,0,-1,0], [1,0,0,0], [0,0,0,1]]) + self.coor2topdown = unreal2cam @ topdown_extrinsics + topdown_intrinsics = np.array([[548.993771650447, 0.0, 256.0, 0], [0.0, 548.993771650447, 256.0, 0], [0.0, 0.0, 1.0, 0], [0, 0, 0, 1.0]]) + self.coor2topdown = topdown_intrinsics @ self.coor2topdown + + def _init(self): + + try: + locx, locy = self._global_plan_world_coord[0][0].location.x, self._global_plan_world_coord[0][0].location.y + lon, lat = self._global_plan[0][0]['lon'], self._global_plan[0][0]['lat'] + EARTH_RADIUS_EQUA = 6378137.0 + def equations(vars): + x, y = vars + eq1 = lon * math.cos(x * math.pi / 180) - (locx * x * 180) / (math.pi * EARTH_RADIUS_EQUA) - math.cos(x * math.pi / 180) * y + eq2 = math.log(math.tan((lat + 90) * math.pi / 360)) * EARTH_RADIUS_EQUA * math.cos(x * math.pi / 180) + locy - math.cos(x * math.pi / 180) * EARTH_RADIUS_EQUA * math.log(math.tan((90 + x) * math.pi / 360)) + return [eq1, eq2] + initial_guess = [0, 0] + solution = fsolve(equations, initial_guess) + self.lat_ref, self.lon_ref = solution[0], solution[1] + except Exception as e: + print(e, flush=True) + self.lat_ref, self.lon_ref = 0, 0 + self._route_planner = RoutePlanner(4.0, 50.0, lat_ref=self.lat_ref, lon_ref=self.lon_ref) + self._route_planner.set_route(self._global_plan, True) + self.initialized = True + + + + def sensors(self): + sensors =[ + # camera rgb + { + 'type': 'sensor.camera.rgb', + 'x': 0.80, 'y': 0.0, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 0.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_FRONT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': 0.27, 'y': -0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': -55.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_FRONT_LEFT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': 0.27, 'y': 0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 55.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_FRONT_RIGHT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': -2.0, 'y': 0.0, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 180.0, + 'width': 1600, 'height': 900, 'fov': 110, + 'id': 'CAM_BACK' + }, + { + 'type': 'sensor.camera.rgb', + 'x': -0.32, 'y': -0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': -110.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_BACK_LEFT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': -0.32, 'y': 0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 110.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_BACK_RIGHT' + }, + # imu + { + 'type': 'sensor.other.imu', + 'x': -1.4, 'y': 0.0, 'z': 0.0, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 0.0, + 'sensor_tick': 0.05, + 'id': 'IMU' + }, + # gps + { + 'type': 'sensor.other.gnss', + 'x': -1.4, 'y': 0.0, 'z': 0.0, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 0.0, + 'sensor_tick': 0.01, + 'id': 'GPS' + }, + # speed + { + 'type': 'sensor.speedometer', + 'reading_frequency': 20, + 'id': 'SPEED' + }, + + ] + + if IS_BENCH2DRIVE: + sensors += [ + { + 'type': 'sensor.camera.rgb', + 'x': 0.0, 'y': 0.0, 'z': 50.0, + 'roll': 0.0, 'pitch': -90.0, 'yaw': 0.0, + 'width': 512, 'height': 512, 'fov': 5 * 10.0, + 'id': 'bev' + }] + return sensors + + def tick(self, input_data): + self.step += 1 + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 20] + imgs = {} + for cam in ['CAM_FRONT','CAM_FRONT_LEFT','CAM_FRONT_RIGHT','CAM_BACK','CAM_BACK_LEFT','CAM_BACK_RIGHT']: + img = 
cv2.cvtColor(input_data[cam][1][:, :, :3], cv2.COLOR_BGR2RGB) + _, img = cv2.imencode('.jpg', img, encode_param) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + imgs[cam] = img + bev = cv2.cvtColor(input_data['bev'][1][:, :, :3], cv2.COLOR_BGR2RGB) + gps = input_data['GPS'][1][:2] + speed = input_data['SPEED'][1]['speed'] + compass = input_data['IMU'][1][-1] + acceleration = input_data['IMU'][1][:3] + angular_velocity = input_data['IMU'][1][3:6] + pos = self.gps_to_location(gps) + near_node, near_command = self._route_planner.run_step(pos) + if (math.isnan(compass) == True): #It can happen that the compass sends nan for a few frames + compass = 0.0 + acceleration = np.zeros(3) + angular_velocity = np.zeros(3) + + result = { + 'imgs': imgs, + 'gps': gps, + 'pos':pos, + 'speed': speed, + 'compass': compass, + 'bev': bev, + 'acceleration':acceleration, + 'angular_velocity':angular_velocity, + 'command_near':near_command, + 'command_near_xy':near_node + + } + + return result + + @torch.no_grad() + def run_step(self, input_data, timestamp): + if not self.initialized: + self._init() + tick_data = self.tick(input_data) + results = {} + results['lidar2img'] = [] + results['lidar2cam'] = [] + results['img'] = [] + results['folder'] = ' ' + results['scene_token'] = ' ' + results['frame_idx'] = 0 + results['timestamp'] = self.step / 20 + results['box_type_3d'], _ = get_box_type('LiDAR') + + for cam in ['CAM_FRONT','CAM_FRONT_LEFT','CAM_FRONT_RIGHT','CAM_BACK','CAM_BACK_LEFT','CAM_BACK_RIGHT']: + results['lidar2img'].append(self.lidar2img[cam]) + results['lidar2cam'].append(self.lidar2cam[cam]) + results['img'].append(tick_data['imgs'][cam]) + results['lidar2img'] = np.stack(results['lidar2img'],axis=0) + results['lidar2cam'] = np.stack(results['lidar2cam'],axis=0) + + raw_theta = tick_data['compass'] if not np.isnan(tick_data['compass']) else 0 + ego_theta = -raw_theta + np.pi/2 + rotation = list(Quaternion(axis=[0, 0, 1], radians=ego_theta)) + + can_bus = np.zeros(18) + can_bus[0] = tick_data['pos'][0] + can_bus[1] = -tick_data['pos'][1] + can_bus[3:7] = rotation + can_bus[7] = tick_data['speed'] + can_bus[10:13] = tick_data['acceleration'] + can_bus[11] *= -1 + can_bus[13:16] = -tick_data['angular_velocity'] + can_bus[16] = ego_theta + can_bus[17] = ego_theta / np.pi * 180 + results['can_bus'] = can_bus + command = tick_data['command_near'] + if command < 0: + command = 4 + command -= 1 + results['command'] = command + + theta_to_lidar = raw_theta + command_near_xy = np.array([tick_data['command_near_xy'][0]-can_bus[0],-tick_data['command_near_xy'][1]-can_bus[1]]) + rotation_matrix = np.array([[np.cos(theta_to_lidar),-np.sin(theta_to_lidar)],[np.sin(theta_to_lidar),np.cos(theta_to_lidar)]]) + local_command_xy = rotation_matrix @ command_near_xy + + ego2world = np.eye(4) + ego2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=ego_theta).rotation_matrix + ego2world[0:3,0:2] = can_bus[0:2] + lidar2global = ego2world @ self.lidar2ego + results['l2g_r_mat'] = lidar2global[0:3,0:3] + results['l2g_t'] = lidar2global[0:3,3] + stacked_imgs = np.stack(results['img'],axis=-1) + results['img_shape'] = stacked_imgs.shape + results['ori_shape'] = stacked_imgs.shape + results['pad_shape'] = stacked_imgs.shape + results = self.inference_only_pipeline(results) + self.device="cuda" + input_data_batch = mm_collate_to_batch_form([results], samples_per_gpu=1) + for key, data in input_data_batch.items(): + if key != 'img_metas': + if torch.is_tensor(data[0]): + data[0] = data[0].to(self.device) + output_data_batch = 
self.model(input_data_batch, return_loss=False, rescale=True) + out_truck = output_data_batch[0]['planning']['result_planning']['sdc_traj'][0].cpu().numpy() + steer_traj, throttle_traj, brake_traj, metadata_traj = self.pidcontroller.control_pid(out_truck, tick_data['speed'], local_command_xy) + if brake_traj < 0.05: brake_traj = 0.0 + if throttle_traj > brake_traj: brake_traj = 0.0 + if tick_data['speed']>5: + throttle_traj = 0 + control = carla.VehicleControl() + self.pid_metadata = metadata_traj + self.pid_metadata['agent'] = 'only_traj' + control.steer = np.clip(float(steer_traj), -1, 1) + control.throttle = np.clip(float(throttle_traj), 0, 0.75) + control.brake = np.clip(float(brake_traj), 0, 1) + self.pid_metadata['steer'] = control.steer + self.pid_metadata['throttle'] = control.throttle + self.pid_metadata['brake'] = control.brake + self.pid_metadata['steer_traj'] = float(steer_traj) + self.pid_metadata['throttle_traj'] = float(throttle_traj) + self.pid_metadata['brake_traj'] = float(brake_traj) + self.pid_metadata['plan'] = out_truck.tolist() + if SAVE_PATH is not None and self.step % 10 == 0: + self.save(tick_data) + self.prev_control = control + return control + + def save(self, tick_data): + frame = self.step // 10 + Image.fromarray(tick_data['imgs']['CAM_FRONT']).save(self.save_path / 'rgb_front' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_FRONT_LEFT']).save(self.save_path / 'rgb_front_left' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_FRONT_RIGHT']).save(self.save_path / 'rgb_front_right' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_BACK']).save(self.save_path / 'rgb_back' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_BACK_LEFT']).save(self.save_path / 'rgb_back_left' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_BACK_RIGHT']).save(self.save_path / 'rgb_back_right' / ('%04d.png' % frame)) + Image.fromarray(tick_data['bev']).save(self.save_path / 'bev' / ('%04d.png' % frame)) + outfile = open(self.save_path / 'meta' / ('%04d.json' % frame), 'w') + json.dump(self.pid_metadata, outfile, indent=4) + outfile.close() + + def destroy(self): + del self.model + torch.cuda.empty_cache() + + def gps_to_location(self, gps): + EARTH_RADIUS_EQUA = 6378137.0 + # gps content: numpy array: [lat, lon, alt] + lat, lon = gps + scale = math.cos(self.lat_ref * math.pi / 180.0) + my = math.log(math.tan((lat+90) * math.pi / 360.0)) * (EARTH_RADIUS_EQUA * scale) + mx = (lon * (math.pi * EARTH_RADIUS_EQUA * scale)) / 180.0 + y = scale * EARTH_RADIUS_EQUA * math.log(math.tan((90.0 + self.lat_ref) * math.pi / 360.0)) - my + x = mx - scale * self.lon_ref * math.pi * EARTH_RADIUS_EQUA / 180.0 + return np.array([x, y]) \ No newline at end of file diff --git a/team_code/vad_b2d_agent.py b/team_code/vad_b2d_agent.py new file mode 100644 index 0000000..13c09b7 --- /dev/null +++ b/team_code/vad_b2d_agent.py @@ -0,0 +1,460 @@ +import os +import json +import datetime +import pathlib +import time +import cv2 +import carla +from collections import deque +import math +from collections import OrderedDict +from scipy.optimize import fsolve +import torch +import carla +import numpy as np +from PIL import Image +from torchvision import transforms as T +from Bench2DriveZoo.team_code.pid_controller import PIDController +from Bench2DriveZoo.team_code.planner import RoutePlanner +from leaderboard.autoagents import autonomous_agent +from mmcv import Config +from mmcv.models import build_model +from mmcv.utils import 
(get_dist_info, init_dist, load_checkpoint, + wrap_fp16_model) +from mmcv.datasets.pipelines import Compose +from mmcv.parallel.collate import collate as mm_collate_to_batch_form +from mmcv.core.bbox import get_box_type +from pyquaternion import Quaternion + +SAVE_PATH = os.environ.get('SAVE_PATH', None) +IS_BENCH2DRIVE = os.environ.get('IS_BENCH2DRIVE', None) + + +def get_entry_point(): + return 'VadAgent' + + +class VadAgent(autonomous_agent.AutonomousAgent): + def setup(self, path_to_conf_file): + self.track = autonomous_agent.Track.SENSORS + self.steer_step = 0 + self.last_moving_status = 0 + self.last_moving_step = -1 + self.last_steer = 0 + self.pidcontroller = PIDController() + if IS_BENCH2DRIVE: + self.save_name = path_to_conf_file.split('+')[-1] + self.config_path = path_to_conf_file.split('+')[0] + else: + self.config_path = path_to_conf_file + self.save_name = '_'.join(map(lambda x: '%02d' % x, (now.month, now.day, now.hour, now.minute, now.second))) + self.step = -1 + self.wall_start = time.time() + self.initialized = False + cfg = Config.fromfile('Bench2DriveZoo/adzoo/vad/configs/VAD/VAD_base_e2e_b2d.py') + if hasattr(cfg, 'plugin'): + if cfg.plugin: + import importlib + if hasattr(cfg, 'plugin_dir'): + plugin_dir = cfg.plugin_dir + plugin_dir = os.path.join("Bench2DriveZoo", plugin_dir) + _module_dir = os.path.dirname(plugin_dir) + _module_dir = _module_dir.split('/') + _module_path = _module_dir[0] + for m in _module_dir[1:]: + _module_path = _module_path + '.' + m + print(_module_path) + plg_lib = importlib.import_module(_module_path) + + self.model = build_model(cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) + checkpoint = load_checkpoint(self.model, self.config_path, map_location='cpu', strict=True) + self.model.cuda() + self.model.eval() + self.inference_only_pipeline = [] + for inference_only_pipeline in cfg.inference_only_pipeline: + if inference_only_pipeline["type"] not in ['LoadMultiViewImageFromFilesInCeph','LoadMultiViewImageFromFiles']: + self.inference_only_pipeline.append(inference_only_pipeline) + + self.inference_only_pipeline = Compose(self.inference_only_pipeline) + ckpt = torch.load(self.config_path) + ckpt = ckpt["state_dict"] + new_state_dict = OrderedDict() + for key, value in ckpt.items(): + new_key = key.replace("model.","") + new_state_dict[new_key] = value + + self.takeover = False + self.stop_time = 0 + self.takeover_time = 0 + self.save_path = None + self._im_transform = T.Compose([T.ToTensor(), T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])]) + self.lat_ref, self.lon_ref = 42.0, 2.0 + + control = carla.VehicleControl() + control.steer = 0.0 + control.throttle = 0.0 + control.brake = 0.0 + self.prev_control = control + self.prev_control_cache = [] + if SAVE_PATH is not None: + now = datetime.datetime.now() + string = pathlib.Path(os.environ['ROUTES']).stem + '_' + string += self.save_name + self.save_path = pathlib.Path(os.environ['SAVE_PATH']) / string + self.save_path.mkdir(parents=True, exist_ok=False) + (self.save_path / 'rgb_front').mkdir() + (self.save_path / 'rgb_front_right').mkdir() + (self.save_path / 'rgb_front_left').mkdir() + (self.save_path / 'rgb_back').mkdir() + (self.save_path / 'rgb_back_right').mkdir() + (self.save_path / 'rgb_back_left').mkdir() + (self.save_path / 'meta').mkdir() + (self.save_path / 'bev').mkdir() + + self.lidar2img = { + 'CAM_FRONT':np.array([[ 1.14251841e+03, 8.00000000e+02, 0.00000000e+00, -9.52000000e+02], + [ 0.00000000e+00, 4.50000000e+02, -1.14251841e+03, 
-8.09704417e+02], + [ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00, -1.19000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_FRONT_LEFT':np.array([[ 6.03961325e-14, 1.39475744e+03, 0.00000000e+00, -9.20539908e+02], + [-3.68618420e+02, 2.58109396e+02, -1.14251841e+03, -6.47296750e+02], + [-8.19152044e-01, 5.73576436e-01, 0.00000000e+00, -8.29094072e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_FRONT_RIGHT':np.array([[ 1.31064327e+03, -4.77035138e+02, 0.00000000e+00,-4.06010608e+02], + [ 3.68618420e+02, 2.58109396e+02, -1.14251841e+03,-6.47296750e+02], + [ 8.19152044e-01, 5.73576436e-01, 0.00000000e+00,-8.29094072e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_BACK':np.array([[-1.00000000e+00, -1.22464680e-16, 0.00000000e+00, -1.97168135e-16], + [ 0.00000000e+00, 0.00000000e+00, -1.00000000e+00, -2.40000000e-01], + [ 1.22464680e-16, -1.00000000e+00, 0.00000000e+00, -1.61000000e+00], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + 'CAM_BACK_LEFT':np.array([[-1.14251841e+03, 8.00000000e+02, 0.00000000e+00, -6.84385123e+02], + [-4.22861679e+02, -1.53909064e+02, -1.14251841e+03, -4.96004706e+02], + [-9.39692621e-01, -3.42020143e-01, 0.00000000e+00, -4.92889531e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]), + + 'CAM_BACK_RIGHT': np.array([[ 3.60989788e+02, -1.34723223e+03, 0.00000000e+00, -1.04238127e+02], + [ 4.22861679e+02, -1.53909064e+02, -1.14251841e+03, -4.96004706e+02], + [ 9.39692621e-01, -3.42020143e-01, 0.00000000e+00, -4.92889531e-01], + [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00]]) + } + self.lidar2cam = { + 'CAM_FRONT':np.array([[ 1. , 0. , 0. , 0. ], + [ 0. , 0. , -1. , -0.24], + [ 0. , 1. , 0. , -1.19], + [ 0. , 0. , 0. , 1. ]]), + 'CAM_FRONT_LEFT':np.array([[ 0.57357644, 0.81915204, 0. , -0.22517331], + [ 0. , 0. , -1. , -0.24 ], + [-0.81915204, 0.57357644, 0. , -0.82909407], + [ 0. , 0. , 0. , 1. ]]), + 'CAM_FRONT_RIGHT':np.array([[ 0.57357644, -0.81915204, 0. , 0.22517331], + [ 0. , 0. , -1. , -0.24 ], + [ 0.81915204, 0.57357644, 0. , -0.82909407], + [ 0. , 0. , 0. , 1. ]]), + 'CAM_BACK':np.array([[-1. , 0., 0., 0. ], + [ 0. , 0., -1., -0.24], + [ 0. , -1., 0., -1.61], + [ 0. , 0., 0., 1. ]]), + + 'CAM_BACK_LEFT':np.array([[-0.34202014, 0.93969262, 0. , -0.25388956], + [ 0. , 0. , -1. , -0.24 ], + [-0.93969262, -0.34202014, 0. , -0.49288953], + [ 0. , 0. , 0. , 1. ]]), + + 'CAM_BACK_RIGHT':np.array([[-0.34202014, -0.93969262, 0. , 0.25388956], + [ 0. , 0. , -1. , -0.24 ], + [ 0.93969262, -0.34202014 , 0. , -0.49288953], + [ 0. , 0. , 0. , 1. ]]) + } + self.lidar2ego = np.array([[ 0. , 1. , 0. , -0.39], + [-1. , 0. , 0. , 0. ], + [ 0. , 0. , 1. , 1.84], + [ 0. , 0. , 0. , 1. 
]]) + + topdown_extrinsics = np.array([[0.0, -0.0, -1.0, 50.0], [0.0, 1.0, -0.0, 0.0], [1.0, -0.0, 0.0, -0.0], [0.0, 0.0, 0.0, 1.0]]) + unreal2cam = np.array([[0,1,0,0], [0,0,-1,0], [1,0,0,0], [0,0,0,1]]) + self.coor2topdown = unreal2cam @ topdown_extrinsics + topdown_intrinsics = np.array([[548.993771650447, 0.0, 256.0, 0], [0.0, 548.993771650447, 256.0, 0], [0.0, 0.0, 1.0, 0], [0, 0, 0, 1.0]]) + self.coor2topdown = topdown_intrinsics @ self.coor2topdown + + def _init(self): + try: + locx, locy = self._global_plan_world_coord[0][0].location.x, self._global_plan_world_coord[0][0].location.y + lon, lat = self._global_plan[0][0]['lon'], self._global_plan[0][0]['lat'] + EARTH_RADIUS_EQUA = 6378137.0 + def equations(vars): + x, y = vars + eq1 = lon * math.cos(x * math.pi / 180) - (locx * x * 180) / (math.pi * EARTH_RADIUS_EQUA) - math.cos(x * math.pi / 180) * y + eq2 = math.log(math.tan((lat + 90) * math.pi / 360)) * EARTH_RADIUS_EQUA * math.cos(x * math.pi / 180) + locy - math.cos(x * math.pi / 180) * EARTH_RADIUS_EQUA * math.log(math.tan((90 + x) * math.pi / 360)) + return [eq1, eq2] + initial_guess = [0, 0] + solution = fsolve(equations, initial_guess) + self.lat_ref, self.lon_ref = solution[0], solution[1] + except Exception as e: + print(e, flush=True) + self.lat_ref, self.lon_ref = 0, 0 + self._route_planner = RoutePlanner(4.0, 50.0, lat_ref=self.lat_ref, lon_ref=self.lon_ref) + self._route_planner.set_route(self._global_plan, True) + self.initialized = True + + + + def sensors(self): + sensors =[ + # camera rgb + { + 'type': 'sensor.camera.rgb', + 'x': 0.80, 'y': 0.0, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 0.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_FRONT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': 0.27, 'y': -0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': -55.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_FRONT_LEFT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': 0.27, 'y': 0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 55.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_FRONT_RIGHT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': -2.0, 'y': 0.0, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 180.0, + 'width': 1600, 'height': 900, 'fov': 110, + 'id': 'CAM_BACK' + }, + { + 'type': 'sensor.camera.rgb', + 'x': -0.32, 'y': -0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': -110.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_BACK_LEFT' + }, + { + 'type': 'sensor.camera.rgb', + 'x': -0.32, 'y': 0.55, 'z': 1.60, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 110.0, + 'width': 1600, 'height': 900, 'fov': 70, + 'id': 'CAM_BACK_RIGHT' + }, + # imu + { + 'type': 'sensor.other.imu', + 'x': -1.4, 'y': 0.0, 'z': 0.0, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 0.0, + 'sensor_tick': 0.05, + 'id': 'IMU' + }, + # gps + { + 'type': 'sensor.other.gnss', + 'x': -1.4, 'y': 0.0, 'z': 0.0, + 'roll': 0.0, 'pitch': 0.0, 'yaw': 0.0, + 'sensor_tick': 0.01, + 'id': 'GPS' + }, + # speed + { + 'type': 'sensor.speedometer', + 'reading_frequency': 20, + 'id': 'SPEED' + }, + ] + if IS_BENCH2DRIVE: + sensors += [ + { + 'type': 'sensor.camera.rgb', + 'x': 0.0, 'y': 0.0, 'z': 50.0, + 'roll': 0.0, 'pitch': -90.0, 'yaw': 0.0, + 'width': 512, 'height': 512, 'fov': 5 * 10.0, + 'id': 'bev' + }] + return sensors + + def tick(self, input_data): + self.step += 1 + encode_param = [int(cv2.IMWRITE_JPEG_QUALITY), 20] + imgs = {} + for cam in ['CAM_FRONT','CAM_FRONT_LEFT','CAM_FRONT_RIGHT','CAM_BACK','CAM_BACK_LEFT','CAM_BACK_RIGHT']: + img = 
cv2.cvtColor(input_data[cam][1][:, :, :3], cv2.COLOR_BGR2RGB) + _, img = cv2.imencode('.jpg', img, encode_param) + img = cv2.imdecode(img, cv2.IMREAD_COLOR) + imgs[cam] = img + bev = cv2.cvtColor(input_data['bev'][1][:, :, :3], cv2.COLOR_BGR2RGB) + gps = input_data['GPS'][1][:2] + speed = input_data['SPEED'][1]['speed'] + compass = input_data['IMU'][1][-1] + acceleration = input_data['IMU'][1][:3] + angular_velocity = input_data['IMU'][1][3:6] + + pos = self.gps_to_location(gps) + near_node, near_command = self._route_planner.run_step(pos) + + if (math.isnan(compass) == True): #It can happen that the compass sends nan for a few frames + compass = 0.0 + acceleration = np.zeros(3) + angular_velocity = np.zeros(3) + + result = { + 'imgs': imgs, + 'gps': gps, + 'pos':pos, + 'speed': speed, + 'compass': compass, + 'bev': bev, + 'acceleration':acceleration, + 'angular_velocity':angular_velocity, + 'command_near':near_command, + 'command_near_xy':near_node + } + + return result + + @torch.no_grad() + def run_step(self, input_data, timestamp): + if not self.initialized: + self._init() + tick_data = self.tick(input_data) + results = {} + results['lidar2img'] = [] + results['lidar2cam'] = [] + results['img'] = [] + results['folder'] = ' ' + results['scene_token'] = ' ' + results['frame_idx'] = 0 + results['timestamp'] = self.step / 20 + results['box_type_3d'], _ = get_box_type('LiDAR') + + for cam in ['CAM_FRONT','CAM_FRONT_LEFT','CAM_FRONT_RIGHT','CAM_BACK','CAM_BACK_LEFT','CAM_BACK_RIGHT']: + results['lidar2img'].append(self.lidar2img[cam]) + results['lidar2cam'].append(self.lidar2cam[cam]) + results['img'].append(tick_data['imgs'][cam]) + results['lidar2img'] = np.stack(results['lidar2img'],axis=0) + results['lidar2cam'] = np.stack(results['lidar2cam'],axis=0) + raw_theta = tick_data['compass'] if not np.isnan(tick_data['compass']) else 0 + ego_theta = -raw_theta + np.pi/2 + rotation = list(Quaternion(axis=[0, 0, 1], radians=ego_theta)) + can_bus = np.zeros(18) + can_bus[0] = tick_data['pos'][0] + can_bus[1] = -tick_data['pos'][1] + can_bus[3:7] = rotation + can_bus[7] = tick_data['speed'] + can_bus[10:13] = tick_data['acceleration'] + can_bus[11] *= -1 + can_bus[13:16] = -tick_data['angular_velocity'] + can_bus[16] = ego_theta + can_bus[17] = ego_theta / np.pi * 180 + results['can_bus'] = can_bus + ego_lcf_feat = np.zeros(9) + ego_lcf_feat[0:2] = can_bus[0:2].copy() + ego_lcf_feat[2:4] = can_bus[10:12].copy() + ego_lcf_feat[4] = rotation[-1] + ego_lcf_feat[5] = 4.89238167 + ego_lcf_feat[6] = 1.83671331 + ego_lcf_feat[7] = np.sqrt(can_bus[0]**2+can_bus[1]**2) + + if len(self.prev_control_cache)<10: + ego_lcf_feat[8] = 0 + else: + ego_lcf_feat[8] = self.prev_control_cache[0].steer + + command = tick_data['command_near'] + if command < 0: + command = 4 + command -= 1 + results['command'] = command + command_onehot = np.zeros(6) + command_onehot[command] = 1 + results['ego_fut_cmd'] = command_onehot + theta_to_lidar = raw_theta + command_near_xy = np.array([tick_data['command_near_xy'][0]-can_bus[0],-tick_data['command_near_xy'][1]-can_bus[1]]) + rotation_matrix = np.array([[np.cos(theta_to_lidar),-np.sin(theta_to_lidar)],[np.sin(theta_to_lidar),np.cos(theta_to_lidar)]]) + local_command_xy = rotation_matrix @ command_near_xy + + ego2world = np.eye(4) + ego2world[0:3,0:3] = Quaternion(axis=[0, 0, 1], radians=ego_theta).rotation_matrix + ego2world[0:2,3] = can_bus[0:2] + lidar2global = ego2world @ self.lidar2ego + results['l2g_r_mat'] = lidar2global[0:3,0:3] + results['l2g_t'] = lidar2global[0:3,3] + 
stacked_imgs = np.stack(results['img'],axis=-1) + results['img_shape'] = stacked_imgs.shape + results['ori_shape'] = stacked_imgs.shape + results['pad_shape'] = stacked_imgs.shape + results = self.inference_only_pipeline(results) + self.device="cuda" + input_data_batch = mm_collate_to_batch_form([results], samples_per_gpu=1) + for key, data in input_data_batch.items(): + if key != 'img_metas': + if torch.is_tensor(data[0]): + data[0] = data[0].to(self.device) + output_data_batch = self.model(input_data_batch, return_loss=False, rescale=True) + all_out_truck_d1 = output_data_batch[0]['pts_bbox']['ego_fut_preds'].cpu().numpy() + all_out_truck = np.cumsum(all_out_truck_d1,axis=1) + out_truck = all_out_truck[command] + steer_traj, throttle_traj, brake_traj, metadata_traj = self.pidcontroller.control_pid(out_truck, tick_data['speed'], local_command_xy) + if brake_traj < 0.05: brake_traj = 0.0 + if throttle_traj > brake_traj: brake_traj = 0.0 + + control = carla.VehicleControl() + self.pid_metadata = metadata_traj + self.pid_metadata['agent'] = 'only_traj' + control.steer = np.clip(float(steer_traj), -1, 1) + control.throttle = np.clip(float(throttle_traj), 0, 0.75) + control.brake = np.clip(float(brake_traj), 0, 1) + self.pid_metadata['steer'] = control.steer + self.pid_metadata['throttle'] = control.throttle + self.pid_metadata['brake'] = control.brake + self.pid_metadata['steer_traj'] = float(steer_traj) + self.pid_metadata['throttle_traj'] = float(throttle_traj) + self.pid_metadata['brake_traj'] = float(brake_traj) + self.pid_metadata['plan'] = out_truck.tolist() + self.pid_metadata['command'] = command + self.pid_metadata['all_plan'] = all_out_truck.tolist() + + if SAVE_PATH is not None and self.step % 10 == 0: + self.save(tick_data) + self.prev_control = control + + if len(self.prev_control_cache)==10: + self.prev_control_cache.pop(0) + self.prev_control_cache.append(control) + return control + + + def save(self, tick_data): + frame = self.step // 10 + + Image.fromarray(tick_data['imgs']['CAM_FRONT']).save(self.save_path / 'rgb_front' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_FRONT_LEFT']).save(self.save_path / 'rgb_front_left' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_FRONT_RIGHT']).save(self.save_path / 'rgb_front_right' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_BACK']).save(self.save_path / 'rgb_back' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_BACK_LEFT']).save(self.save_path / 'rgb_back_left' / ('%04d.png' % frame)) + Image.fromarray(tick_data['imgs']['CAM_BACK_RIGHT']).save(self.save_path / 'rgb_back_right' / ('%04d.png' % frame)) + Image.fromarray(tick_data['bev']).save(self.save_path / 'bev' / ('%04d.png' % frame)) + + outfile = open(self.save_path / 'meta' / ('%04d.json' % frame), 'w') + json.dump(self.pid_metadata, outfile, indent=4) + outfile.close() + + def destroy(self): + del self.model + torch.cuda.empty_cache() + + def gps_to_location(self, gps): + EARTH_RADIUS_EQUA = 6378137.0 + # gps content: numpy array: [lat, lon, alt] + lat, lon = gps + scale = math.cos(self.lat_ref * math.pi / 180.0) + my = math.log(math.tan((lat+90) * math.pi / 360.0)) * (EARTH_RADIUS_EQUA * scale) + mx = (lon * (math.pi * EARTH_RADIUS_EQUA * scale)) / 180.0 + y = scale * EARTH_RADIUS_EQUA * math.log(math.tan((90.0 + self.lat_ref) * math.pi / 360.0)) - my + x = mx - scale * self.lon_ref * math.pi * EARTH_RADIUS_EQUA / 180.0 + return np.array([x, y]) \ No newline at end of file