diff --git a/README.md b/README.md
index a4c280d9..e56701b7 100644
--- a/README.md
+++ b/README.md
@@ -126,19 +126,18 @@ If you want to know about `AWML`, you should read following pages.
 
 - :star: is recommended to use
 
-| Task              | Model                                         | Use for Autoware   |
-| ----------------- | --------------------------------------------- | ------------------ |
-| 3D detection      | [CenterPoint](/projects/CenterPoint/)         | :star:             |
-| 3D detection      | [TransFusion](/projects/TransFusion/)         | :white_check_mark: |
-| 3D detection      | [BEVFusion](/projects/BEVFusion/)             | :white_check_mark: |
-| 3D segmentation   | [FRNet](/projects/FRNet/)                     | (Reviewing now)    |
-| 2D detection      | [YOLOX](/projects/YOLOX/)                     |                    |
-| 2D detection      | [YOLOX_opt](/projects/YOLOX_opt/)             | :star:             |
-| 2D detection      | [GLIP](/projects/GLIP/)                       |                    |
-| 2D detection      | [SwinTransformer](/projects/SwinTransformer/) |                    |
-| 2D classification | [MobileNetv2](/projects/MobileNetv2/)         | :white_check_mark: |
-| Vision language   | [BLIP-2](/projects/BLIP-2/)                   |                    |
-|                   |                                               |                    |
+| Task              | Model                                 | Use for Autoware   |
+| ----------------- | ------------------------------------- | ------------------ |
+| 3D detection      | [CenterPoint](/projects/CenterPoint/) | :star:             |
+| 3D detection      | [TransFusion](/projects/TransFusion/) | :white_check_mark: |
+| 3D detection      | [BEVFusion](/projects/BEVFusion/)     | :white_check_mark: |
+| 3D segmentation   | [FRNet](/projects/FRNet/)             | (Reviewing now)    |
+| 2D detection      | [YOLOX](/projects/YOLOX/)             |                    |
+| 2D detection      | [YOLOX_opt](/projects/YOLOX_opt/)     | :star:             |
+| 2D detection      | [GLIP](/projects/GLIP/)               |                    |
+| 2D classification | [MobileNetv2](/projects/MobileNetv2/) | :white_check_mark: |
+| Vision language   | [BLIP-2](/projects/BLIP-2/)           |                    |
+|                   |                                       |                    |
 
 - Additional plug-ins
   - [SparseConvolutions](/projects/SparseConvolution/)
diff --git a/projects/SwinTransformer/README.md b/projects/SwinTransformer/README.md
deleted file mode 100644
index 6ca62ad7..00000000
--- a/projects/SwinTransformer/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# SwinTransformer
-
-- [Support priority](https://github.com/tier4/AWML/blob/main/docs/design/autoware_ml_design.md#support-priority): Tier C
-- ROS package: None
-- Supported dataset
-  - [x] NuScenes
-  - [ ] T4dataset
-
-## Abstract
-
-## Results and Models
-
-## Troubleshooting
-
-## Reference
-
-- [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030)
-
-```latex
-@article{liu2021Swin,
-  title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows},
-  author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
-  journal={arXiv preprint arXiv:2103.14030},
-  year={2021}
-}
-```
diff --git a/projects/SwinTransformer/configs/_base_/mask-rcnn_r50_fpn.py b/projects/SwinTransformer/configs/_base_/mask-rcnn_r50_fpn.py
deleted file mode 100644
index 04fd723b..00000000
--- a/projects/SwinTransformer/configs/_base_/mask-rcnn_r50_fpn.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# model settings
-model = dict(
-    type="MaskRCNN",
-    data_preprocessor=dict(
-        type="DetDataPreprocessor",
-        mean=[123.675, 116.28, 103.53],
-        std=[58.395, 57.12, 57.375],
-        bgr_to_rgb=True,
-        pad_mask=True,
-        pad_size_divisor=32,
-    ),
-    backbone=dict(
-        type="ResNet",
-        depth=50,
-        num_stages=4,
-        out_indices=(0, 1, 2, 3),
-        frozen_stages=1,
-        norm_cfg=dict(type="BN", requires_grad=True),
-        norm_eval=True,
-        style="pytorch",
-        init_cfg=dict(type="Pretrained", checkpoint="torchvision://resnet50"),
-    ),
-    neck=dict(type="FPN", in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5),
-    rpn_head=dict(
-        type="RPNHead",
-        in_channels=256,
-        feat_channels=256,
-        anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),
-        bbox_coder=dict(
-            type="DeltaXYWHBBoxCoder", target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[1.0, 1.0, 1.0, 1.0]
-        ),
-        loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=True, loss_weight=1.0),
-        loss_bbox=dict(type="L1Loss", loss_weight=1.0),
-    ),
-    roi_head=dict(
-        type="StandardRoIHead",
-        bbox_roi_extractor=dict(
-            type="SingleRoIExtractor",
-            roi_layer=dict(type="RoIAlign", output_size=7, sampling_ratio=0),
-            out_channels=256,
-            featmap_strides=[4, 8, 16, 32],
-        ),
-        bbox_head=dict(
-            type="Shared2FCBBoxHead",
-            in_channels=256,
-            fc_out_channels=1024,
-            roi_feat_size=7,
-            num_classes=80,
-            bbox_coder=dict(
-                type="DeltaXYWHBBoxCoder", target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.1, 0.1, 0.2, 0.2]
-            ),
-            reg_class_agnostic=False,
-            loss_cls=dict(type="CrossEntropyLoss", use_sigmoid=False, loss_weight=1.0),
-            loss_bbox=dict(type="L1Loss", loss_weight=1.0),
-        ),
-        mask_roi_extractor=dict(
-            type="SingleRoIExtractor",
-            roi_layer=dict(type="RoIAlign", output_size=14, sampling_ratio=0),
-            out_channels=256,
-            featmap_strides=[4, 8, 16, 32],
-        ),
-        mask_head=dict(
-            type="FCNMaskHead",
-            num_convs=4,
-            in_channels=256,
-            conv_out_channels=256,
-            num_classes=80,
-            loss_mask=dict(type="CrossEntropyLoss", use_mask=True, loss_weight=1.0),
-        ),
-    ),
-    # model training and testing settings
-    train_cfg=dict(
-        rpn=dict(
-            assigner=dict(
-                type="MaxIoUAssigner",
-                pos_iou_thr=0.7,
-                neg_iou_thr=0.3,
-                min_pos_iou=0.3,
-                match_low_quality=True,
-                ignore_iof_thr=-1,
-            ),
-            sampler=dict(type="RandomSampler", num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False),
-            allowed_border=-1,
-            pos_weight=-1,
-            debug=False,
-        ),
-        rpn_proposal=dict(nms_pre=2000, max_per_img=1000, nms=dict(type="nms", iou_threshold=0.7), min_bbox_size=0),
-        rcnn=dict(
-            assigner=dict(
-                type="MaxIoUAssigner",
-                pos_iou_thr=0.5,
-                neg_iou_thr=0.5,
-                min_pos_iou=0.5,
-                match_low_quality=True,
-                ignore_iof_thr=-1,
-            ),
-            sampler=dict(type="RandomSampler", num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True),
-            mask_size=28,
-            pos_weight=-1,
-            debug=False,
-        ),
-    ),
-    test_cfg=dict(
-        rpn=dict(nms_pre=1000, max_per_img=1000, nms=dict(type="nms", iou_threshold=0.7), min_bbox_size=0),
-        rcnn=dict(score_thr=0.05, nms=dict(type="nms", iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5),
-    ),
-)
diff --git a/projects/SwinTransformer/configs/_base_/schedule_1x.py b/projects/SwinTransformer/configs/_base_/schedule_1x.py
deleted file mode 100644
index e7b67f51..00000000
--- a/projects/SwinTransformer/configs/_base_/schedule_1x.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# training schedule for 1x
-train_cfg = dict(type="EpochBasedTrainLoop", max_epochs=12, val_interval=1)
-val_cfg = dict(type="ValLoop")
-test_cfg = dict(type="TestLoop")
-
-# learning rate
-param_scheduler = [
-    dict(type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=500),
-    dict(type="MultiStepLR", begin=0, end=12, by_epoch=True, milestones=[8, 11], gamma=0.1),
-]
-
-# optimizer
-optim_wrapper = dict(type="OptimWrapper", optimizer=dict(type="SGD", lr=0.02, momentum=0.9, weight_decay=0.0001))
-
-# Default setting for scaling LR automatically
-#   - `enable` means enable scaling LR automatically
-#       or not by default.
-#   - `base_batch_size` = (8 GPUs) x (2 samples per GPU).
-auto_scale_lr = dict(enable=False, base_batch_size=16)
diff --git a/projects/SwinTransformer/configs/coco/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py b/projects/SwinTransformer/configs/coco/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py
deleted file mode 100644
index b02bd3cd..00000000
--- a/projects/SwinTransformer/configs/coco/mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py
+++ /dev/null
@@ -1,110 +0,0 @@
-_base_ = [
-    "../../../../autoware_ml/configs/detection2d/default_runtime.py",
-    "../_base_/mask-rcnn_r50_fpn.py",
-    "../../../../autoware_ml/configs/detection2d/dataset/coco/coco_instance.py",
-    "../_base_/schedule_1x.py",
-]
-
-pretrained = (
-    "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth"  # noqa
-)
-
-model = dict(
-    type="MaskRCNN",
-    backbone=dict(
-        _delete_=True,
-        type="SwinTransformer",
-        embed_dims=96,
-        depths=[2, 2, 6, 2],
-        num_heads=[3, 6, 12, 24],
-        window_size=7,
-        mlp_ratio=4,
-        qkv_bias=True,
-        qk_scale=None,
-        drop_rate=0.0,
-        attn_drop_rate=0.0,
-        drop_path_rate=0.2,
-        patch_norm=True,
-        out_indices=(0, 1, 2, 3),
-        with_cp=False,
-        convert_weights=True,
-        init_cfg=dict(type="Pretrained", checkpoint=pretrained),
-    ),
-    neck=dict(in_channels=[96, 192, 384, 768]),
-)
-
-# augmentation strategy originates from DETR / Sparse RCNN
-train_pipeline = [
-    dict(type="LoadImageFromFile", backend_args={{_base_.backend_args}}),
-    dict(type="LoadAnnotations", with_bbox=True, with_mask=True),
-    dict(type="RandomFlip", prob=0.5),
-    dict(
-        type="RandomChoice",
-        transforms=[
-            [
-                dict(
-                    type="RandomChoiceResize",
-                    scales=[
-                        (480, 1333),
-                        (512, 1333),
-                        (544, 1333),
-                        (576, 1333),
-                        (608, 1333),
-                        (640, 1333),
-                        (672, 1333),
-                        (704, 1333),
-                        (736, 1333),
-                        (768, 1333),
-                        (800, 1333),
-                    ],
-                    keep_ratio=True,
-                )
-            ],
-            [
-                dict(type="RandomChoiceResize", scales=[(400, 1333), (500, 1333), (600, 1333)], keep_ratio=True),
-                dict(type="RandomCrop", crop_type="absolute_range", crop_size=(384, 600), allow_negative_crop=True),
-                dict(
-                    type="RandomChoiceResize",
-                    scales=[
-                        (480, 1333),
-                        (512, 1333),
-                        (544, 1333),
-                        (576, 1333),
-                        (608, 1333),
-                        (640, 1333),
-                        (672, 1333),
-                        (704, 1333),
-                        (736, 1333),
-                        (768, 1333),
-                        (800, 1333),
-                    ],
-                    keep_ratio=True,
-                ),
-            ],
-        ],
-    ),
-    dict(type="PackDetInputs"),
-]
-train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
-
-max_epochs = 36
-train_cfg = dict(max_epochs=max_epochs)
-
-# learning rate
-param_scheduler = [
-    dict(type="LinearLR", start_factor=0.001, by_epoch=False, begin=0, end=1000),
-    dict(type="MultiStepLR", begin=0, end=max_epochs, by_epoch=True, milestones=[27, 33], gamma=0.1),
-]
-
-# optimizer
-optim_wrapper = dict(
-    type="OptimWrapper",
-    paramwise_cfg=dict(
-        custom_keys={
-            "absolute_pos_embed": dict(decay_mult=0.0),
-            "relative_position_bias_table": dict(decay_mult=0.0),
-            "norm": dict(decay_mult=0.0),
-        }
-    ),
-    optimizer=dict(_delete_=True, type="AdamW", lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05),
-)
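
A note on the config inheritance used by the deleted `mask-rcnn_swin-s-p4-w7_fpn_amp-ms-crop-3x_coco.py` above: in mmengine-style configs, a child dict normally merges key-by-key into the matching dict from `_base_`, and `_delete_=True` instead replaces the base dict wholesale, which is how the SwinTransformer backbone fully displaces ResNet-50 rather than inheriting its keys. The sketch below is a simplified re-implementation of that merge rule for illustration only, not mmengine's actual code; the helper name `merge_config` and the example dicts are made up.

```python
# Minimal sketch of the `_delete_=True` merge semantics; a simplified
# stand-in for mmengine's config merge, not the real implementation.
def merge_config(base: dict, override: dict) -> dict:
    """Recursively merge `override` into `base`; `_delete_=True` in an
    override dict discards the base dict instead of merging into it."""
    override = dict(override)  # avoid mutating the caller's dict
    if override.pop("_delete_", False):
        return override  # drop all base keys (e.g. ResNet's depth/frozen_stages)
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_config(merged[key], value)
        else:
            merged[key] = value
    return merged


# ResNet-50 backbone keys from the base config vs. the Swin override above
base_backbone = dict(type="ResNet", depth=50, frozen_stages=1)
swin_override = dict(_delete_=True, type="SwinTransformer", embed_dims=96)
print(merge_config(base_backbone, swin_override))
# {'type': 'SwinTransformer', 'embed_dims': 96} -- no ResNet keys survive
```

Because no ResNet keys survive the override, the neck's `in_channels` must also be overridden from the ResNet strides `[256, 512, 1024, 2048]` to the Swin stage widths `[96, 192, 384, 768]`, as the deleted config does.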
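Similarly, the `auto_scale_lr` block in the deleted `schedule_1x.py` (disabled there via `enable=False`) refers to the linear LR scaling rule: when enabled, the learning rate is multiplied by the ratio of the actual total batch size to `base_batch_size`, where 16 = (8 GPUs) x (2 samples per GPU) per the config's own comment. A hedged one-liner showing the arithmetic; `scale_lr` is a hypothetical helper for illustration, not a real mmengine API:

```python
# Hypothetical helper illustrating the linear scaling rule behind
# `auto_scale_lr`; simplified, not mmengine's implementation.
def scale_lr(base_lr: float, num_gpus: int, samples_per_gpu: int, base_batch_size: int = 16) -> float:
    # Scale lr proportionally to the actual total batch size.
    return base_lr * (num_gpus * samples_per_gpu) / base_batch_size


# With the schedule's SGD base lr of 0.02 on a 4-GPU x 2-sample run (batch 8):
print(scale_lr(0.02, num_gpus=4, samples_per_gpu=2))  # 0.01
```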