From 3dbb29aecf2fedfa497a429d3032d0d72ed74226 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 5 Jul 2022 17:01:16 -0700 Subject: [PATCH 01/22] bev --- datasets/__init__.py | 9 +- datasets/coco.py | 41 ++++++-- datasets/kitti.py | 207 +++++++++++++++++++++++++++++++++++++++++ datasets/transforms.py | 3 +- kitti2coco.py | 71 ++++++++++++++ kittibev | 37 ++++++++ main.py | 39 +++++++- models/detr.py | 36 +++++-- run_KITTI_bev.sh | 8 ++ 9 files changed, 426 insertions(+), 25 deletions(-) create mode 100644 datasets/kitti.py create mode 100644 kitti2coco.py create mode 100644 kittibev create mode 100644 run_KITTI_bev.sh diff --git a/datasets/__init__.py b/datasets/__init__.py index 571b126ea..c49b0861c 100644 --- a/datasets/__init__.py +++ b/datasets/__init__.py @@ -3,7 +3,8 @@ import torchvision from .coco import build as build_coco - +from .coco import build_kitti_coco +from .kitti import build as build_kitti def get_coco_api_from_dataset(dataset): for _ in range(10): @@ -18,6 +19,12 @@ def get_coco_api_from_dataset(dataset): def build_dataset(image_set, args): if args.dataset_file == 'coco': return build_coco(image_set, args) + + if args.dataset_file == 'kitti_coco': + return build_kitti_coco(image_set, args) + + # if args.dataset_file == 'kitti': + # return build_kitti(image_set, args) if args.dataset_file == 'coco_panoptic': # to avoid making panopticapi required for coco from .coco_panoptic import build as build_coco_panoptic diff --git a/datasets/coco.py b/datasets/coco.py index 93a436ba6..bdd39196f 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -5,20 +5,24 @@ Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py """ from pathlib import Path +# from matplotlib import image import torch import torch.utils.data import torchvision from pycocotools import mask as coco_mask +import json import datasets.transforms as T class CocoDetection(torchvision.datasets.CocoDetection): - def __init__(self, img_folder, ann_file, transforms, return_masks): + def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = None): super(CocoDetection, self).__init__(img_folder, ann_file) self._transforms = transforms self.prepare = ConvertCocoPolysToMask(return_masks) + if bev_data is not None: + self.bev_data = json.load(open(bev_data)) def __getitem__(self, idx): img, target = super(CocoDetection, self).__getitem__(idx) @@ -27,6 +31,8 @@ def __getitem__(self, idx): img, target = self.prepare(img, target) if self._transforms is not None: img, target = self._transforms(img, target) + target['bev'] = torch.tensor(self.bev_data[str(image_id)]) + assert target['bev'].size()[0] == target['boxes'].size()[0] return img, target @@ -63,7 +69,12 @@ def __call__(self, image, target): boxes = [obj["bbox"] for obj in anno] # guard against no boxes via resizing - boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + # boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + try: + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1,4) + except: + print(image_id) + print(boxes) boxes[:, 2:] += boxes[:, :2] boxes[:, 0::2].clamp_(min=0, max=w) boxes[:, 1::2].clamp_(min=0, max=h) @@ -123,15 +134,15 @@ def make_coco_transforms(image_set): if image_set == 'train': return T.Compose([ - T.RandomHorizontalFlip(), - T.RandomSelect( + # T.RandomHorizontalFlip(), + # T.RandomSelect( T.RandomResize(scales, max_size=1333), - T.Compose([ - T.RandomResize([400, 500, 600]), - 
T.RandomSizeCrop(384, 600), - T.RandomResize(scales, max_size=1333), - ]) - ), + # T.Compose([ + # T.RandomResize([400, 500, 600]), + # T.RandomSizeCrop(384, 600), + # T.RandomResize(scales, max_size=1333), + # ]) + # ), normalize, ]) @@ -156,3 +167,13 @@ def build(image_set, args): img_folder, ann_file = PATHS[image_set] dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) return dataset + +def build_kitti_coco(image_set, args): + anno_root = Path("/srip-vol/datasets/KITTI3D/coco") + PATHS = { + "train": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), + "val": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), + } + BEV_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_%s.json"%(image_set) + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA) \ No newline at end of file diff --git a/datasets/kitti.py b/datasets/kitti.py new file mode 100644 index 000000000..49af042d0 --- /dev/null +++ b/datasets/kitti.py @@ -0,0 +1,207 @@ +""" +KITTI dataset ckass for DeTR +""" +import os +import os.path as osp +from torch.utils.data import Dataset +import torch +import numpy as np +from PIL import Image +import datasets.transforms as T + +SPLIT = ['train', 'val', 'test'] + +class KITTIDataset(Dataset): + def __init__(self, base_path = '/srip-vol/datasets/KITTI3D', split = 'train', transform = None): + assert split in SPLIT + self.split = split + self.base_path = base_path + self.isTest = self.split == 'test' + self.folder_name = 'testing' if self.split == 'test' else 'training' + + # Read imageset with index + image_set_path = osp.join(self.base_path, 'ImageSets', self.split + '.txt') + lines = open(image_set_path).readlines() + self.image_set = [line.strip() for line in lines] + # Define transform + self._transforms = transform + # Set-up paths + self.image_path = osp.join(base_path, self.folder_name, 'image_2') + if not self.isTest: + self.label_path = osp.join(base_path, self.folder_name, 'label_2') + + self.KITTI_CLASS = ['Car', 'Pedestrian', 'Cyclist'] + self.prepare = ConvertCocoPolysToMask() + + def __len__(self): + return len(self.image_set) + + def __getitem__(self, idx): + ''' + Return a dict with following fields + 'image' - image as a numpy array + 'label' - list of dicts each with label info parsed + ''' + data_idx = self.image_set[idx] + data = {} + # img = np.asarray(Image.open(osp.join(self.image_path, data_idx + '.png'))) + img = Image.open(osp.join(self.image_path, data_idx + '.png')) + if not self.isTest: + label = self.__read_label_data(data_idx) + + bbox_data = [] + class_data = [] + for i in range(len(label)): + bbox_data.append(label[i]['bbox_2d']) + class_data.append(label[i]['class_id']) + + target = {} + target['boxes'] = torch.as_tensor(bbox_data) + target['labels'] = torch.as_tensor(class_data) + target['image_id'] = int(data_idx) + + # Prepare dataset + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + def __read_label_data(self, idx): + ''' + Function to read label data from text file + ''' + lines = open(osp.join(self.label_path, idx + '.txt')).readlines() + label = [] + for line in lines: + data = line.split(' ') + if data[0] in self.KITTI_CLASS: + label.append(KITTI_label(data[0], + float(data[1]) , 
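# KITTI label columns consumed positionally by KITTI_label below:
# data[0]=type, data[1]=truncated, data[2]=occluded, data[3]=alpha,
# data[4:8]=2D bbox (xmin, ymin, xmax, ymax), data[8:11]=dimensions (h, w, l),
# data[11:14]=location (x, y, z) in camera coords, data[14]=rotation_y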
float(data[2]) , float(data[3]), + float(data[4]) , float(data[5]) , float(data[6]), + float(data[7]) , float(data[8]) , float(data[9]), + float(data[10]), float(data[11]), float(data[12]), + float(data[13]), float(data[14]))) + return label + + +KITTI_CLASS = {'Car': 1, 'Pedestrian': 2, 'Cyclist' : 3} + +def KITTI_label(class_name, truncated, occluded, alpha, + bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, + dim_h, dim_w, dim_l, x_c, y_c, z_c, rot_y, score = 0): + ''' + To create a label dict + Note - score field added at last with default val 0 + ''' + label_info = {} + label_info['class_id'] = KITTI_CLASS[class_name] + label_info['truncated'] = truncated + label_info['occluded'] = occluded + label_info['alpha'] = alpha + # label_info['bbox_2d'] = [bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax] + label_info['bbox_2d'] = [(bbox_xmax + bbox_xmin)/2, (bbox_ymax + bbox_ymin)/2, bbox_xmax-bbox_xmin, bbox_ymax-bbox_ymin] + label_info['dim'] = [dim_h, dim_w, dim_l] + label_info['loc'] = [x_c, y_c, z_c] + label_info['rot_y'] = rot_y + label_info['score'] = score + + return label_info + +def make_coco_transforms(image_set): + + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + T.RandomResize(scales, max_size=1333), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=1333), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.RandomResize([800], max_size=1333), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + +class ConvertCocoPolysToMask(object): + # def __init__(self, return_masks=False): + # self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + # anno = target["annotations"] + # anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + # boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = target['boxes']#torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + # classes = [obj["category_id"] for obj in anno] + classes = target['labels']# torch.tensor(classes, dtype=torch.int64) + + # if self.return_masks: + # segmentations = [obj["segmentation"] for obj in anno] + # masks = convert_coco_poly_to_mask(segmentations, h, w) + + # keypoints = None + # if anno and "keypoints" in anno[0]: + # keypoints = [obj["keypoints"] for obj in anno] + # keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + # num_keypoints = keypoints.shape[0] + # if num_keypoints: + # # keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + # classes = classes[keep] + # if self.return_masks: + # masks = masks[keep] + # if keypoints is not None: + # keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["image_id"] = image_id + # if self.return_masks: + # target["masks"] = masks + # target["image_id"] = image_id + # if keypoints is not None: + # target["keypoints"] = keypoints + + # for conversion to coco api + # area = torch.tensor([obj["area"] for obj in anno]) + # iscrowd = 
torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + # target["area"] = [] #area[keep] + # target["iscrowd"] = [] #iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + + return image, target + +def build(image_set, args): + # base_path = '/srip-vol/datasets/KITTI3D' + dataset = KITTIDataset(base_path = args.kitti_path, split = image_set, transform = make_coco_transforms(image_set)) + return dataset \ No newline at end of file diff --git a/datasets/transforms.py b/datasets/transforms.py index 063585789..b771f4247 100644 --- a/datasets/transforms.py +++ b/datasets/transforms.py @@ -22,7 +22,8 @@ def crop(image, target, region): # should we do something wrt the original size? target["size"] = torch.tensor([h, w]) - fields = ["labels", "area", "iscrowd"] + # fields = ["labels", "area", "iscrowd"] + fields = ["labels"] if "boxes" in target: boxes = target["boxes"] diff --git a/kitti2coco.py b/kitti2coco.py new file mode 100644 index 000000000..ff9ede022 --- /dev/null +++ b/kitti2coco.py @@ -0,0 +1,71 @@ +from sahi.utils.coco import Coco, CocoCategory, CocoImage, CocoAnnotation +from sahi.utils.file import save_json +from PIL import Image +import os.path as osp +from math import ceil +import os +from tqdm import tqdm + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +def decode(label): + ''' + Parse line of kitti label text file + Refer - https://voxel51.com/docs/fiftyone/user_guide/dataset_creation/datasets.html#kittidetectiondataset + ''' + data = label.split(' ') + class_name = data[0] + xmin = float(data[4]) + ymin = float(data[5]) + xmax = float(data[6]) + ymax = float(data[7]) + + # top left cornet and dimensions + # Refer - https://cocodataset.org/#format-data + bbox = [xmin, ymin, ceil(xmax-xmin), ceil(ymax-ymin)] + class_id = KITTI_CLASS.get(class_name, -1) + + return class_id, class_name, bbox + + +# Coco object +coco = Coco() + +# Add categories +coco.add_category(CocoCategory(id=0, name='Car')) +coco.add_category(CocoCategory(id=1, name='Pedastrian')) +coco.add_category(CocoCategory(id=2, name='Cyclist')) + +split = 'val' +assert split in ['train', 'val'] + +# Add paths +imageset_path = osp.join('/srip-vol/datasets/KITTI3D/ImageSets', split + '.txt') +img_folder_path = osp.join('/srip-vol/datasets/KITTI3D/training', 'image_2') +ann_folder_path = osp.join('/srip-vol/datasets/KITTI3D/training', 'label_2') + +idx = open(imageset_path, 'r').readlines() + +for i in tqdm(idx): + i = i[:-1] + img_path = osp.join(img_folder_path, i +'.png') + lab_path = osp.join(ann_folder_path, i +'.txt') + + width, height = Image.open(img_path).size + coco_image = CocoImage(file_name=img_path, height=height, width=width) + + labels = open(lab_path, 'r').readlines() + for l in labels: + category_id, category_name, bbox = decode(l) + if category_id == -1: + continue + coco_image.add_annotation(CocoAnnotation( + bbox=bbox, + category_id=category_id, + category_name=category_name)) + + coco.add_image(coco_image) + +save_path = '/srip-vol/parth/detr/kitti_%s.json'%(split) +# save_path = '/srip-vol/parth/detr/try.json' +save_json(data=coco.json, save_path=save_path) \ No newline at end of file diff --git a/kittibev b/kittibev new file mode 100644 index 000000000..03f908e36 --- /dev/null +++ b/kittibev @@ -0,0 +1,37 @@ +# ----------------------------------------------------------------------------------- +# To generate bev data +# 
----------------------------------------------------------------------------------- +import os +import os.path as osp +import json +# import torch +from tqdm import tqdm + +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bev_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +for i in tqdm(range(len(data['images']))): + # Label path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'label_2' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + label_path = '/' + osp.join(*img_path) + + # Read annotations and assembler point depth value + lines = open(label_path).readlines() + bev = [] + for line in lines: + label_data = line.split(' ') + if(KITTI_CLASS.get(label_data[0],-1) == -1): + continue + x_c = float(label_data[11]) + z_c = float(label_data[13]) + + bev.append([x_c, z_c]) + + # Save in dict + bev_data[i+1] = bev #torch.tensor(depth) diff --git a/main.py b/main.py index e5f9eff80..ced8daf46 100644 --- a/main.py +++ b/main.py @@ -16,21 +16,27 @@ from engine import evaluate, train_one_epoch from models import build_model +import PIL.Image as Image +from torchvision import transforms def get_args_parser(): parser = argparse.ArgumentParser('Set transformer detector', add_help=False) parser.add_argument('--lr', default=1e-4, type=float) - parser.add_argument('--lr_backbone', default=1e-5, type=float) + # parser.add_argument('--lr_backbone', default=1e-5, type=float) + # freezing the backbone + parser.add_argument('--lr_backbone', default=0, type=float) parser.add_argument('--batch_size', default=2, type=int) parser.add_argument('--weight_decay', default=1e-4, type=float) parser.add_argument('--epochs', default=300, type=int) parser.add_argument('--lr_drop', default=200, type=int) parser.add_argument('--clip_max_norm', default=0.1, type=float, help='gradient clipping max norm') - + parser.add_argument('--num_classes', default=3, type=int, help = "max class id. Refer comment at end of detr.py") # Model parameters parser.add_argument('--frozen_weights', type=str, default=None, help="Path to the pretrained model. 
If set, only the mask head will be trained") + # parser.add_argument('--bev_regression', action='store_false', + # help="Add flag to regress bev directly") # * Backbone parser.add_argument('--backbone', default='resnet50', type=str, help="Name of the convolutional backbone to use") @@ -75,26 +81,34 @@ def get_args_parser(): parser.add_argument('--dice_loss_coef', default=1, type=float) parser.add_argument('--bbox_loss_coef', default=5, type=float) parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--bev_loss_coef', default=2, type=float) parser.add_argument('--eos_coef', default=0.1, type=float, help="Relative classification weight of the no-object class") # dataset parameters - parser.add_argument('--dataset_file', default='coco') + # parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--dataset_file', default='kitti_coco') parser.add_argument('--coco_path', type=str) parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--kitti_path', default='/srip-vol/datasets/KITTI3D/', type=str) parser.add_argument('--remove_difficult', action='store_true') - parser.add_argument('--output_dir', default='', + # parser.add_argument('--output_dir', default='', + parser.add_argument('--output_dir', default='output_logs_local', help='path where to save, empty for no saving') parser.add_argument('--device', default='cuda', help='device to use for training / testing') parser.add_argument('--seed', default=42, type=int) - parser.add_argument('--resume', default='', help='resume from checkpoint') + # parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--resume', default='pretrained/detr-r101-dc5-a2e86def.pth', help='resume from checkpoint') parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch') parser.add_argument('--eval', action='store_true') + parser.add_argument('--test', action='store_true') + parser.add_argument('--num_workers', default=2, type=int) + parser.add_argument('--test_image', default = None, type = str, help = 'Path to image for testing') # distributed training parameters parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') @@ -110,6 +124,7 @@ def main(args): assert args.masks, "Frozen training is meant for segmentation only" print(args) + args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device(args.device) # fix the seed for reproducibility @@ -174,8 +189,22 @@ def main(args): checkpoint = torch.hub.load_state_dict_from_url( args.resume, map_location='cpu', check_hash=True) else: + print('loading pretrianed weights.....') checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) + del checkpoint["model"]["class_embed.weight"] + del checkpoint["model"]["class_embed.bias"] + # Remove box weights + keys_to_delete = [] + for key in checkpoint["model"]: + if 'box_embed' in key: + print(key) + keys_to_delete.append(key) + + for key in keys_to_delete: + del checkpoint["model"][key] + + model_without_ddp.load_state_dict(checkpoint['model'], strict = False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) diff --git a/models/detr.py b/models/detr.py index 23c2376da..9011bcc20 100644 --- a/models/detr.py +++ b/models/detr.py @@ -40,6 +40,7 
@@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) self.backbone = backbone self.aux_loss = aux_loss + self.bev_embed = nn.Linear(hidden_dim, 2) def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: @@ -66,18 +67,19 @@ def forward(self, samples: NestedTensor): outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() - out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + outputs_bev = self.bev_embed(hs) + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'pred_bev': outputs_bev[-1]} if self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_bev) return out @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): + def _set_aux_loss(self, outputs_class, outputs_coord, outputs_bev): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. - return [{'pred_logits': a, 'pred_boxes': b} - for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + return [{'pred_logits': a, 'pred_boxes': b, 'pred_bev': c} + for a, b, c in zip(outputs_class[:-1], outputs_coord[:-1], outputs_bev[:-1])] class SetCriterion(nn.Module): @@ -190,6 +192,15 @@ def loss_masks(self, outputs, targets, indices, num_boxes): } return losses + def loss_bev(self, outputs, targets, indices, num_boxes): + assert 'pred_bev' in outputs + idx = self._get_src_permutation_idx(indices) + src_bev = outputs['pred_bev'][idx].squeeze() + target_bev = torch.cat([t['bev'][i] for t, (_, i) in zip(targets, indices)]) + loss = F.mse_loss(src_bev, target_bev) + losses = {'loss_bev' : loss} + return losses + def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) @@ -203,11 +214,18 @@ def _get_tgt_permutation_idx(self, indices): return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + # loss_map = { + # 'labels': self.loss_labels, + # 'cardinality': self.loss_cardinality, + # 'boxes': self.loss_boxes, + # 'masks': self.loss_masks + # } loss_map = { 'labels': self.loss_labels, 'cardinality': self.loss_cardinality, 'boxes': self.loss_boxes, - 'masks': self.loss_masks + 'masks': self.loss_masks, + 'bev': self.loss_bev } assert loss in loss_map, f'do you really want to compute {loss} loss?' return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) @@ -310,7 +328,8 @@ def build(args): # you should pass `num_classes` to be 2 (max_obj_id + 1). 
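# Here --num_classes holds the max KITTI class id (Car/Pedestrian/Cyclist mapped to 1/2/3, default 3),
# so the non-COCO branch below passes max_obj_id + 1 = args.num_classes + 1.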
# For more details on this, check the following discussion # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 - num_classes = 20 if args.dataset_file != 'coco' else 91 + # num_classes = 20 if args.dataset_file != 'coco' else 91 + num_classes = arg.num_classes+1 if args.dataset_file != 'coco' else 91 if args.dataset_file == "coco_panoptic": # for panoptic, we just add a num_classes that is large enough to hold # max_obj_id + 1, but the exact value doesn't really matter @@ -333,6 +352,7 @@ def build(args): matcher = build_matcher(args) weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} weight_dict['loss_giou'] = args.giou_loss_coef + weight_dict['loss_bev'] = args.bev_loss_coef if args.masks: weight_dict["loss_mask"] = args.mask_loss_coef weight_dict["loss_dice"] = args.dice_loss_coef @@ -343,7 +363,7 @@ def build(args): aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) - losses = ['labels', 'boxes', 'cardinality'] + losses = ['labels', 'boxes', 'cardinality', 'bev'] if args.masks: losses += ["masks"] criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, diff --git a/run_KITTI_bev.sh b/run_KITTI_bev.sh new file mode 100644 index 000000000..cbcf59560 --- /dev/null +++ b/run_KITTI_bev.sh @@ -0,0 +1,8 @@ +echo $(date) +conda config --append envs_dirs /srip-vol/parth/myenvs +cd /srip-vol/yuze/detr/ +python=/srip-vol/parth/myenvs/detr/bin/python + +$python main.py + +echo "done" \ No newline at end of file From 4cd93ede0b98422bbb0f3dfbc967113e2de50a6f Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 5 Jul 2022 21:54:33 -0700 Subject: [PATCH 02/22] correct kittibev.py --- kittibev => kittibev.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kittibev => kittibev.py (100%) diff --git a/kittibev b/kittibev.py similarity index 100% rename from kittibev rename to kittibev.py From 8fa2460fd0189187fcc4e52d52be4f16a98e6469 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 7 Jul 2022 17:46:37 -0700 Subject: [PATCH 03/22] pretrained model loading problem fixed --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index ced8daf46..d61f229a2 100644 --- a/main.py +++ b/main.py @@ -191,7 +191,7 @@ def main(args): else: print('loading pretrianed weights.....') checkpoint = torch.load(args.resume, map_location='cpu') - model_without_ddp.load_state_dict(checkpoint['model']) + # model_without_ddp.load_state_dict(checkpoint['model']) del checkpoint["model"]["class_embed.weight"] del checkpoint["model"]["class_embed.bias"] # Remove box weights From 182e8714c629e2ef9aa4b26f3bc0245b8705ccbc Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 7 Jul 2022 18:25:47 -0700 Subject: [PATCH 04/22] json file for bev --- kittibev.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kittibev.py b/kittibev.py index 03f908e36..c59d96466 100644 --- a/kittibev.py +++ b/kittibev.py @@ -35,3 +35,8 @@ # Save in dict bev_data[i+1] = bev #torch.tensor(depth) + +# Save bev data as json file +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(bev_data, outfile) From a8dab26a752742c3ad2f3c9844f62ca18429cbcf Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 11 Jul 2022 18:21:56 -0700 Subject: [PATCH 
05/22] json file for bev --- kittidim.py | 42 ++++++++++++++++++++++++++++++++++++++++++ kittihbins.py | 42 ++++++++++++++++++++++++++++++++++++++++++ kittihres.py | 0 3 files changed, 84 insertions(+) create mode 100644 kittidim.py create mode 100644 kittihbins.py create mode 100644 kittihres.py diff --git a/kittidim.py b/kittidim.py new file mode 100644 index 000000000..006e44494 --- /dev/null +++ b/kittidim.py @@ -0,0 +1,42 @@ +# ----------------------------------------------------------------------------------- +# To generate bev dimension data +# ----------------------------------------------------------------------------------- +import os +import os.path as osp +import json +# import torch +from tqdm import tqdm + +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bevdim_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +for i in tqdm(range(len(data['images']))): + # Label path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'label_2' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + label_path = '/' + osp.join(*img_path) + + # Read annotations and assembler point depth value + lines = open(label_path).readlines() + bevdim = [] + for line in lines: + label_data = line.split(' ') + if(KITTI_CLASS.get(label_data[0],-1) == -1): + continue + dim_w = float(label_data[9]) + dim_l = float(label_data[10]) + + bevdim.append([dim_w, dim_l]) + + # Save in dict + bevdim_data[i+1] = bevdim #torch.tensor(depth) + +# Save bev data as json file +output_path = '/srip-vol/datasets/KITTI3D/coco/bevdim_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(bevdim_data, outfile) diff --git a/kittihbins.py b/kittihbins.py new file mode 100644 index 000000000..b70cd38eb --- /dev/null +++ b/kittihbins.py @@ -0,0 +1,42 @@ +# ----------------------------------------------------------------------------------- +# To generate bev +# ----------------------------------------------------------------------------------- +import os +import os.path as osp +import json +# import torch +from tqdm import tqdm + +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bev_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +for i in tqdm(range(len(data['images']))): + # Label path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'label_2' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + label_path = '/' + osp.join(*img_path) + + # Read annotations and assembler point depth value + lines = open(label_path).readlines() + bev = [] + for line in lines: + label_data = line.split(' ') + if(KITTI_CLASS.get(label_data[0],-1) == -1): + continue + x_c = float(label_data[11]) + z_c = float(label_data[13]) + + bev.append([x_c, z_c]) + + # Save in dict + bev_data[i+1] = bev #torch.tensor(depth) + +# Save bev data as json file +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(bev_data, outfile) diff --git a/kittihres.py b/kittihres.py new file mode 100644 index 000000000..e69de29bb From 3379328e6d4b880583b8c095e468faca04784fec Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 15:46:17 -0700 Subject: [PATCH 06/22] add kitti angle --- kittihbins.py | 83 ++++++++- kittihres.py | 0 util/kitti_utils.py | 410 ++++++++++++++++++++++++++++++++++++++++++++ 
util/utils.py | 123 +++++++++++++ 4 files changed, 608 insertions(+), 8 deletions(-) delete mode 100644 kittihres.py create mode 100644 util/kitti_utils.py create mode 100644 util/utils.py diff --git a/kittihbins.py b/kittihbins.py index b70cd38eb..0d0cd3d9a 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -6,11 +6,34 @@ import json # import torch from tqdm import tqdm +import numpy as np +import cv2 + +def ry2alpha(ry, u, cu, fu): + alpha = ry - np.arctan2(u - cu, fu) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + +def angle2class(angle): + ''' Convert continuous angle to discrete class and residual. ''' + angle = angle % (2 * np.pi) + assert (angle >= 0 and angle <= 2 * np.pi) + angle_per_class = 2 * np.pi / float(12) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle split = 'val' data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) data = json.load(open(data_path)) -bev_data = {} +heading_bins_data = {} +heading_ress_data = {} KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} @@ -20,23 +43,67 @@ img_path[-2] = 'label_2' img_path[-1] = img_path[-1].split('.')[0] + '.txt' label_path = '/' + osp.join(*img_path) + + # Calibration path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'calib' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + calib_path = '/' + osp.join(*img_path) # Read annotations and assembler point depth value lines = open(label_path).readlines() - bev = [] + lines_calib = open(calib_path).readlines() + + obj = lines_calib[2].strip().split(' ')[1:] + P2 = np.array(obj, dtype=np.float32) + # obj = lines_calib[3].strip().split(' ')[1:] + # P3 = np.array(obj, dtype=np.float32) + # obj = lines_calib[4].strip().split(' ')[1:] + # R0 = np.array(obj, dtype=np.float32) + # obj = lines_calib[5].strip().split(' ')[1:] + # Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + P2.reshape(3, 4) + # P3.reshape(3, 4) + # R0.reshape(3, 3) + # Tr_velo_to_cam.reshape(3, 4) + + # cv = P2[1, 2] + # fv = P2[1, 1] + # tx = P2[0, 3] / (-fu) + # ty = P2[1, 3] / (-fv) + cu = P2[0, 2] + fu = P2[0, 0] + + heading_bins = [] + heading_ress = [] for line in lines: label_data = line.split(' ') if(KITTI_CLASS.get(label_data[0],-1) == -1): continue - x_c = float(label_data[11]) - z_c = float(label_data[13]) + bbx0 = float(label_data[4]) + bbx2 = float(label_data[6]) + ry = float(label_data[14]) - bev.append([x_c, z_c]) + heading_angle = ry2alpha(ry, (bbx0 + bbx2) / 2, cu, fu) + if heading_angle > np.pi: heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: heading_angle += 2 * np.pi + heading_bin, heading_res = angle2class(heading_angle) + # x_c = float(label_data[11]) + # z_c = float(label_data[13]) + + heading_bins.append([heading_bin]) + heading_ress.append([heading_res]) # Save in dict - bev_data[i+1] = bev #torch.tensor(depth) + heading_bins_data[i+1] = heading_bins #torch.tensor(depth) + heading_ress_data[i+1] = heading_ress # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/bev_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(heading_bins_data, outfile) + +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json' %(split) with open(output_path, "w") as outfile: - 
json.dump(bev_data, outfile) + json.dump(heading_ress_data, outfile) diff --git a/kittihres.py b/kittihres.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/util/kitti_utils.py b/util/kitti_utils.py new file mode 100644 index 000000000..1200530d4 --- /dev/null +++ b/util/kitti_utils.py @@ -0,0 +1,410 @@ +import numpy as np +import cv2 + +################ Object3D ################## + +def get_objects_from_label(label_file): + with open(label_file, 'r') as f: + lines = f.readlines() + objects = [Object3d(line) for line in lines] + return objects + + +class Object3d(object): + def __init__(self, line): + label = line.strip().split(' ') + self.src = line + self.cls_type = label[0] + self.trucation = float(label[1]) + self.occlusion = float(label[2]) # 0:fully visible 1:partly occluded 2:largely occluded 3:unknown + self.alpha = float(label[3]) + self.box2d = np.array((float(label[4]), float(label[5]), float(label[6]), float(label[7])), dtype=np.float32) + self.h = float(label[8]) + self.w = float(label[9]) + self.l = float(label[10]) + self.pos = np.array((float(label[11]), float(label[12]), float(label[13])), dtype=np.float32) + self.dis_to_cam = np.linalg.norm(self.pos) + self.ry = float(label[14]) + self.score = float(label[15]) if label.__len__() == 16 else -1.0 + self.level_str = None + self.level = self.get_obj_level() + + + def get_obj_level(self): + height = float(self.box2d[3]) - float(self.box2d[1]) + 1 + + if self.trucation == -1: + self.level_str = 'DontCare' + return 0 + + if height >= 40 and self.trucation <= 0.15 and self.occlusion <= 0: + self.level_str = 'Easy' + return 1 # Easy + elif height >= 25 and self.trucation <= 0.3 and self.occlusion <= 1: + self.level_str = 'Moderate' + return 2 # Moderate + elif height >= 25 and self.trucation <= 0.5 and self.occlusion <= 2: + self.level_str = 'Hard' + return 3 # Hard + else: + self.level_str = 'UnKnown' + return 4 + + + def generate_corners3d(self): + """ + generate corners3d representation for this object + :return corners_3d: (8, 3) corners of box3d in camera coord + """ + l, h, w = self.l, self.h, self.w + x_corners = [l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2] + y_corners = [0, 0, 0, 0, -h, -h, -h, -h] + z_corners = [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2] + + R = np.array([[np.cos(self.ry), 0, np.sin(self.ry)], + [0, 1, 0], + [-np.sin(self.ry), 0, np.cos(self.ry)]]) + corners3d = np.vstack([x_corners, y_corners, z_corners]) # (3, 8) + corners3d = np.dot(R, corners3d).T + corners3d = corners3d + self.pos + return corners3d + + + def to_bev_box2d(self, oblique=True, voxel_size=0.1): + """ + :param bev_shape: (2) for bev shape (h, w), => (y_max, x_max) in image + :param voxel_size: float, 0.1m + :param oblique: + :return: box2d (4, 2)/ (4) in image coordinate + """ + if oblique: + corners3d = self.generate_corners3d() + xz_corners = corners3d[0:4, [0, 2]] + box2d = np.zeros((4, 2), dtype=np.int32) + box2d[:, 0] = ((xz_corners[:, 0] - Object3d.MIN_XZ[0]) / voxel_size).astype(np.int32) + box2d[:, 1] = Object3d.BEV_SHAPE[0] - 1 - ((xz_corners[:, 1] - Object3d.MIN_XZ[1]) / voxel_size).astype(np.int32) + box2d[:, 0] = np.clip(box2d[:, 0], 0, Object3d.BEV_SHAPE[1]) + box2d[:, 1] = np.clip(box2d[:, 1], 0, Object3d.BEV_SHAPE[0]) + else: + box2d = np.zeros(4, dtype=np.int32) + # discrete_center = np.floor((self.pos / voxel_size)).astype(np.int32) + cu = np.floor((self.pos[0] - Object3d.MIN_XZ[0]) / voxel_size).astype(np.int32) + cv = Object3d.BEV_SHAPE[0] - 1 - ((self.pos[2] - 
Object3d.MIN_XZ[1]) / voxel_size).astype(np.int32) + half_l, half_w = int(self.l / voxel_size / 2), int(self.w / voxel_size / 2) + box2d[0], box2d[1] = cu - half_l, cv - half_w + box2d[2], box2d[3] = cu + half_l, cv + half_w + + return box2d + + + def to_str(self): + print_str = '%s %.3f %.3f %.3f box2d: %s hwl: [%.3f %.3f %.3f] pos: %s ry: %.3f' \ + % (self.cls_type, self.trucation, self.occlusion, self.alpha, self.box2d, self.h, self.w, self.l, + self.pos, self.ry) + return print_str + + + def to_kitti_format(self): + kitti_str = '%s %.2f %d %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f' \ + % (self.cls_type, self.trucation, int(self.occlusion), self.alpha, self.box2d[0], self.box2d[1], + self.box2d[2], self.box2d[3], self.h, self.w, self.l, self.pos[0], self.pos[1], self.pos[2], + self.ry) + return kitti_str + + + +################### calibration ################### + +def get_calib_from_file(calib_file): + with open(calib_file) as f: + lines = f.readlines() + + obj = lines[2].strip().split(' ')[1:] + P2 = np.array(obj, dtype=np.float32) + obj = lines[3].strip().split(' ')[1:] + P3 = np.array(obj, dtype=np.float32) + obj = lines[4].strip().split(' ')[1:] + R0 = np.array(obj, dtype=np.float32) + obj = lines[5].strip().split(' ')[1:] + Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + return {'P2': P2.reshape(3, 4), + 'P3': P3.reshape(3, 4), + 'R0': R0.reshape(3, 3), + 'Tr_velo2cam': Tr_velo_to_cam.reshape(3, 4)} + + +class Calibration(object): + def __init__(self, calib_file): + if isinstance(calib_file, str): + calib = get_calib_from_file(calib_file) + else: + calib = calib_file + + self.P2 = calib['P2'] # 3 x 4 + self.R0 = calib['R0'] # 3 x 3 + self.V2C = calib['Tr_velo2cam'] # 3 x 4 + self.C2V = self.inverse_rigid_trans(self.V2C) + + # Camera intrinsics and extrinsics + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + + def cart_to_hom(self, pts): + """ + :param pts: (N, 3 or 2) + :return pts_hom: (N, 4 or 3) + """ + pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + return pts_hom + + def lidar_to_rect(self, pts_lidar): + """ + :param pts_lidar: (N, 3) + :return pts_rect: (N, 3) + """ + pts_lidar_hom = self.cart_to_hom(pts_lidar) + pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) + # pts_rect = reduce(np.dot, (pts_lidar_hom, self.V2C.T, self.R0.T)) + return pts_rect + + def rect_to_lidar(self, pts_rect): + pts_ref = np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_rect))) + pts_ref = self.cart_to_hom(pts_ref) # nx4 + return np.dot(pts_ref, np.transpose(self.C2V)) + + def rect_to_img(self, pts_rect): + """ + :param pts_rect: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect_hom = self.cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + def lidar_to_img(self, pts_lidar): + """ + :param pts_lidar: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect = self.lidar_to_rect(pts_lidar) + pts_img, pts_depth = self.rect_to_img(pts_rect) + return pts_img, pts_depth + + def img_to_rect(self, u, v, depth_rect): + """ + :param u: (N) + :param v: (N) + :param depth_rect: (N) + :return: + """ + x = ((u - self.cu) * depth_rect) / self.fu + self.tx + y = ((v - self.cv) * depth_rect) / self.fv + self.ty + 
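# stack the recovered x, y and the given depth into an (N, 3) array of points in the rectified camera frame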
pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) + return pts_rect + + def depthmap_to_rect(self, depth_map): + """ + :param depth_map: (H, W), depth_map + :return: + """ + x_range = np.arange(0, depth_map.shape[1]) + y_range = np.arange(0, depth_map.shape[0]) + x_idxs, y_idxs = np.meshgrid(x_range, y_range) + x_idxs, y_idxs = x_idxs.reshape(-1), y_idxs.reshape(-1) + depth = depth_map[y_idxs, x_idxs] + pts_rect = self.img_to_rect(x_idxs, y_idxs, depth) + return pts_rect, x_idxs, y_idxs + + def corners3d_to_img_boxes(self, corners3d): + """ + :param corners3d: (N, 8, 3) corners in rect coordinate + :return: boxes: (None, 4) [x1, y1, x2, y2] in rgb coordinate + :return: boxes_corner: (None, 8) [xi, yi] in rgb coordinate + """ + sample_num = corners3d.shape[0] + corners3d_hom = np.concatenate((corners3d, np.ones((sample_num, 8, 1))), axis=2) # (N, 8, 4) + + img_pts = np.matmul(corners3d_hom, self.P2.T) # (N, 8, 3) + + x, y = img_pts[:, :, 0] / img_pts[:, :, 2], img_pts[:, :, 1] / img_pts[:, :, 2] + x1, y1 = np.min(x, axis=1), np.min(y, axis=1) + x2, y2 = np.max(x, axis=1), np.max(y, axis=1) + + boxes = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1), x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) + boxes_corner = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1)), axis=2) + + return boxes, boxes_corner + + def camera_dis_to_rect(self, u, v, d): + """ + Can only process valid u, v, d, which means u, v can not beyond the image shape, reprojection error 0.02 + :param u: (N) + :param v: (N) + :param d: (N), the distance between camera and 3d points, d^2 = x^2 + y^2 + z^2 + :return: + """ + assert self.fu == self.fv, '%.8f != %.8f' % (self.fu, self.fv) + fd = np.sqrt((u - self.cu) ** 2 + (v - self.cv) ** 2 + self.fu ** 2) + x = ((u - self.cu) * d) / fd + self.tx + y = ((v - self.cv) * d) / fd + self.ty + z = np.sqrt(d ** 2 - x ** 2 - y ** 2) + pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)), axis=1) + return pts_rect + + def inverse_rigid_trans(self, Tr): + ''' Inverse a rigid body transform matrix (3x4 as [R|t]) + [R'|-R't; 0|1] + ''' + inv_Tr = np.zeros_like(Tr) # 3x4 + inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3]) + inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3]) + return inv_Tr + + def alpha2ry(self, alpha, u): + """ + Get rotation_y by alpha + theta - 180 + alpha : Observation angle of object, ranging [-pi..pi] + x : Object center x to the camera center (x-W/2), in pixels + rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi] + """ + ry = alpha + np.arctan2(u - self.cu, self.fu) + + if ry > np.pi: + ry -= 2 * np.pi + if ry < -np.pi: + ry += 2 * np.pi + + return ry + + def ry2alpha(self, ry, u): + alpha = ry - np.arctan2(u - self.cu, self.fu) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + + def flip(self,img_size): + wsize = 4 + hsize = 2 + p2ds = (np.concatenate([np.expand_dims(np.tile(np.expand_dims(np.linspace(0,img_size[0],wsize),0),[hsize,1]),-1),\ + np.expand_dims(np.tile(np.expand_dims(np.linspace(0,img_size[1],hsize),1),[1,wsize]),-1), + np.linspace(2,78,wsize*hsize).reshape(hsize,wsize,1)],-1)).reshape(-1,3) + p3ds = self.img_to_rect(p2ds[:,0:1],p2ds[:,1:2],p2ds[:,2:3]) + p3ds[:,0]*=-1 + p2ds[:,0] = img_size[0] - p2ds[:,0] + + #self.P2[0,3] *= -1 + cos_matrix = np.zeros([wsize*hsize,2,7]) + cos_matrix[:,0,0] = p3ds[:,0] + cos_matrix[:,0,1] = cos_matrix[:,1,2] = p3ds[:,2] + cos_matrix[:,1,0] = 
p3ds[:,1] + cos_matrix[:,0,3] = cos_matrix[:,1,4] = 1 + cos_matrix[:,:,-2] = -p2ds[:,:2] + cos_matrix[:,:,-1] = (-p2ds[:,:2]*p3ds[:,2:3]) + new_calib = np.linalg.svd(cos_matrix.reshape(-1,7))[-1][-1] + new_calib /= new_calib[-1] + + new_calib_matrix = np.zeros([4,3]).astype(np.float32) + new_calib_matrix[0,0] = new_calib_matrix[1,1] = new_calib[0] + new_calib_matrix[2,0:2] = new_calib[1:3] + new_calib_matrix[3,:] = new_calib[3:6] + new_calib_matrix[-1,-1] = self.P2[-1,-1] + self.P2 = new_calib_matrix.T + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + +################### affine trainsform ################### + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=np.array([0, 0], dtype=np.float32), + inv=0): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + scale = np.array([scale, scale], dtype=np.float32) + + scale_tmp = scale + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + trans_inv = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + return trans, trans_inv + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +if __name__ == '__main__': + from lib.datasets.kitti.kitti_dataset import KITTI_Dataset + cfg = {'root_dir': '../../../data'} + dataset = KITTI_Dataset('train', cfg) + + # calib testing + # we project center fo 3D objects to image plane + index = 1 + calib = dataset.get_calib(index) + objects = dataset.get_label(index) + for object in objects: + print(object.to_kitti_format()) + object.pos[0] *= 1 + center_3d = object.pos + [0, -object.h/2, 0] # real 3D center + center_3d = center_3d.reshape(-1, 3) #(N, 3) + center_3d_projected, depth = calib.rect_to_img(center_3d) + box2d = object.box2d + center_2d = [(box2d[0]+box2d[2])/2, (box2d[1]+box2d[3])/2] + print ('3D center/2D center/projected 3D center:', center_3d, center_2d, center_3d_projected) + print('alpha ---> ry ', object.alpha, calib.alpha2ry(object.alpha, center_2d[0])) + break \ No newline at end of file diff --git a/util/utils.py b/util/utils.py new file mode 100644 index 000000000..c59a91f50 --- /dev/null +++ b/util/utils.py @@ -0,0 +1,123 @@ +''' some auxiliary functions for all datasets ''' +import numpy as np +import cv2 + + +num_heading_bin = 12 # hyper param + +def angle2class(angle): + 
''' Convert continuous angle to discrete class and residual. ''' + angle = angle % (2 * np.pi) + assert (angle >= 0 and angle <= 2 * np.pi) + angle_per_class = 2 * np.pi / float(num_heading_bin) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle + + +def class2angle(cls, residual, to_label_format=False): + ''' Inverse function to angle2class. ''' + angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_center = cls * angle_per_class + angle = angle_center + residual + if to_label_format and angle > np.pi: + angle = angle - 2 * np.pi + return angle + + +def gaussian_radius(bbox_size, min_overlap=0.7): + height, width = bbox_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + return min(r1, r2, r3) + + +def gaussian2D(shape, sigma=1): + m, n = [(ss - 1.) / 2. for ss in shape] + y, x = np.ogrid[-m:m+1,-n:n+1] + + h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def draw_umich_gaussian(heatmap, center, radius, k=1): + diameter = 2 * radius + 1 + gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) + x, y = int(center[0]), int(center[1]) + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right] + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + return heatmap + + +def draw_msra_gaussian(heatmap, center, sigma): + tmp_size = sigma * 3 + mu_x = int(center[0] + 0.5) + mu_y = int(center[1] + 0.5) + w, h = heatmap.shape[0], heatmap.shape[1] + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= h or ul[1] >= w or br[0] < 0 or br[1] < 0: + return heatmap + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) + g_x = max(0, -ul[0]), min(br[0], h) - ul[0] + g_y = max(0, -ul[1]), min(br[1], w) - ul[1] + img_x = max(0, ul[0]), min(br[0], h) + img_y = max(0, ul[1]), min(br[1], w) + heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum( + heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]], + g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) + return heatmap + + +def draw_projected_box3d(image, corners3d, color=(255, 255, 255), thickness=1): + ''' Draw 3d bounding box in image + input: + image: RGB image + corners3d: (8,3) array of vertices (in image plane) for the 3d box in following order: + 1 -------- 0 + /| /| + 2 -------- 3 . + | | | | + . 
5 -------- 4 + |/ |/ + 6 -------- 7 + ''' + + corners3d = corners3d.astype(np.int32) + for k in range(0, 4): + i, j = k, (k + 1) % 4 + cv2.line(image, (corners3d[i, 0], corners3d[i, 1]), (corners3d[j, 0], corners3d[j, 1]), color, thickness, lineType=cv2.LINE_AA) + i, j = k + 4, (k + 1) % 4 + 4 + cv2.line(image, (corners3d[i, 0], corners3d[i, 1]), (corners3d[j, 0], corners3d[j, 1]), color, thickness, lineType=cv2.LINE_AA) + i, j = k, k + 4 + cv2.line(image, (corners3d[i, 0], corners3d[i, 1]), (corners3d[j, 0], corners3d[j, 1]), color, thickness, lineType=cv2.LINE_AA) + return image \ No newline at end of file From 5335caa299ae84c430291414eabfa89a30a6ded3 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 16:48:46 -0700 Subject: [PATCH 07/22] dimension and angle --- datasets/coco.py | 22 ++++++++++++-- kittidim.py | 2 +- kittihbins.py | 4 +-- main.py | 2 ++ models/detr.py | 75 ++++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 92 insertions(+), 13 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index bdd39196f..18b486ccd 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -17,12 +17,18 @@ class CocoDetection(torchvision.datasets.CocoDetection): - def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = None): + def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = None, dim_data = None, heading_bin_data = None, heading_res_data = None): super(CocoDetection, self).__init__(img_folder, ann_file) self._transforms = transforms self.prepare = ConvertCocoPolysToMask(return_masks) if bev_data is not None: self.bev_data = json.load(open(bev_data)) + if dim_data is not None: + self.dim_data = json.load(open(dim_data)) + if heading_bin_data is not None: + self.heading_bin_data = json.load(open(heading_bin_data)) + if heading_res_data is not None: + self.heading_res_data = json.load(open(heading_res_data)) def __getitem__(self, idx): img, target = super(CocoDetection, self).__getitem__(idx) @@ -31,8 +37,17 @@ def __getitem__(self, idx): img, target = self.prepare(img, target) if self._transforms is not None: img, target = self._transforms(img, target) + # load the bev data target['bev'] = torch.tensor(self.bev_data[str(image_id)]) assert target['bev'].size()[0] == target['boxes'].size()[0] + # load the dimension data + target['dim'] = torch.tensor(self.dim_data[str(image_id)]) + assert target['dim'].size()[0] == target['boxes'].size()[0] + # load the angle data + target['heading_bin'] = torch.tensor(self.heading_bin_data[str(image_id)]) + assert target['heading_bin'].size()[0] == target['boxes'].size()[0] + target['heading_res'] = torch.tensor(self.heading_res_data[str(image_id)]) + assert target['heading_res'].size()[0] == target['boxes'].size()[0] return img, target @@ -175,5 +190,8 @@ def build_kitti_coco(image_set, args): "val": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), } BEV_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_%s.json"%(image_set) + DIM_DATA = "/srip-vol/datasets/KITTI3D/coco/dim_%s.json"%(image_set) + HEADING_BIN_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json"%(image_set) + HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] - dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA) \ No newline at end of file + dataset = 
CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) \ No newline at end of file diff --git a/kittidim.py b/kittidim.py index 006e44494..91def6cf3 100644 --- a/kittidim.py +++ b/kittidim.py @@ -37,6 +37,6 @@ bevdim_data[i+1] = bevdim #torch.tensor(depth) # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/bevdim_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_dim_%s.json' %(split) with open(output_path, "w") as outfile: json.dump(bevdim_data, outfile) diff --git a/kittihbins.py b/kittihbins.py index 0d0cd3d9a..8766091e6 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -92,8 +92,8 @@ def angle2class(angle): # x_c = float(label_data[11]) # z_c = float(label_data[13]) - heading_bins.append([heading_bin]) - heading_ress.append([heading_res]) + heading_bins.append(heading_bin) + heading_ress.append(heading_res) # Save in dict heading_bins_data[i+1] = heading_bins #torch.tensor(depth) diff --git a/main.py b/main.py index d61f229a2..317ffb485 100644 --- a/main.py +++ b/main.py @@ -82,6 +82,8 @@ def get_args_parser(): parser.add_argument('--bbox_loss_coef', default=5, type=float) parser.add_argument('--giou_loss_coef', default=2, type=float) parser.add_argument('--bev_loss_coef', default=2, type=float) + parser.add_argument('--dim_loss_coef', default=2, type=float) + parser.add_argument('--angle_loss_coef', default=1, type=float) parser.add_argument('--eos_coef', default=0.1, type=float, help="Relative classification weight of the no-object class") diff --git a/models/detr.py b/models/detr.py index 9011bcc20..e7360a09f 100644 --- a/models/detr.py +++ b/models/detr.py @@ -40,7 +40,9 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) self.backbone = backbone self.aux_loss = aux_loss - self.bev_embed = nn.Linear(hidden_dim, 2) + self.bev_embed = MLP(hidden_dim, hidden_dim, 2, 2) + self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2) + self.dim_embed = MLP(hidden_dim, hidden_dim, 2, 2) def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: @@ -68,18 +70,20 @@ def forward(self, samples: NestedTensor): outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() outputs_bev = self.bev_embed(hs) - out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'pred_bev': outputs_bev[-1]} + outputs_dim = self.dim_embed(hs) + outputs_angle = self.angle_embed(hs) + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'pred_bev': outputs_bev[-1], 'pred_dim': outputs_dim[-1], 'pred_angle': outputs_angle[-1]} if self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_bev) + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_bev, outputs_dim, outputs_angle) return out @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord, outputs_bev): + def _set_aux_loss(self, outputs_class, outputs_coord, outputs_bev, outputs_dim, outputs_angle): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. 
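# one dict per intermediate decoder layer, so the auxiliary losses also supervise the new bev/dim/angle heads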
- return [{'pred_logits': a, 'pred_boxes': b, 'pred_bev': c} - for a, b, c in zip(outputs_class[:-1], outputs_coord[:-1], outputs_bev[:-1])] + return [{'pred_logits': a, 'pred_boxes': b, 'pred_bev': c, 'pred_dim': d, 'pred_angle': e} + for a, b, c, d, e in zip(outputs_class[:-1], outputs_coord[:-1], outputs_bev[:-1], outputs_dim[:-1], outputs_angle[:-1])] class SetCriterion(nn.Module): @@ -201,6 +205,57 @@ def loss_bev(self, outputs, targets, indices, num_boxes): losses = {'loss_bev' : loss} return losses + def loss_dims(self, outputs, targets, indices, num_boxes): + assert 'pred_dim' in outputs + # idx = self._get_src_permutation_idx(indices) + # src_dim = outputs['pred_dim'][idx].squeeze() + # target_dim = torch.cat([t['dim'][i] for t, (_, i) in zip(targets, indices)]) + # loss = F.mse_loss(src_dim, target_dim) + # losses = {'loss_bev' : loss} + + idx = self._get_src_permutation_idx(indices) + src_dims = outputs['pred_dim'][idx] + target_dims = torch.cat([t['dim'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + dimension = target_dims.clone().detach() + dim_loss = torch.abs(src_dims - target_dims) + dim_loss /= dimension + with torch.no_grad(): + compensation_weight = F.l1_loss(src_dims, target_dims) / dim_loss.mean() + dim_loss *= compensation_weight + losses = {} + losses['loss_dim'] = dim_loss.sum() / num_boxes + + return losses + + def loss_angles(self, outputs, targets, indices, num_boxes): + + idx = self._get_src_permutation_idx(indices) + heading_input = outputs['pred_angle'][idx] + target_heading_cls = torch.cat([t['heading_bin'][i] for t, (_, i) in zip(targets, indices)], dim=0) + target_heading_res = torch.cat([t['heading_res'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + heading_input = heading_input.view(-1, 24) + heading_target_cls = target_heading_cls.view(-1).long() + heading_target_res = target_heading_res.view(-1) + + # classification loss + heading_input_cls = heading_input[:, 0:12] + cls_loss = F.cross_entropy(heading_input_cls, heading_target_cls, reduction='none') + + # regression loss + heading_input_res = heading_input[:, 12:24] + cls_onehot = torch.zeros(heading_target_cls.shape[0], 12).cuda().scatter_(dim=1, index=heading_target_cls.view(-1, 1), value=1) + heading_input_res = torch.sum(heading_input_res * cls_onehot, 1) + reg_loss = F.l1_loss(heading_input_res, heading_target_res, reduction='none') + + angle_loss = cls_loss + reg_loss + losses = {} + losses['loss_angle'] = angle_loss.sum() / num_boxes + return losses + + + def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) @@ -225,7 +280,9 @@ def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): 'cardinality': self.loss_cardinality, 'boxes': self.loss_boxes, 'masks': self.loss_masks, - 'bev': self.loss_bev + 'bev': self.loss_bev, + 'dim': self.loss_dims, + 'angle': self.loss_angles } assert loss in loss_map, f'do you really want to compute {loss} loss?' 
return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) @@ -353,6 +410,8 @@ def build(args): weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} weight_dict['loss_giou'] = args.giou_loss_coef weight_dict['loss_bev'] = args.bev_loss_coef + weight_dict['loss_dim'] = args.dim_loss_coef + weight_dict['loss_angle'] = args.angle_loss_coef if args.masks: weight_dict["loss_mask"] = args.mask_loss_coef weight_dict["loss_dice"] = args.dice_loss_coef @@ -363,7 +422,7 @@ def build(args): aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) - losses = ['labels', 'boxes', 'cardinality', 'bev'] + losses = ['labels', 'boxes', 'cardinality', 'bev', 'dim', 'angle'] if args.masks: losses += ["masks"] criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, From 415e91a5f514fcbeac423a7038bf877aa90dcd6e Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 17:26:09 -0700 Subject: [PATCH 08/22] fix some small error --- kittihbins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kittihbins.py b/kittihbins.py index 8766091e6..7d5dbdf45 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -63,7 +63,7 @@ def angle2class(angle): # obj = lines_calib[5].strip().split(' ')[1:] # Tr_velo_to_cam = np.array(obj, dtype=np.float32) - P2.reshape(3, 4) + P2 = P2.reshape(3, 4) # P3.reshape(3, 4) # R0.reshape(3, 3) # Tr_velo_to_cam.reshape(3, 4) From 1a82b436cc89823fba68c574264bd59517d52e43 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 22:06:22 -0700 Subject: [PATCH 09/22] fix small error --- datasets/coco.py | 3 ++- models/detr.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index 18b486ccd..d817cde1d 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -194,4 +194,5 @@ def build_kitti_coco(image_set, args): HEADING_BIN_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json"%(image_set) HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] - dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) \ No newline at end of file + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) + return dataset \ No newline at end of file diff --git a/models/detr.py b/models/detr.py index e7360a09f..2250b9356 100644 --- a/models/detr.py +++ b/models/detr.py @@ -386,7 +386,7 @@ def build(args): # For more details on this, check the following discussion # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 # num_classes = 20 if args.dataset_file != 'coco' else 91 - num_classes = arg.num_classes+1 if args.dataset_file != 'coco' else 91 + num_classes = args.num_classes+1 if args.dataset_file != 'coco' else 91 if args.dataset_file == "coco_panoptic": # for panoptic, we just add a num_classes that is large enough to hold # max_obj_id + 1, but the exact value doesn't really matter From dcb579a58dc2535cef3ae861958e3f276311f85c Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 14 Jul 2022 21:18:44 
-0700 Subject: [PATCH 10/22] fix small error --- kittihbins.py | 85 +++++++++------------------------------------------ 1 file changed, 14 insertions(+), 71 deletions(-) diff --git a/kittihbins.py b/kittihbins.py index 7d5dbdf45..54d80ac00 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -1,109 +1,52 @@ # ----------------------------------------------------------------------------------- -# To generate bev +# To generate bev heading bins # ----------------------------------------------------------------------------------- import os import os.path as osp import json # import torch from tqdm import tqdm -import numpy as np -import cv2 -def ry2alpha(ry, u, cu, fu): - alpha = ry - np.arctan2(u - cu, fu) - - if alpha > np.pi: - alpha -= 2 * np.pi - if alpha < -np.pi: - alpha += 2 * np.pi +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bev_data = {} - return alpha +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} def angle2class(angle): ''' Convert continuous angle to discrete class and residual. ''' angle = angle % (2 * np.pi) assert (angle >= 0 and angle <= 2 * np.pi) - angle_per_class = 2 * np.pi / float(12) + angle_per_class = 2 * np.pi / float(num_heading_bin) shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) class_id = int(shifted_angle / angle_per_class) residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) return class_id, residual_angle -split = 'val' -data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) -data = json.load(open(data_path)) -heading_bins_data = {} -heading_ress_data = {} - -KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} - for i in tqdm(range(len(data['images']))): # Label path img_path = data['images'][i]['file_name'].split('/') img_path[-2] = 'label_2' img_path[-1] = img_path[-1].split('.')[0] + '.txt' label_path = '/' + osp.join(*img_path) - - # Calibration path - img_path = data['images'][i]['file_name'].split('/') - img_path[-2] = 'calib' - img_path[-1] = img_path[-1].split('.')[0] + '.txt' - calib_path = '/' + osp.join(*img_path) # Read annotations and assembler point depth value lines = open(label_path).readlines() - lines_calib = open(calib_path).readlines() - - obj = lines_calib[2].strip().split(' ')[1:] - P2 = np.array(obj, dtype=np.float32) - # obj = lines_calib[3].strip().split(' ')[1:] - # P3 = np.array(obj, dtype=np.float32) - # obj = lines_calib[4].strip().split(' ')[1:] - # R0 = np.array(obj, dtype=np.float32) - # obj = lines_calib[5].strip().split(' ')[1:] - # Tr_velo_to_cam = np.array(obj, dtype=np.float32) - - P2 = P2.reshape(3, 4) - # P3.reshape(3, 4) - # R0.reshape(3, 3) - # Tr_velo_to_cam.reshape(3, 4) - - # cv = P2[1, 2] - # fv = P2[1, 1] - # tx = P2[0, 3] / (-fu) - # ty = P2[1, 3] / (-fv) - cu = P2[0, 2] - fu = P2[0, 0] - - heading_bins = [] - heading_ress = [] + bev = [] for line in lines: label_data = line.split(' ') if(KITTI_CLASS.get(label_data[0],-1) == -1): continue - bbx0 = float(label_data[4]) - bbx2 = float(label_data[6]) - ry = float(label_data[14]) - - heading_angle = ry2alpha(ry, (bbx0 + bbx2) / 2, cu, fu) - if heading_angle > np.pi: heading_angle -= 2 * np.pi # check range - if heading_angle < -np.pi: heading_angle += 2 * np.pi - heading_bin, heading_res = angle2class(heading_angle) - # x_c = float(label_data[11]) - # z_c = float(label_data[13]) + x_c = float(label_data[11]) + z_c = float(label_data[13]) - heading_bins.append(heading_bin) - heading_ress.append(heading_res) + bev.append([x_c, 
z_c]) # Save in dict - heading_bins_data[i+1] = heading_bins #torch.tensor(depth) - heading_ress_data[i+1] = heading_ress + bev_data[i+1] = bev #torch.tensor(depth) # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json' %(split) -with open(output_path, "w") as outfile: - json.dump(heading_bins_data, outfile) - -output_path = '/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_hbins_%s.json' %(split) with open(output_path, "w") as outfile: - json.dump(heading_ress_data, outfile) + json.dump(bev_data, outfile) From 08a26e7e366b988dd9e9e9cbe7b00f9be62572aa Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 14 Jul 2022 21:20:54 -0700 Subject: [PATCH 11/22] kittibins --- kittihbins.py | 85 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/kittihbins.py b/kittihbins.py index 54d80ac00..a1bdf3a7f 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -1,52 +1,109 @@ # ----------------------------------------------------------------------------------- -# To generate bev heading bins +# To generate bev bins # ----------------------------------------------------------------------------------- import os import os.path as osp import json # import torch from tqdm import tqdm +import numpy as np +import cv2 -split = 'val' -data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) -data = json.load(open(data_path)) -bev_data = {} +def ry2alpha(ry, u, cu, fu): + alpha = ry - np.arctan2(u - cu, fu) -KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha def angle2class(angle): ''' Convert continuous angle to discrete class and residual. 
''' angle = angle % (2 * np.pi) assert (angle >= 0 and angle <= 2 * np.pi) - angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_per_class = 2 * np.pi / float(12) shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) class_id = int(shifted_angle / angle_per_class) residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) return class_id, residual_angle +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +heading_bins_data = {} +heading_ress_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + for i in tqdm(range(len(data['images']))): # Label path img_path = data['images'][i]['file_name'].split('/') img_path[-2] = 'label_2' img_path[-1] = img_path[-1].split('.')[0] + '.txt' label_path = '/' + osp.join(*img_path) + + # Calibration path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'calib' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + calib_path = '/' + osp.join(*img_path) # Read annotations and assembler point depth value lines = open(label_path).readlines() - bev = [] + lines_calib = open(calib_path).readlines() + + obj = lines_calib[2].strip().split(' ')[1:] + P2 = np.array(obj, dtype=np.float32) + # obj = lines_calib[3].strip().split(' ')[1:] + # P3 = np.array(obj, dtype=np.float32) + # obj = lines_calib[4].strip().split(' ')[1:] + # R0 = np.array(obj, dtype=np.float32) + # obj = lines_calib[5].strip().split(' ')[1:] + # Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + P2 = P2.reshape(3, 4) + # P3.reshape(3, 4) + # R0.reshape(3, 3) + # Tr_velo_to_cam.reshape(3, 4) + + # cv = P2[1, 2] + # fv = P2[1, 1] + # tx = P2[0, 3] / (-fu) + # ty = P2[1, 3] / (-fv) + cu = P2[0, 2] + fu = P2[0, 0] + + heading_bins = [] + heading_ress = [] for line in lines: label_data = line.split(' ') if(KITTI_CLASS.get(label_data[0],-1) == -1): continue - x_c = float(label_data[11]) - z_c = float(label_data[13]) + bbx0 = float(label_data[4]) + bbx2 = float(label_data[6]) + ry = float(label_data[14]) + + heading_angle = ry2alpha(ry, (bbx0 + bbx2) / 2, cu, fu) + if heading_angle > np.pi: heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: heading_angle += 2 * np.pi + heading_bin, heading_res = angle2class(heading_angle) + # x_c = float(label_data[11]) + # z_c = float(label_data[13]) - bev.append([x_c, z_c]) + heading_bins.append(heading_bin) + heading_ress.append(heading_res) # Save in dict - bev_data[i+1] = bev #torch.tensor(depth) + heading_bins_data[i+1] = heading_bins #torch.tensor(depth) + heading_ress_data[i+1] = heading_ress # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/bev_hbins_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(heading_bins_data, outfile) + +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json' %(split) with open(output_path, "w") as outfile: - json.dump(bev_data, outfile) + json.dump(heading_ress_data, outfile) From 952eae385119716dc7bf14b5ef873c4f56dfb60c Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:03:32 -0700 Subject: [PATCH 12/22] Update coco.py for the ground truth query part --- datasets/coco.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index d817cde1d..50e7df8d0 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -23,6 
+23,7 @@ def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = No self.prepare = ConvertCocoPolysToMask(return_masks) if bev_data is not None: self.bev_data = json.load(open(bev_data)) + self.bev_coor = json.load(open(bev_data)) if dim_data is not None: self.dim_data = json.load(open(dim_data)) if heading_bin_data is not None: @@ -48,7 +49,10 @@ def __getitem__(self, idx): assert target['heading_bin'].size()[0] == target['boxes'].size()[0] target['heading_res'] = torch.tensor(self.heading_res_data[str(image_id)]) assert target['heading_res'].size()[0] == target['boxes'].size()[0] - return img, target + b_coordinate = torch.tensor([self.bev_coor[str(image_id)]]) + # b_coordinate = self.bev_coor[str(image_id)] + return img, target, b_coordinate + def convert_coco_poly_to_mask(segmentations, height, width): @@ -195,4 +199,4 @@ def build_kitti_coco(image_set, args): HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) - return dataset \ No newline at end of file + return dataset From 34b51ea34429f04a1fc94a27bfdb99c2b48f757a Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:05:48 -0700 Subject: [PATCH 13/22] Update engine.py for ground truth query part --- engine.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/engine.py b/engine.py index ac5ea6ff4..0180bad8f 100644 --- a/engine.py +++ b/engine.py @@ -25,11 +25,25 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, header = 'Epoch: [{}]'.format(epoch) print_freq = 10 - for samples, targets in metric_logger.log_every(data_loader, print_freq, header): + for samples, targets, b_coordinates in metric_logger.log_every(data_loader, print_freq, header): + # print(samples) samples = samples.to(device) + + # size + # print(b_coordinates[0].size()) + # print(b_coordinates[3]) + b_m1 = torch.mean(b_coordinates[0], 1, True) + # print(b_m1.size()) + # print(b_coordinates[3]) + b_m2 = torch.mean(b_coordinates[1], 1, True) + temp = torch.cat((b_m1, b_m2), 0) + temp = temp.squeeze(1) + temp = temp.to(device) + + # print(temp.size()) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - outputs = model(samples) + outputs = model(samples, temp) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) @@ -85,11 +99,26 @@ def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, out output_dir=os.path.join(output_dir, "panoptic_eval"), ) - for samples, targets in metric_logger.log_every(data_loader, 10, header): + for samples, targets, b_coordinates in metric_logger.log_every(data_loader, 10, header): samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - outputs = model(samples) + # print(b_coordinates) + if len(b_coordinates) == 0: + print(samples) + temp = torch.zeros([1, 2], dtype=torch.float64) + elif len(b_coordinates) == 1: + b_m1 = torch.mean(b_coordinates[0], 1, True) + temp = b_m1.squeeze(1) + else: + b_m1 = torch.mean(b_coordinates[0], 1, True) + b_m2 = torch.mean(b_coordinates[1], 1, True) + temp = torch.cat((b_m1, b_m2), 
0) + temp = temp.squeeze(1) + # temp = b_m1.squeeze(1) + temp = temp.to(device) + + outputs = model(samples, temp) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict From f7dfbb480734df903abab65b35273d7404b6ff76 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:07:13 -0700 Subject: [PATCH 14/22] Update detr.py for ground truth query part --- models/detr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/detr.py b/models/detr.py index 2250b9356..89009c9da 100644 --- a/models/detr.py +++ b/models/detr.py @@ -44,7 +44,7 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2) self.dim_embed = MLP(hidden_dim, hidden_dim, 2, 2) - def forward(self, samples: NestedTensor): + def forward(self, samples: NestedTensor, b_coordinate): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels @@ -65,7 +65,7 @@ def forward(self, samples: NestedTensor): src, mask = features[-1].decompose() assert mask is not None - hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] + hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1], b_coordinate)[0] outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() From 4cdaf63a2818c3e96a43603a1d8da5698ebefb17 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:08:31 -0700 Subject: [PATCH 15/22] Update transformer.py for query part --- models/transformer.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/models/transformer.py b/models/transformer.py index dcd536750..02a1940ea 100644 --- a/models/transformer.py +++ b/models/transformer.py @@ -38,20 +38,34 @@ def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, self.d_model = d_model self.nhead = nhead + + self.linear_b = nn.Linear(2, 100) + self.conv_b = nn.Conv1d(in_channels=1, out_channels=256,kernel_size=1) def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward(self, src, mask, query_embed, pos_embed): + def forward(self, src, mask, query_embed, pos_embed, b_coordinate): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) mask = mask.flatten(1) - + + # From N*2 to N*num_queries + b_coordinate = self.linear_b(b_coordinate) + # From N*num_queries to N*1*num_queries + b_coordinate = b_coordinate.unsqueeze(1) + # From N*1*num_quries to N*hidden_dim*num_queries + b_coordinate = self.conv_b(b_coordinate) + # From N*hidden_dim*num_queries to num_quries*N*hidden_dim + b_coordinate = b_coordinate.permute(2, 0, 1) + + query_embed = query_embed + b_coordinate + tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, From 3a7615e9ae3f5a89426cb0c089e448d605981055 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 28 Jul 2022 15:40:48 -0700 Subject: [PATCH 16/22] Let the output of the image view transformer be the query of the BEV 
transformer --- datasets/coco.py | 5 +- engine.py | 36 +---- models/detr.py | 12 +- models/transformer.py | 13 +- models/transformer_BEV.py | 313 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 335 insertions(+), 44 deletions(-) create mode 100644 models/transformer_BEV.py diff --git a/datasets/coco.py b/datasets/coco.py index 50e7df8d0..b1ce6201a 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -49,9 +49,10 @@ def __getitem__(self, idx): assert target['heading_bin'].size()[0] == target['boxes'].size()[0] target['heading_res'] = torch.tensor(self.heading_res_data[str(image_id)]) assert target['heading_res'].size()[0] == target['boxes'].size()[0] - b_coordinate = torch.tensor([self.bev_coor[str(image_id)]]) + # b_coordinate = torch.tensor([self.bev_coor[str(image_id)]]) # b_coordinate = self.bev_coor[str(image_id)] - return img, target, b_coordinate + return img, target + diff --git a/engine.py b/engine.py index 0180bad8f..1a8cc03f7 100644 --- a/engine.py +++ b/engine.py @@ -25,25 +25,12 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, header = 'Epoch: [{}]'.format(epoch) print_freq = 10 - for samples, targets, b_coordinates in metric_logger.log_every(data_loader, print_freq, header): - # print(samples) + for samples, targets in metric_logger.log_every(data_loader, print_freq, header): samples = samples.to(device) - # size - # print(b_coordinates[0].size()) - # print(b_coordinates[3]) - b_m1 = torch.mean(b_coordinates[0], 1, True) - # print(b_m1.size()) - # print(b_coordinates[3]) - b_m2 = torch.mean(b_coordinates[1], 1, True) - temp = torch.cat((b_m1, b_m2), 0) - temp = temp.squeeze(1) - temp = temp.to(device) - - # print(temp.size()) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - outputs = model(samples, temp) + outputs = model(samples) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) @@ -99,26 +86,11 @@ def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, out output_dir=os.path.join(output_dir, "panoptic_eval"), ) - for samples, targets, b_coordinates in metric_logger.log_every(data_loader, 10, header): + for samples, targets in metric_logger.log_every(data_loader, 10, header): samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - # print(b_coordinates) - if len(b_coordinates) == 0: - print(samples) - temp = torch.zeros([1, 2], dtype=torch.float64) - elif len(b_coordinates) == 1: - b_m1 = torch.mean(b_coordinates[0], 1, True) - temp = b_m1.squeeze(1) - else: - b_m1 = torch.mean(b_coordinates[0], 1, True) - b_m2 = torch.mean(b_coordinates[1], 1, True) - temp = torch.cat((b_m1, b_m2), 0) - temp = temp.squeeze(1) - # temp = b_m1.squeeze(1) - temp = temp.to(device) - - outputs = model(samples, temp) + outputs = model(samples) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict diff --git a/models/detr.py b/models/detr.py index 89009c9da..b4d497215 100644 --- a/models/detr.py +++ b/models/detr.py @@ -16,11 +16,12 @@ from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm, dice_loss, sigmoid_focal_loss) from .transformer import build_transformer +from .transformer_BEV import build_transformer_BEV class DETR(nn.Module): """ This is the DETR module that performs object detection """ - def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): + def __init__(self, backbone, 
transformer, transformer_BEV, num_classes, num_queries, aux_loss=False): """ Initializes the model. Parameters: backbone: torch module of the backbone to be used. See backbone.py @@ -33,6 +34,7 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal super().__init__() self.num_queries = num_queries self.transformer = transformer + self.transformer_BEV = transformer_BEV hidden_dim = transformer.d_model self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) @@ -44,7 +46,7 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2) self.dim_embed = MLP(hidden_dim, hidden_dim, 2, 2) - def forward(self, samples: NestedTensor, b_coordinate): + def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels @@ -65,7 +67,9 @@ def forward(self, samples: NestedTensor, b_coordinate): src, mask = features[-1].decompose() assert mask is not None - hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1], b_coordinate)[0] + query_B = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] + print(query_B.size()) + hs = self.transformer_BEV(self.input_proj(src), mask, self.query_embed.weight, pos[-1], query_B)[0] outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() @@ -396,10 +400,12 @@ def build(args): backbone = build_backbone(args) transformer = build_transformer(args) + transformer_BEV = build_transformer_BEV(args) model = DETR( backbone, transformer, + transformer_BEV, num_classes=num_classes, num_queries=args.num_queries, aux_loss=args.aux_loss, diff --git a/models/transformer.py b/models/transformer.py index 02a1940ea..15fadde4b 100644 --- a/models/transformer.py +++ b/models/transformer.py @@ -47,7 +47,7 @@ def _reset_parameters(self): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward(self, src, mask, query_embed, pos_embed, b_coordinate): + def forward(self, src, mask, query_embed, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) @@ -56,15 +56,14 @@ def forward(self, src, mask, query_embed, pos_embed, b_coordinate): mask = mask.flatten(1) # From N*2 to N*num_queries - b_coordinate = self.linear_b(b_coordinate) + # b_coordinate = self.linear_b(b_coordinate) # From N*num_queries to N*1*num_queries - b_coordinate = b_coordinate.unsqueeze(1) + # b_coordinate = b_coordinate.unsqueeze(1) # From N*1*num_quries to N*hidden_dim*num_queries - b_coordinate = self.conv_b(b_coordinate) + # b_coordinate = self.conv_b(b_coordinate) # From N*hidden_dim*num_queries to num_quries*N*hidden_dim - b_coordinate = b_coordinate.permute(2, 0, 1) - - query_embed = query_embed + b_coordinate + # b_coordinate = b_coordinate.permute(2, 0, 1) + # query_embed = query_embed + b_coordinate tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) diff --git a/models/transformer_BEV.py b/models/transformer_BEV.py new file mode 100644 index 000000000..82a2a380f --- /dev/null +++ b/models/transformer_BEV.py @@ -0,0 +1,313 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. 
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import Optional, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + + +class Transformer(nn.Module): + + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, + num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False, + return_intermediate_dec=False): + super().__init__() + + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, + return_intermediate=return_intermediate_dec) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.linear_b = nn.Linear(2, 100) + self.conv_b = nn.Conv1d(in_channels=1, out_channels=256,kernel_size=1) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed, query_B): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + mask = mask.flatten(1) + + # From N*2 to N*num_queries + # b_coordinate = self.linear_b(b_coordinate) + # From N*num_queries to N*1*num_queries + # b_coordinate = b_coordinate.unsqueeze(1) + # From N*1*num_quries to N*hidden_dim*num_queries + # b_coordinate = self.conv_b(b_coordinate) + # From N*hidden_dim*num_queries to num_quries*N*hidden_dim + # b_coordinate = b_coordinate.permute(2, 0, 1) + # query_embed = query_embed + b_coordinate + query_B = query_B.permute(1, 0, 2) + query_embed = query_embed + query_B + + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, + pos=pos_embed, query_pos=query_embed) + return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer(output, src_mask=mask, + src_key_padding_mask=src_key_padding_mask, pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = 
None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + 
self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def build_transformer_BEV(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + ) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") From 235510bf10e16d9e121ff1568aa979618e9b3d17 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 28 Jul 2022 16:04:24 -0700 Subject: [PATCH 17/22] 
fix small bugs --- datasets/coco.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index b1ce6201a..fbcdba863 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -195,9 +195,9 @@ def build_kitti_coco(image_set, args): "val": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), } BEV_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_%s.json"%(image_set) - DIM_DATA = "/srip-vol/datasets/KITTI3D/coco/dim_%s.json"%(image_set) + DIM_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_dim_%s.json"%(image_set) HEADING_BIN_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json"%(image_set) HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] - dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, dim_data = DIM_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) return dataset From d718fc3ddb07217aec39eafdbb92a602e7ecb13b Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 28 Jul 2022 16:22:21 -0700 Subject: [PATCH 18/22] change the dimension of the query to fit the model --- models/transformer_BEV.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/models/transformer_BEV.py b/models/transformer_BEV.py index 82a2a380f..8742adc62 100644 --- a/models/transformer_BEV.py +++ b/models/transformer_BEV.py @@ -64,10 +64,17 @@ def forward(self, src, mask, query_embed, pos_embed, query_B): # From N*hidden_dim*num_queries to num_quries*N*hidden_dim # b_coordinate = b_coordinate.permute(2, 0, 1) # query_embed = query_embed + b_coordinate - query_B = query_B.permute(1, 0, 2) + + # 6 * N * num_queries * hidden_dim to num_queries * N * hidden_dim * 6 + query_B = query_B.permute(2, 1, 3, 0) + # num_queries * N * hidden_dim * 6 to num_queries * N * hidden_dim * 1 + query_B = self.linear_Q(query_B) + # num_queries * N * hidden_dim * 1 to num_queries * N * 1 * hidden_dim + query_B = query_B.permute(0, 1, 3, 2) + # num_queries * N * 1 * hidden_dim to num_queries * N * hidden_dim + query_B = query_B.squeeze(2) query_embed = query_embed + query_B - tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, From c8f5132737a728a6da8e873677d183ff3b1614e7 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:21:25 -0700 Subject: [PATCH 19/22] loss function for the bev center --- models/detr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/models/detr.py b/models/detr.py index b4d497215..0df42a636 100644 --- a/models/detr.py +++ b/models/detr.py @@ -205,8 +205,9 @@ def loss_bev(self, outputs, targets, indices, num_boxes): idx = self._get_src_permutation_idx(indices) src_bev = outputs['pred_bev'][idx].squeeze() target_bev = torch.cat([t['bev'][i] for t, (_, i) in zip(targets, indices)]) - loss = F.mse_loss(src_bev, target_bev) - losses = {'loss_bev' : loss} + loss = F.l1_loss(src_bev, target_bev, reduction='none') + losses = {} + losses['loss_center'] = loss.sum() / num_boxes
return losses From 8394e82d6af73554d401cec7ab1c6c41972741d3 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 2 Aug 2022 18:32:10 -0700 Subject: [PATCH 20/22] code to get the output of 1 batch images --- engine.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/engine.py b/engine.py index 1a8cc03f7..7f24f4753 100644 --- a/engine.py +++ b/engine.py @@ -31,6 +31,19 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, targets = [{k: v.to(device) for k, v in t.items()} for t in targets] outputs = model(samples) + + # print(image_id) + # outputs = model(samples, temp) + # torch.save(outputs, 'outputs.pt') + # # print(outputs) + # outputs_json = {k: v.item() for k, v in outputs.items()} + # tf = open("data.json", "w") + # json.dump(outputs_json, tf) + # tf.close() + + # with open('data.txt','w') as f: + # f.write(outputs) + loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) From bb10227e870afaf86dcd215809b924450e41c473 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 2 Aug 2022 22:25:29 -0700 Subject: [PATCH 21/22] fix small bugs --- models/detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/detr.py b/models/detr.py index 0df42a636..66a316fe0 100644 --- a/models/detr.py +++ b/models/detr.py @@ -207,7 +207,7 @@ def loss_bev(self, outputs, targets, indices, num_boxes): target_bev = torch.cat([t['bev'][i] for t, (_, i) in zip(targets, indices)]) loss = F.l1_loss(src_bev, target_bev, reduction='none') losses = {} - losses['loss_center'] = loss.sum() / num_boxes + losses['loss_bev'] = loss.sum() / num_boxes return losses def loss_dims(self, outputs, targets, indices, num_boxes): From 03fd5f2d694a6c77916eb08b638afacddfbaa4ce Mon Sep 17 00:00:00 2001 From: ESONG1999 Date: Fri, 12 Aug 2022 03:52:55 +0000 Subject: [PATCH 22/22] whole model (available for pretrained weight) --- main.py | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index 317ffb485..6af451592 100644 --- a/main.py +++ b/main.py @@ -37,6 +37,8 @@ def get_args_parser(): help="Path to the pretrained model. 
If set, only the mask head will be trained") # parser.add_argument('--bev_regression', action='store_false', # help="Add flag to regress bev directly") + # parser.add_argument('--depth_regression', action='store_true', + # help="Add flag to regress depth directly else use multi bin approach") # * Backbone parser.add_argument('--backbone', default='resnet50', type=str, help="Name of the convolutional backbone to use") @@ -65,6 +67,11 @@ def get_args_parser(): # * Segmentation parser.add_argument('--masks', action='store_true', help="Train segmentation head if the flag is provided") + # Depth + parser.add_argument('--num_depth_bins', type=int , default = 9, + help="Number of depth bins") + parser.add_argument('--depth_bin_res', type=int , default = 10, + help="Width of each depth bin") # Loss parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', @@ -81,7 +88,10 @@ def get_args_parser(): parser.add_argument('--dice_loss_coef', default=1, type=float) parser.add_argument('--bbox_loss_coef', default=5, type=float) parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--depth_loss_coef', default=1, type=float) parser.add_argument('--bev_loss_coef', default=2, type=float) + parser.add_argument('--head_loss_coef', default=2, type=float) + parser.add_argument('--feet_loss_coef', default=2, type=float) parser.add_argument('--dim_loss_coef', default=2, type=float) parser.add_argument('--angle_loss_coef', default=1, type=float) parser.add_argument('--eos_coef', default=0.1, type=float, @@ -102,7 +112,7 @@ def get_args_parser(): help='device to use for training / testing') parser.add_argument('--seed', default=42, type=int) # parser.add_argument('--resume', default='', help='resume from checkpoint') - parser.add_argument('--resume', default='pretrained/detr-r101-dc5-a2e86def.pth', help='resume from checkpoint') + parser.add_argument('--resume', default='pretrained/checkpoint0299.pth', help='resume from checkpoint') parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch') parser.add_argument('--eval', action='store_true') @@ -138,6 +148,23 @@ def main(args): model, criterion, postprocessors = build_model(args) model.to(device) + for name, param in model.transformer.named_parameters(): + param.requires_grad = False + + for name, param in model.bbox_embed.named_parameters(): + param.requires_grad = False + + for name, param in model.class_embed.named_parameters(): + param.requires_grad = False + + # for name, param in model.depth_delta.named_parameters(): + # param.requires_grad = False + + # for name, param in model.depth_bin.named_parameters(): + # param.requires_grad = False + + # seed = seed + model + model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) @@ -194,21 +221,21 @@ def main(args): print('loading pretrianed weights.....') checkpoint = torch.load(args.resume, map_location='cpu') # model_without_ddp.load_state_dict(checkpoint['model']) - del checkpoint["model"]["class_embed.weight"] - del checkpoint["model"]["class_embed.bias"] + # del checkpoint["model"]["class_embed.weight"] + # del checkpoint["model"]["class_embed.bias"] # Remove box weights - keys_to_delete = [] - for key in checkpoint["model"]: - if 'box_embed' in key: - print(key) - keys_to_delete.append(key) + # keys_to_delete = [] + # for key in checkpoint["model"]: + # if 'box_embed' in key: + # print(key) + # keys_to_delete.append(key) - for key in keys_to_delete: - del 
checkpoint["model"][key] + # for key in keys_to_delete: + # del checkpoint["model"][key] model_without_ddp.load_state_dict(checkpoint['model'], strict = False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: - optimizer.load_state_dict(checkpoint['optimizer']) + # optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1