From 3dbb29aecf2fedfa497a429d3032d0d72ed74226 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 5 Jul 2022 17:01:16 -0700 Subject: [PATCH 01/22] bev --- datasets/__init__.py | 9 +- datasets/coco.py | 41 ++++++-- datasets/kitti.py | 207 +++++++++++++++++++++++++++++++++++++++++ datasets/transforms.py | 3 +- kitti2coco.py | 71 ++++++++++++++ kittibev | 37 ++++++++ main.py | 39 +++++++- models/detr.py | 36 +++++-- run_KITTI_bev.sh | 8 ++ 9 files changed, 426 insertions(+), 25 deletions(-) create mode 100644 datasets/kitti.py create mode 100644 kitti2coco.py create mode 100644 kittibev create mode 100644 run_KITTI_bev.sh diff --git a/datasets/__init__.py b/datasets/__init__.py index 571b126ea..c49b0861c 100644 --- a/datasets/__init__.py +++ b/datasets/__init__.py @@ -3,7 +3,8 @@ import torchvision from .coco import build as build_coco - +from .coco import build_kitti_coco +from .kitti import build as build_kitti def get_coco_api_from_dataset(dataset): for _ in range(10): @@ -18,6 +19,12 @@ def get_coco_api_from_dataset(dataset): def build_dataset(image_set, args): if args.dataset_file == 'coco': return build_coco(image_set, args) + + if args.dataset_file == 'kitti_coco': + return build_kitti_coco(image_set, args) + + # if args.dataset_file == 'kitti': + # return build_kitti(image_set, args) if args.dataset_file == 'coco_panoptic': # to avoid making panopticapi required for coco from .coco_panoptic import build as build_coco_panoptic diff --git a/datasets/coco.py b/datasets/coco.py index 93a436ba6..bdd39196f 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -5,20 +5,24 @@ Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py """ from pathlib import Path +# from matplotlib import image import torch import torch.utils.data import torchvision from pycocotools import mask as coco_mask +import json import datasets.transforms as T class CocoDetection(torchvision.datasets.CocoDetection): - def __init__(self, img_folder, ann_file, transforms, return_masks): + def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = None): super(CocoDetection, self).__init__(img_folder, ann_file) self._transforms = transforms self.prepare = ConvertCocoPolysToMask(return_masks) + if bev_data is not None: + self.bev_data = json.load(open(bev_data)) def __getitem__(self, idx): img, target = super(CocoDetection, self).__getitem__(idx) @@ -27,6 +31,8 @@ def __getitem__(self, idx): img, target = self.prepare(img, target) if self._transforms is not None: img, target = self._transforms(img, target) + target['bev'] = torch.tensor(self.bev_data[str(image_id)]) + assert target['bev'].size()[0] == target['boxes'].size()[0] return img, target @@ -63,7 +69,12 @@ def __call__(self, image, target): boxes = [obj["bbox"] for obj in anno] # guard against no boxes via resizing - boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + # boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + try: + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1,4) + except: + print(image_id) + print(boxes) boxes[:, 2:] += boxes[:, :2] boxes[:, 0::2].clamp_(min=0, max=w) boxes[:, 1::2].clamp_(min=0, max=h) @@ -123,15 +134,15 @@ def make_coco_transforms(image_set): if image_set == 'train': return T.Compose([ - T.RandomHorizontalFlip(), - T.RandomSelect( + # T.RandomHorizontalFlip(), + # T.RandomSelect( T.RandomResize(scales, max_size=1333), - T.Compose([ - T.RandomResize([400, 500, 600]), - 
T.RandomSizeCrop(384, 600), - T.RandomResize(scales, max_size=1333), - ]) - ), + # T.Compose([ + # T.RandomResize([400, 500, 600]), + # T.RandomSizeCrop(384, 600), + # T.RandomResize(scales, max_size=1333), + # ]) + # ), normalize, ]) @@ -156,3 +167,13 @@ def build(image_set, args): img_folder, ann_file = PATHS[image_set] dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks) return dataset + +def build_kitti_coco(image_set, args): + anno_root = Path("/srip-vol/datasets/KITTI3D/coco") + PATHS = { + "train": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), + "val": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), + } + BEV_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_%s.json"%(image_set) + img_folder, ann_file = PATHS[image_set] + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA) \ No newline at end of file diff --git a/datasets/kitti.py b/datasets/kitti.py new file mode 100644 index 000000000..49af042d0 --- /dev/null +++ b/datasets/kitti.py @@ -0,0 +1,207 @@ +""" +KITTI dataset ckass for DeTR +""" +import os +import os.path as osp +from torch.utils.data import Dataset +import torch +import numpy as np +from PIL import Image +import datasets.transforms as T + +SPLIT = ['train', 'val', 'test'] + +class KITTIDataset(Dataset): + def __init__(self, base_path = '/srip-vol/datasets/KITTI3D', split = 'train', transform = None): + assert split in SPLIT + self.split = split + self.base_path = base_path + self.isTest = self.split == 'test' + self.folder_name = 'testing' if self.split == 'test' else 'training' + + # Read imageset with index + image_set_path = osp.join(self.base_path, 'ImageSets', self.split + '.txt') + lines = open(image_set_path).readlines() + self.image_set = [line.strip() for line in lines] + # Define transform + self._transforms = transform + # Set-up paths + self.image_path = osp.join(base_path, self.folder_name, 'image_2') + if not self.isTest: + self.label_path = osp.join(base_path, self.folder_name, 'label_2') + + self.KITTI_CLASS = ['Car', 'Pedestrian', 'Cyclist'] + self.prepare = ConvertCocoPolysToMask() + + def __len__(self): + return len(self.image_set) + + def __getitem__(self, idx): + ''' + Return a dict with following fields + 'image' - image as a numpy array + 'label' - list of dicts each with label info parsed + ''' + data_idx = self.image_set[idx] + data = {} + # img = np.asarray(Image.open(osp.join(self.image_path, data_idx + '.png'))) + img = Image.open(osp.join(self.image_path, data_idx + '.png')) + if not self.isTest: + label = self.__read_label_data(data_idx) + + bbox_data = [] + class_data = [] + for i in range(len(label)): + bbox_data.append(label[i]['bbox_2d']) + class_data.append(label[i]['class_id']) + + target = {} + target['boxes'] = torch.as_tensor(bbox_data) + target['labels'] = torch.as_tensor(class_data) + target['image_id'] = int(data_idx) + + # Prepare dataset + img, target = self.prepare(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + def __read_label_data(self, idx): + ''' + Function to read label data from text file + ''' + lines = open(osp.join(self.label_path, idx + '.txt')).readlines() + label = [] + for line in lines: + data = line.split(' ') + if data[0] in self.KITTI_CLASS: + label.append(KITTI_label(data[0], + float(data[1]) , 
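# KITTI label columns consumed positionally by KITTI_label below:
# data[0]=type, data[1]=truncated, data[2]=occluded, data[3]=alpha,
# data[4:8]=2D bbox (xmin, ymin, xmax, ymax), data[8:11]=dimensions (h, w, l),
# data[11:14]=location (x, y, z) in camera coords, data[14]=rotation_y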
float(data[2]) , float(data[3]), + float(data[4]) , float(data[5]) , float(data[6]), + float(data[7]) , float(data[8]) , float(data[9]), + float(data[10]), float(data[11]), float(data[12]), + float(data[13]), float(data[14]))) + return label + + +KITTI_CLASS = {'Car': 1, 'Pedestrian': 2, 'Cyclist' : 3} + +def KITTI_label(class_name, truncated, occluded, alpha, + bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax, + dim_h, dim_w, dim_l, x_c, y_c, z_c, rot_y, score = 0): + ''' + To create a label dict + Note - score field added at last with default val 0 + ''' + label_info = {} + label_info['class_id'] = KITTI_CLASS[class_name] + label_info['truncated'] = truncated + label_info['occluded'] = occluded + label_info['alpha'] = alpha + # label_info['bbox_2d'] = [bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax] + label_info['bbox_2d'] = [(bbox_xmax + bbox_xmin)/2, (bbox_ymax + bbox_ymin)/2, bbox_xmax-bbox_xmin, bbox_ymax-bbox_ymin] + label_info['dim'] = [dim_h, dim_w, dim_l] + label_info['loc'] = [x_c, y_c, z_c] + label_info['rot_y'] = rot_y + label_info['score'] = score + + return label_info + +def make_coco_transforms(image_set): + + normalize = T.Compose([ + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ]) + + scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800] + + if image_set == 'train': + return T.Compose([ + T.RandomHorizontalFlip(), + T.RandomSelect( + T.RandomResize(scales, max_size=1333), + T.Compose([ + T.RandomResize([400, 500, 600]), + T.RandomSizeCrop(384, 600), + T.RandomResize(scales, max_size=1333), + ]) + ), + normalize, + ]) + + if image_set == 'val': + return T.Compose([ + T.RandomResize([800], max_size=1333), + normalize, + ]) + + raise ValueError(f'unknown {image_set}') + +class ConvertCocoPolysToMask(object): + # def __init__(self, return_masks=False): + # self.return_masks = return_masks + + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + # anno = target["annotations"] + # anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + + # boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = target['boxes']#torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + # classes = [obj["category_id"] for obj in anno] + classes = target['labels']# torch.tensor(classes, dtype=torch.int64) + + # if self.return_masks: + # segmentations = [obj["segmentation"] for obj in anno] + # masks = convert_coco_poly_to_mask(segmentations, h, w) + + # keypoints = None + # if anno and "keypoints" in anno[0]: + # keypoints = [obj["keypoints"] for obj in anno] + # keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + # num_keypoints = keypoints.shape[0] + # if num_keypoints: + # # keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + # classes = classes[keep] + # if self.return_masks: + # masks = masks[keep] + # if keypoints is not None: + # keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["image_id"] = image_id + # if self.return_masks: + # target["masks"] = masks + # target["image_id"] = image_id + # if keypoints is not None: + # target["keypoints"] = keypoints + + # for conversion to coco api + # area = torch.tensor([obj["area"] for obj in anno]) + # iscrowd = 
torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + # target["area"] = [] #area[keep] + # target["iscrowd"] = [] #iscrowd[keep] + + target["orig_size"] = torch.as_tensor([int(h), int(w)]) + target["size"] = torch.as_tensor([int(h), int(w)]) + + return image, target + +def build(image_set, args): + # base_path = '/srip-vol/datasets/KITTI3D' + dataset = KITTIDataset(base_path = args.kitti_path, split = image_set, transform = make_coco_transforms(image_set)) + return dataset \ No newline at end of file diff --git a/datasets/transforms.py b/datasets/transforms.py index 063585789..b771f4247 100644 --- a/datasets/transforms.py +++ b/datasets/transforms.py @@ -22,7 +22,8 @@ def crop(image, target, region): # should we do something wrt the original size? target["size"] = torch.tensor([h, w]) - fields = ["labels", "area", "iscrowd"] + # fields = ["labels", "area", "iscrowd"] + fields = ["labels"] if "boxes" in target: boxes = target["boxes"] diff --git a/kitti2coco.py b/kitti2coco.py new file mode 100644 index 000000000..ff9ede022 --- /dev/null +++ b/kitti2coco.py @@ -0,0 +1,71 @@ +from sahi.utils.coco import Coco, CocoCategory, CocoImage, CocoAnnotation +from sahi.utils.file import save_json +from PIL import Image +import os.path as osp +from math import ceil +import os +from tqdm import tqdm + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +def decode(label): + ''' + Parse line of kitti label text file + Refer - https://voxel51.com/docs/fiftyone/user_guide/dataset_creation/datasets.html#kittidetectiondataset + ''' + data = label.split(' ') + class_name = data[0] + xmin = float(data[4]) + ymin = float(data[5]) + xmax = float(data[6]) + ymax = float(data[7]) + + # top left cornet and dimensions + # Refer - https://cocodataset.org/#format-data + bbox = [xmin, ymin, ceil(xmax-xmin), ceil(ymax-ymin)] + class_id = KITTI_CLASS.get(class_name, -1) + + return class_id, class_name, bbox + + +# Coco object +coco = Coco() + +# Add categories +coco.add_category(CocoCategory(id=0, name='Car')) +coco.add_category(CocoCategory(id=1, name='Pedastrian')) +coco.add_category(CocoCategory(id=2, name='Cyclist')) + +split = 'val' +assert split in ['train', 'val'] + +# Add paths +imageset_path = osp.join('/srip-vol/datasets/KITTI3D/ImageSets', split + '.txt') +img_folder_path = osp.join('/srip-vol/datasets/KITTI3D/training', 'image_2') +ann_folder_path = osp.join('/srip-vol/datasets/KITTI3D/training', 'label_2') + +idx = open(imageset_path, 'r').readlines() + +for i in tqdm(idx): + i = i[:-1] + img_path = osp.join(img_folder_path, i +'.png') + lab_path = osp.join(ann_folder_path, i +'.txt') + + width, height = Image.open(img_path).size + coco_image = CocoImage(file_name=img_path, height=height, width=width) + + labels = open(lab_path, 'r').readlines() + for l in labels: + category_id, category_name, bbox = decode(l) + if category_id == -1: + continue + coco_image.add_annotation(CocoAnnotation( + bbox=bbox, + category_id=category_id, + category_name=category_name)) + + coco.add_image(coco_image) + +save_path = '/srip-vol/parth/detr/kitti_%s.json'%(split) +# save_path = '/srip-vol/parth/detr/try.json' +save_json(data=coco.json, save_path=save_path) \ No newline at end of file diff --git a/kittibev b/kittibev new file mode 100644 index 000000000..03f908e36 --- /dev/null +++ b/kittibev @@ -0,0 +1,37 @@ +# ----------------------------------------------------------------------------------- +# To generate bev data +# 
----------------------------------------------------------------------------------- +import os +import os.path as osp +import json +# import torch +from tqdm import tqdm + +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bev_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +for i in tqdm(range(len(data['images']))): + # Label path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'label_2' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + label_path = '/' + osp.join(*img_path) + + # Read annotations and assembler point depth value + lines = open(label_path).readlines() + bev = [] + for line in lines: + label_data = line.split(' ') + if(KITTI_CLASS.get(label_data[0],-1) == -1): + continue + x_c = float(label_data[11]) + z_c = float(label_data[13]) + + bev.append([x_c, z_c]) + + # Save in dict + bev_data[i+1] = bev #torch.tensor(depth) diff --git a/main.py b/main.py index e5f9eff80..ced8daf46 100644 --- a/main.py +++ b/main.py @@ -16,21 +16,27 @@ from engine import evaluate, train_one_epoch from models import build_model +import PIL.Image as Image +from torchvision import transforms def get_args_parser(): parser = argparse.ArgumentParser('Set transformer detector', add_help=False) parser.add_argument('--lr', default=1e-4, type=float) - parser.add_argument('--lr_backbone', default=1e-5, type=float) + # parser.add_argument('--lr_backbone', default=1e-5, type=float) + # freezing the backbone + parser.add_argument('--lr_backbone', default=0, type=float) parser.add_argument('--batch_size', default=2, type=int) parser.add_argument('--weight_decay', default=1e-4, type=float) parser.add_argument('--epochs', default=300, type=int) parser.add_argument('--lr_drop', default=200, type=int) parser.add_argument('--clip_max_norm', default=0.1, type=float, help='gradient clipping max norm') - + parser.add_argument('--num_classes', default=3, type=int, help = "max class id. Refer comment at end of detr.py") # Model parameters parser.add_argument('--frozen_weights', type=str, default=None, help="Path to the pretrained model. 
If set, only the mask head will be trained") + # parser.add_argument('--bev_regression', action='store_false', + # help="Add flag to regress bev directly") # * Backbone parser.add_argument('--backbone', default='resnet50', type=str, help="Name of the convolutional backbone to use") @@ -75,26 +81,34 @@ def get_args_parser(): parser.add_argument('--dice_loss_coef', default=1, type=float) parser.add_argument('--bbox_loss_coef', default=5, type=float) parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--bev_loss_coef', default=2, type=float) parser.add_argument('--eos_coef', default=0.1, type=float, help="Relative classification weight of the no-object class") # dataset parameters - parser.add_argument('--dataset_file', default='coco') + # parser.add_argument('--dataset_file', default='coco') + parser.add_argument('--dataset_file', default='kitti_coco') parser.add_argument('--coco_path', type=str) parser.add_argument('--coco_panoptic_path', type=str) + parser.add_argument('--kitti_path', default='/srip-vol/datasets/KITTI3D/', type=str) parser.add_argument('--remove_difficult', action='store_true') - parser.add_argument('--output_dir', default='', + # parser.add_argument('--output_dir', default='', + parser.add_argument('--output_dir', default='output_logs_local', help='path where to save, empty for no saving') parser.add_argument('--device', default='cuda', help='device to use for training / testing') parser.add_argument('--seed', default=42, type=int) - parser.add_argument('--resume', default='', help='resume from checkpoint') + # parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--resume', default='pretrained/detr-r101-dc5-a2e86def.pth', help='resume from checkpoint') parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch') parser.add_argument('--eval', action='store_true') + parser.add_argument('--test', action='store_true') + parser.add_argument('--num_workers', default=2, type=int) + parser.add_argument('--test_image', default = None, type = str, help = 'Path to image for testing') # distributed training parameters parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') @@ -110,6 +124,7 @@ def main(args): assert args.masks, "Frozen training is meant for segmentation only" print(args) + args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device(args.device) # fix the seed for reproducibility @@ -174,8 +189,22 @@ def main(args): checkpoint = torch.hub.load_state_dict_from_url( args.resume, map_location='cpu', check_hash=True) else: + print('loading pretrianed weights.....') checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) + del checkpoint["model"]["class_embed.weight"] + del checkpoint["model"]["class_embed.bias"] + # Remove box weights + keys_to_delete = [] + for key in checkpoint["model"]: + if 'box_embed' in key: + print(key) + keys_to_delete.append(key) + + for key in keys_to_delete: + del checkpoint["model"][key] + + model_without_ddp.load_state_dict(checkpoint['model'], strict = False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) diff --git a/models/detr.py b/models/detr.py index 23c2376da..9011bcc20 100644 --- a/models/detr.py +++ b/models/detr.py @@ -40,6 +40,7 
@@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) self.backbone = backbone self.aux_loss = aux_loss + self.bev_embed = nn.Linear(hidden_dim, 2) def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: @@ -66,18 +67,19 @@ def forward(self, samples: NestedTensor): outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() - out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1]} + outputs_bev = self.bev_embed(hs) + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'pred_bev': outputs_bev[-1]} if self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord) + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_bev) return out @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord): + def _set_aux_loss(self, outputs_class, outputs_coord, outputs_bev): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. - return [{'pred_logits': a, 'pred_boxes': b} - for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + return [{'pred_logits': a, 'pred_boxes': b, 'pred_bev': c} + for a, b, c in zip(outputs_class[:-1], outputs_coord[:-1], outputs_bev[:-1])] class SetCriterion(nn.Module): @@ -190,6 +192,15 @@ def loss_masks(self, outputs, targets, indices, num_boxes): } return losses + def loss_bev(self, outputs, targets, indices, num_boxes): + assert 'pred_bev' in outputs + idx = self._get_src_permutation_idx(indices) + src_bev = outputs['pred_bev'][idx].squeeze() + target_bev = torch.cat([t['bev'][i] for t, (_, i) in zip(targets, indices)]) + loss = F.mse_loss(src_bev, target_bev) + losses = {'loss_bev' : loss} + return losses + def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) @@ -203,11 +214,18 @@ def _get_tgt_permutation_idx(self, indices): return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): + # loss_map = { + # 'labels': self.loss_labels, + # 'cardinality': self.loss_cardinality, + # 'boxes': self.loss_boxes, + # 'masks': self.loss_masks + # } loss_map = { 'labels': self.loss_labels, 'cardinality': self.loss_cardinality, 'boxes': self.loss_boxes, - 'masks': self.loss_masks + 'masks': self.loss_masks, + 'bev': self.loss_bev } assert loss in loss_map, f'do you really want to compute {loss} loss?' return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) @@ -310,7 +328,8 @@ def build(args): # you should pass `num_classes` to be 2 (max_obj_id + 1). 
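# Here --num_classes holds the max KITTI class id (Car/Pedestrian/Cyclist mapped to 1/2/3, default 3),
# so the non-COCO branch below passes max_obj_id + 1 = args.num_classes + 1.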
# For more details on this, check the following discussion # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 - num_classes = 20 if args.dataset_file != 'coco' else 91 + # num_classes = 20 if args.dataset_file != 'coco' else 91 + num_classes = arg.num_classes+1 if args.dataset_file != 'coco' else 91 if args.dataset_file == "coco_panoptic": # for panoptic, we just add a num_classes that is large enough to hold # max_obj_id + 1, but the exact value doesn't really matter @@ -333,6 +352,7 @@ def build(args): matcher = build_matcher(args) weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} weight_dict['loss_giou'] = args.giou_loss_coef + weight_dict['loss_bev'] = args.bev_loss_coef if args.masks: weight_dict["loss_mask"] = args.mask_loss_coef weight_dict["loss_dice"] = args.dice_loss_coef @@ -343,7 +363,7 @@ def build(args): aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) - losses = ['labels', 'boxes', 'cardinality'] + losses = ['labels', 'boxes', 'cardinality', 'bev'] if args.masks: losses += ["masks"] criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, diff --git a/run_KITTI_bev.sh b/run_KITTI_bev.sh new file mode 100644 index 000000000..cbcf59560 --- /dev/null +++ b/run_KITTI_bev.sh @@ -0,0 +1,8 @@ +echo $(date) +conda config --append envs_dirs /srip-vol/parth/myenvs +cd /srip-vol/yuze/detr/ +python=/srip-vol/parth/myenvs/detr/bin/python + +$python main.py + +echo "done" \ No newline at end of file From 4cd93ede0b98422bbb0f3dfbc967113e2de50a6f Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 5 Jul 2022 21:54:33 -0700 Subject: [PATCH 02/22] correct kittibev.py --- kittibev => kittibev.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kittibev => kittibev.py (100%) diff --git a/kittibev b/kittibev.py similarity index 100% rename from kittibev rename to kittibev.py From 8fa2460fd0189187fcc4e52d52be4f16a98e6469 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 7 Jul 2022 17:46:37 -0700 Subject: [PATCH 03/22] pretrained model loading problem fixed --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index ced8daf46..d61f229a2 100644 --- a/main.py +++ b/main.py @@ -191,7 +191,7 @@ def main(args): else: print('loading pretrianed weights.....') checkpoint = torch.load(args.resume, map_location='cpu') - model_without_ddp.load_state_dict(checkpoint['model']) + # model_without_ddp.load_state_dict(checkpoint['model']) del checkpoint["model"]["class_embed.weight"] del checkpoint["model"]["class_embed.bias"] # Remove box weights From 182e8714c629e2ef9aa4b26f3bc0245b8705ccbc Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 7 Jul 2022 18:25:47 -0700 Subject: [PATCH 04/22] json file for bev --- kittibev.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kittibev.py b/kittibev.py index 03f908e36..c59d96466 100644 --- a/kittibev.py +++ b/kittibev.py @@ -35,3 +35,8 @@ # Save in dict bev_data[i+1] = bev #torch.tensor(depth) + +# Save bev data as json file +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(bev_data, outfile) From a8dab26a752742c3ad2f3c9844f62ca18429cbcf Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 11 Jul 2022 18:21:56 -0700 Subject: [PATCH 
05/22] json file for bev --- kittidim.py | 42 ++++++++++++++++++++++++++++++++++++++++++ kittihbins.py | 42 ++++++++++++++++++++++++++++++++++++++++++ kittihres.py | 0 3 files changed, 84 insertions(+) create mode 100644 kittidim.py create mode 100644 kittihbins.py create mode 100644 kittihres.py diff --git a/kittidim.py b/kittidim.py new file mode 100644 index 000000000..006e44494 --- /dev/null +++ b/kittidim.py @@ -0,0 +1,42 @@ +# ----------------------------------------------------------------------------------- +# To generate bev dimension data +# ----------------------------------------------------------------------------------- +import os +import os.path as osp +import json +# import torch +from tqdm import tqdm + +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bevdim_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +for i in tqdm(range(len(data['images']))): + # Label path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'label_2' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + label_path = '/' + osp.join(*img_path) + + # Read annotations and assembler point depth value + lines = open(label_path).readlines() + bevdim = [] + for line in lines: + label_data = line.split(' ') + if(KITTI_CLASS.get(label_data[0],-1) == -1): + continue + dim_w = float(label_data[9]) + dim_l = float(label_data[10]) + + bevdim.append([dim_w, dim_l]) + + # Save in dict + bevdim_data[i+1] = bevdim #torch.tensor(depth) + +# Save bev data as json file +output_path = '/srip-vol/datasets/KITTI3D/coco/bevdim_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(bevdim_data, outfile) diff --git a/kittihbins.py b/kittihbins.py new file mode 100644 index 000000000..b70cd38eb --- /dev/null +++ b/kittihbins.py @@ -0,0 +1,42 @@ +# ----------------------------------------------------------------------------------- +# To generate bev +# ----------------------------------------------------------------------------------- +import os +import os.path as osp +import json +# import torch +from tqdm import tqdm + +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bev_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + +for i in tqdm(range(len(data['images']))): + # Label path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'label_2' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + label_path = '/' + osp.join(*img_path) + + # Read annotations and assembler point depth value + lines = open(label_path).readlines() + bev = [] + for line in lines: + label_data = line.split(' ') + if(KITTI_CLASS.get(label_data[0],-1) == -1): + continue + x_c = float(label_data[11]) + z_c = float(label_data[13]) + + bev.append([x_c, z_c]) + + # Save in dict + bev_data[i+1] = bev #torch.tensor(depth) + +# Save bev data as json file +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(bev_data, outfile) diff --git a/kittihres.py b/kittihres.py new file mode 100644 index 000000000..e69de29bb From 3379328e6d4b880583b8c095e468faca04784fec Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 15:46:17 -0700 Subject: [PATCH 06/22] add kitti angle --- kittihbins.py | 83 ++++++++- kittihres.py | 0 util/kitti_utils.py | 410 ++++++++++++++++++++++++++++++++++++++++++++ 
util/utils.py | 123 +++++++++++++ 4 files changed, 608 insertions(+), 8 deletions(-) delete mode 100644 kittihres.py create mode 100644 util/kitti_utils.py create mode 100644 util/utils.py diff --git a/kittihbins.py b/kittihbins.py index b70cd38eb..0d0cd3d9a 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -6,11 +6,34 @@ import json # import torch from tqdm import tqdm +import numpy as np +import cv2 + +def ry2alpha(ry, u, cu, fu): + alpha = ry - np.arctan2(u - cu, fu) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + +def angle2class(angle): + ''' Convert continuous angle to discrete class and residual. ''' + angle = angle % (2 * np.pi) + assert (angle >= 0 and angle <= 2 * np.pi) + angle_per_class = 2 * np.pi / float(12) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle split = 'val' data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) data = json.load(open(data_path)) -bev_data = {} +heading_bins_data = {} +heading_ress_data = {} KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} @@ -20,23 +43,67 @@ img_path[-2] = 'label_2' img_path[-1] = img_path[-1].split('.')[0] + '.txt' label_path = '/' + osp.join(*img_path) + + # Calibration path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'calib' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + calib_path = '/' + osp.join(*img_path) # Read annotations and assembler point depth value lines = open(label_path).readlines() - bev = [] + lines_calib = open(calib_path).readlines() + + obj = lines_calib[2].strip().split(' ')[1:] + P2 = np.array(obj, dtype=np.float32) + # obj = lines_calib[3].strip().split(' ')[1:] + # P3 = np.array(obj, dtype=np.float32) + # obj = lines_calib[4].strip().split(' ')[1:] + # R0 = np.array(obj, dtype=np.float32) + # obj = lines_calib[5].strip().split(' ')[1:] + # Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + P2.reshape(3, 4) + # P3.reshape(3, 4) + # R0.reshape(3, 3) + # Tr_velo_to_cam.reshape(3, 4) + + # cv = P2[1, 2] + # fv = P2[1, 1] + # tx = P2[0, 3] / (-fu) + # ty = P2[1, 3] / (-fv) + cu = P2[0, 2] + fu = P2[0, 0] + + heading_bins = [] + heading_ress = [] for line in lines: label_data = line.split(' ') if(KITTI_CLASS.get(label_data[0],-1) == -1): continue - x_c = float(label_data[11]) - z_c = float(label_data[13]) + bbx0 = float(label_data[4]) + bbx2 = float(label_data[6]) + ry = float(label_data[14]) - bev.append([x_c, z_c]) + heading_angle = ry2alpha(ry, (bbx0 + bbx2) / 2, cu, fu) + if heading_angle > np.pi: heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: heading_angle += 2 * np.pi + heading_bin, heading_res = angle2class(heading_angle) + # x_c = float(label_data[11]) + # z_c = float(label_data[13]) + + heading_bins.append([heading_bin]) + heading_ress.append([heading_res]) # Save in dict - bev_data[i+1] = bev #torch.tensor(depth) + heading_bins_data[i+1] = heading_bins #torch.tensor(depth) + heading_ress_data[i+1] = heading_ress # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/bev_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(heading_bins_data, outfile) + +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json' %(split) with open(output_path, "w") as outfile: - 
json.dump(bev_data, outfile) + json.dump(heading_ress_data, outfile) diff --git a/kittihres.py b/kittihres.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/util/kitti_utils.py b/util/kitti_utils.py new file mode 100644 index 000000000..1200530d4 --- /dev/null +++ b/util/kitti_utils.py @@ -0,0 +1,410 @@ +import numpy as np +import cv2 + +################ Object3D ################## + +def get_objects_from_label(label_file): + with open(label_file, 'r') as f: + lines = f.readlines() + objects = [Object3d(line) for line in lines] + return objects + + +class Object3d(object): + def __init__(self, line): + label = line.strip().split(' ') + self.src = line + self.cls_type = label[0] + self.trucation = float(label[1]) + self.occlusion = float(label[2]) # 0:fully visible 1:partly occluded 2:largely occluded 3:unknown + self.alpha = float(label[3]) + self.box2d = np.array((float(label[4]), float(label[5]), float(label[6]), float(label[7])), dtype=np.float32) + self.h = float(label[8]) + self.w = float(label[9]) + self.l = float(label[10]) + self.pos = np.array((float(label[11]), float(label[12]), float(label[13])), dtype=np.float32) + self.dis_to_cam = np.linalg.norm(self.pos) + self.ry = float(label[14]) + self.score = float(label[15]) if label.__len__() == 16 else -1.0 + self.level_str = None + self.level = self.get_obj_level() + + + def get_obj_level(self): + height = float(self.box2d[3]) - float(self.box2d[1]) + 1 + + if self.trucation == -1: + self.level_str = 'DontCare' + return 0 + + if height >= 40 and self.trucation <= 0.15 and self.occlusion <= 0: + self.level_str = 'Easy' + return 1 # Easy + elif height >= 25 and self.trucation <= 0.3 and self.occlusion <= 1: + self.level_str = 'Moderate' + return 2 # Moderate + elif height >= 25 and self.trucation <= 0.5 and self.occlusion <= 2: + self.level_str = 'Hard' + return 3 # Hard + else: + self.level_str = 'UnKnown' + return 4 + + + def generate_corners3d(self): + """ + generate corners3d representation for this object + :return corners_3d: (8, 3) corners of box3d in camera coord + """ + l, h, w = self.l, self.h, self.w + x_corners = [l / 2, l / 2, -l / 2, -l / 2, l / 2, l / 2, -l / 2, -l / 2] + y_corners = [0, 0, 0, 0, -h, -h, -h, -h] + z_corners = [w / 2, -w / 2, -w / 2, w / 2, w / 2, -w / 2, -w / 2, w / 2] + + R = np.array([[np.cos(self.ry), 0, np.sin(self.ry)], + [0, 1, 0], + [-np.sin(self.ry), 0, np.cos(self.ry)]]) + corners3d = np.vstack([x_corners, y_corners, z_corners]) # (3, 8) + corners3d = np.dot(R, corners3d).T + corners3d = corners3d + self.pos + return corners3d + + + def to_bev_box2d(self, oblique=True, voxel_size=0.1): + """ + :param bev_shape: (2) for bev shape (h, w), => (y_max, x_max) in image + :param voxel_size: float, 0.1m + :param oblique: + :return: box2d (4, 2)/ (4) in image coordinate + """ + if oblique: + corners3d = self.generate_corners3d() + xz_corners = corners3d[0:4, [0, 2]] + box2d = np.zeros((4, 2), dtype=np.int32) + box2d[:, 0] = ((xz_corners[:, 0] - Object3d.MIN_XZ[0]) / voxel_size).astype(np.int32) + box2d[:, 1] = Object3d.BEV_SHAPE[0] - 1 - ((xz_corners[:, 1] - Object3d.MIN_XZ[1]) / voxel_size).astype(np.int32) + box2d[:, 0] = np.clip(box2d[:, 0], 0, Object3d.BEV_SHAPE[1]) + box2d[:, 1] = np.clip(box2d[:, 1], 0, Object3d.BEV_SHAPE[0]) + else: + box2d = np.zeros(4, dtype=np.int32) + # discrete_center = np.floor((self.pos / voxel_size)).astype(np.int32) + cu = np.floor((self.pos[0] - Object3d.MIN_XZ[0]) / voxel_size).astype(np.int32) + cv = Object3d.BEV_SHAPE[0] - 1 - ((self.pos[2] - 
Object3d.MIN_XZ[1]) / voxel_size).astype(np.int32) + half_l, half_w = int(self.l / voxel_size / 2), int(self.w / voxel_size / 2) + box2d[0], box2d[1] = cu - half_l, cv - half_w + box2d[2], box2d[3] = cu + half_l, cv + half_w + + return box2d + + + def to_str(self): + print_str = '%s %.3f %.3f %.3f box2d: %s hwl: [%.3f %.3f %.3f] pos: %s ry: %.3f' \ + % (self.cls_type, self.trucation, self.occlusion, self.alpha, self.box2d, self.h, self.w, self.l, + self.pos, self.ry) + return print_str + + + def to_kitti_format(self): + kitti_str = '%s %.2f %d %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f %.2f' \ + % (self.cls_type, self.trucation, int(self.occlusion), self.alpha, self.box2d[0], self.box2d[1], + self.box2d[2], self.box2d[3], self.h, self.w, self.l, self.pos[0], self.pos[1], self.pos[2], + self.ry) + return kitti_str + + + +################### calibration ################### + +def get_calib_from_file(calib_file): + with open(calib_file) as f: + lines = f.readlines() + + obj = lines[2].strip().split(' ')[1:] + P2 = np.array(obj, dtype=np.float32) + obj = lines[3].strip().split(' ')[1:] + P3 = np.array(obj, dtype=np.float32) + obj = lines[4].strip().split(' ')[1:] + R0 = np.array(obj, dtype=np.float32) + obj = lines[5].strip().split(' ')[1:] + Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + return {'P2': P2.reshape(3, 4), + 'P3': P3.reshape(3, 4), + 'R0': R0.reshape(3, 3), + 'Tr_velo2cam': Tr_velo_to_cam.reshape(3, 4)} + + +class Calibration(object): + def __init__(self, calib_file): + if isinstance(calib_file, str): + calib = get_calib_from_file(calib_file) + else: + calib = calib_file + + self.P2 = calib['P2'] # 3 x 4 + self.R0 = calib['R0'] # 3 x 3 + self.V2C = calib['Tr_velo2cam'] # 3 x 4 + self.C2V = self.inverse_rigid_trans(self.V2C) + + # Camera intrinsics and extrinsics + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + + def cart_to_hom(self, pts): + """ + :param pts: (N, 3 or 2) + :return pts_hom: (N, 4 or 3) + """ + pts_hom = np.hstack((pts, np.ones((pts.shape[0], 1), dtype=np.float32))) + return pts_hom + + def lidar_to_rect(self, pts_lidar): + """ + :param pts_lidar: (N, 3) + :return pts_rect: (N, 3) + """ + pts_lidar_hom = self.cart_to_hom(pts_lidar) + pts_rect = np.dot(pts_lidar_hom, np.dot(self.V2C.T, self.R0.T)) + # pts_rect = reduce(np.dot, (pts_lidar_hom, self.V2C.T, self.R0.T)) + return pts_rect + + def rect_to_lidar(self, pts_rect): + pts_ref = np.transpose(np.dot(np.linalg.inv(self.R0), np.transpose(pts_rect))) + pts_ref = self.cart_to_hom(pts_ref) # nx4 + return np.dot(pts_ref, np.transpose(self.C2V)) + + def rect_to_img(self, pts_rect): + """ + :param pts_rect: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect_hom = self.cart_to_hom(pts_rect) + pts_2d_hom = np.dot(pts_rect_hom, self.P2.T) + pts_img = (pts_2d_hom[:, 0:2].T / pts_rect_hom[:, 2]).T # (N, 2) + pts_rect_depth = pts_2d_hom[:, 2] - self.P2.T[3, 2] # depth in rect camera coord + return pts_img, pts_rect_depth + + def lidar_to_img(self, pts_lidar): + """ + :param pts_lidar: (N, 3) + :return pts_img: (N, 2) + """ + pts_rect = self.lidar_to_rect(pts_lidar) + pts_img, pts_depth = self.rect_to_img(pts_rect) + return pts_img, pts_depth + + def img_to_rect(self, u, v, depth_rect): + """ + :param u: (N) + :param v: (N) + :param depth_rect: (N) + :return: + """ + x = ((u - self.cu) * depth_rect) / self.fu + self.tx + y = ((v - self.cv) * depth_rect) / self.fv + self.ty + 
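# stack the recovered x, y and the given depth into an (N, 3) array of points in the rectified camera frame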
pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), depth_rect.reshape(-1, 1)), axis=1) + return pts_rect + + def depthmap_to_rect(self, depth_map): + """ + :param depth_map: (H, W), depth_map + :return: + """ + x_range = np.arange(0, depth_map.shape[1]) + y_range = np.arange(0, depth_map.shape[0]) + x_idxs, y_idxs = np.meshgrid(x_range, y_range) + x_idxs, y_idxs = x_idxs.reshape(-1), y_idxs.reshape(-1) + depth = depth_map[y_idxs, x_idxs] + pts_rect = self.img_to_rect(x_idxs, y_idxs, depth) + return pts_rect, x_idxs, y_idxs + + def corners3d_to_img_boxes(self, corners3d): + """ + :param corners3d: (N, 8, 3) corners in rect coordinate + :return: boxes: (None, 4) [x1, y1, x2, y2] in rgb coordinate + :return: boxes_corner: (None, 8) [xi, yi] in rgb coordinate + """ + sample_num = corners3d.shape[0] + corners3d_hom = np.concatenate((corners3d, np.ones((sample_num, 8, 1))), axis=2) # (N, 8, 4) + + img_pts = np.matmul(corners3d_hom, self.P2.T) # (N, 8, 3) + + x, y = img_pts[:, :, 0] / img_pts[:, :, 2], img_pts[:, :, 1] / img_pts[:, :, 2] + x1, y1 = np.min(x, axis=1), np.min(y, axis=1) + x2, y2 = np.max(x, axis=1), np.max(y, axis=1) + + boxes = np.concatenate((x1.reshape(-1, 1), y1.reshape(-1, 1), x2.reshape(-1, 1), y2.reshape(-1, 1)), axis=1) + boxes_corner = np.concatenate((x.reshape(-1, 8, 1), y.reshape(-1, 8, 1)), axis=2) + + return boxes, boxes_corner + + def camera_dis_to_rect(self, u, v, d): + """ + Can only process valid u, v, d, which means u, v can not beyond the image shape, reprojection error 0.02 + :param u: (N) + :param v: (N) + :param d: (N), the distance between camera and 3d points, d^2 = x^2 + y^2 + z^2 + :return: + """ + assert self.fu == self.fv, '%.8f != %.8f' % (self.fu, self.fv) + fd = np.sqrt((u - self.cu) ** 2 + (v - self.cv) ** 2 + self.fu ** 2) + x = ((u - self.cu) * d) / fd + self.tx + y = ((v - self.cv) * d) / fd + self.ty + z = np.sqrt(d ** 2 - x ** 2 - y ** 2) + pts_rect = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1), z.reshape(-1, 1)), axis=1) + return pts_rect + + def inverse_rigid_trans(self, Tr): + ''' Inverse a rigid body transform matrix (3x4 as [R|t]) + [R'|-R't; 0|1] + ''' + inv_Tr = np.zeros_like(Tr) # 3x4 + inv_Tr[0:3, 0:3] = np.transpose(Tr[0:3, 0:3]) + inv_Tr[0:3, 3] = np.dot(-np.transpose(Tr[0:3, 0:3]), Tr[0:3, 3]) + return inv_Tr + + def alpha2ry(self, alpha, u): + """ + Get rotation_y by alpha + theta - 180 + alpha : Observation angle of object, ranging [-pi..pi] + x : Object center x to the camera center (x-W/2), in pixels + rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi] + """ + ry = alpha + np.arctan2(u - self.cu, self.fu) + + if ry > np.pi: + ry -= 2 * np.pi + if ry < -np.pi: + ry += 2 * np.pi + + return ry + + def ry2alpha(self, ry, u): + alpha = ry - np.arctan2(u - self.cu, self.fu) + + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha + + def flip(self,img_size): + wsize = 4 + hsize = 2 + p2ds = (np.concatenate([np.expand_dims(np.tile(np.expand_dims(np.linspace(0,img_size[0],wsize),0),[hsize,1]),-1),\ + np.expand_dims(np.tile(np.expand_dims(np.linspace(0,img_size[1],hsize),1),[1,wsize]),-1), + np.linspace(2,78,wsize*hsize).reshape(hsize,wsize,1)],-1)).reshape(-1,3) + p3ds = self.img_to_rect(p2ds[:,0:1],p2ds[:,1:2],p2ds[:,2:3]) + p3ds[:,0]*=-1 + p2ds[:,0] = img_size[0] - p2ds[:,0] + + #self.P2[0,3] *= -1 + cos_matrix = np.zeros([wsize*hsize,2,7]) + cos_matrix[:,0,0] = p3ds[:,0] + cos_matrix[:,0,1] = cos_matrix[:,1,2] = p3ds[:,2] + cos_matrix[:,1,0] = 
p3ds[:,1] + cos_matrix[:,0,3] = cos_matrix[:,1,4] = 1 + cos_matrix[:,:,-2] = -p2ds[:,:2] + cos_matrix[:,:,-1] = (-p2ds[:,:2]*p3ds[:,2:3]) + new_calib = np.linalg.svd(cos_matrix.reshape(-1,7))[-1][-1] + new_calib /= new_calib[-1] + + new_calib_matrix = np.zeros([4,3]).astype(np.float32) + new_calib_matrix[0,0] = new_calib_matrix[1,1] = new_calib[0] + new_calib_matrix[2,0:2] = new_calib[1:3] + new_calib_matrix[3,:] = new_calib[3:6] + new_calib_matrix[-1,-1] = self.P2[-1,-1] + self.P2 = new_calib_matrix.T + self.cu = self.P2[0, 2] + self.cv = self.P2[1, 2] + self.fu = self.P2[0, 0] + self.fv = self.P2[1, 1] + self.tx = self.P2[0, 3] / (-self.fu) + self.ty = self.P2[1, 3] / (-self.fv) + +################### affine trainsform ################### + +def get_dir(src_point, rot_rad): + sn, cs = np.sin(rot_rad), np.cos(rot_rad) + + src_result = [0, 0] + src_result[0] = src_point[0] * cs - src_point[1] * sn + src_result[1] = src_point[0] * sn + src_point[1] * cs + + return src_result + + +def get_3rd_point(a, b): + direct = a - b + return b + np.array([-direct[1], direct[0]], dtype=np.float32) + + +def get_affine_transform(center, + scale, + rot, + output_size, + shift=np.array([0, 0], dtype=np.float32), + inv=0): + if not isinstance(scale, np.ndarray) and not isinstance(scale, list): + scale = np.array([scale, scale], dtype=np.float32) + + scale_tmp = scale + src_w = scale_tmp[0] + dst_w = output_size[0] + dst_h = output_size[1] + + rot_rad = np.pi * rot / 180 + src_dir = get_dir([0, src_w * -0.5], rot_rad) + dst_dir = np.array([0, dst_w * -0.5], np.float32) + + src = np.zeros((3, 2), dtype=np.float32) + dst = np.zeros((3, 2), dtype=np.float32) + src[0, :] = center + scale_tmp * shift + src[1, :] = center + src_dir + scale_tmp * shift + dst[0, :] = [dst_w * 0.5, dst_h * 0.5] + dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir + + src[2:, :] = get_3rd_point(src[0, :], src[1, :]) + dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) + + if inv: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + trans_inv = cv2.getAffineTransform(np.float32(dst), np.float32(src)) + return trans, trans_inv + else: + trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) + return trans + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] + + +if __name__ == '__main__': + from lib.datasets.kitti.kitti_dataset import KITTI_Dataset + cfg = {'root_dir': '../../../data'} + dataset = KITTI_Dataset('train', cfg) + + # calib testing + # we project center fo 3D objects to image plane + index = 1 + calib = dataset.get_calib(index) + objects = dataset.get_label(index) + for object in objects: + print(object.to_kitti_format()) + object.pos[0] *= 1 + center_3d = object.pos + [0, -object.h/2, 0] # real 3D center + center_3d = center_3d.reshape(-1, 3) #(N, 3) + center_3d_projected, depth = calib.rect_to_img(center_3d) + box2d = object.box2d + center_2d = [(box2d[0]+box2d[2])/2, (box2d[1]+box2d[3])/2] + print ('3D center/2D center/projected 3D center:', center_3d, center_2d, center_3d_projected) + print('alpha ---> ry ', object.alpha, calib.alpha2ry(object.alpha, center_2d[0])) + break \ No newline at end of file diff --git a/util/utils.py b/util/utils.py new file mode 100644 index 000000000..c59a91f50 --- /dev/null +++ b/util/utils.py @@ -0,0 +1,123 @@ +''' some auxiliary functions for all datasets ''' +import numpy as np +import cv2 + + +num_heading_bin = 12 # hyper param + +def angle2class(angle): + 
''' Convert continuous angle to discrete class and residual. ''' + angle = angle % (2 * np.pi) + assert (angle >= 0 and angle <= 2 * np.pi) + angle_per_class = 2 * np.pi / float(num_heading_bin) + shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) + class_id = int(shifted_angle / angle_per_class) + residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) + return class_id, residual_angle + + +def class2angle(cls, residual, to_label_format=False): + ''' Inverse function to angle2class. ''' + angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_center = cls * angle_per_class + angle = angle_center + residual + if to_label_format and angle > np.pi: + angle = angle - 2 * np.pi + return angle + + +def gaussian_radius(bbox_size, min_overlap=0.7): + height, width = bbox_size + + a1 = 1 + b1 = (height + width) + c1 = width * height * (1 - min_overlap) / (1 + min_overlap) + sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1) + r1 = (b1 + sq1) / 2 + + a2 = 4 + b2 = 2 * (height + width) + c2 = (1 - min_overlap) * width * height + sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2) + r2 = (b2 + sq2) / 2 + + a3 = 4 * min_overlap + b3 = -2 * min_overlap * (height + width) + c3 = (min_overlap - 1) * width * height + sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3) + r3 = (b3 + sq3) / 2 + return min(r1, r2, r3) + + +def gaussian2D(shape, sigma=1): + m, n = [(ss - 1.) / 2. for ss in shape] + y, x = np.ogrid[-m:m+1,-n:n+1] + + h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) + h[h < np.finfo(h.dtype).eps * h.max()] = 0 + return h + + +def draw_umich_gaussian(heatmap, center, radius, k=1): + diameter = 2 * radius + 1 + gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) + x, y = int(center[0]), int(center[1]) + height, width = heatmap.shape[0:2] + + left, right = min(x, radius), min(width - x, radius + 1) + top, bottom = min(y, radius), min(height - y, radius + 1) + + masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] + masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right] + if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug + np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) + return heatmap + + +def draw_msra_gaussian(heatmap, center, sigma): + tmp_size = sigma * 3 + mu_x = int(center[0] + 0.5) + mu_y = int(center[1] + 0.5) + w, h = heatmap.shape[0], heatmap.shape[1] + ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] + br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] + if ul[0] >= h or ul[1] >= w or br[0] < 0 or br[1] < 0: + return heatmap + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) + g_x = max(0, -ul[0]), min(br[0], h) - ul[0] + g_y = max(0, -ul[1]), min(br[1], w) - ul[1] + img_x = max(0, ul[0]), min(br[0], h) + img_y = max(0, ul[1]), min(br[1], w) + heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum( + heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]], + g[g_y[0]:g_y[1], g_x[0]:g_x[1]]) + return heatmap + + +def draw_projected_box3d(image, corners3d, color=(255, 255, 255), thickness=1): + ''' Draw 3d bounding box in image + input: + image: RGB image + corners3d: (8,3) array of vertices (in image plane) for the 3d box in following order: + 1 -------- 0 + /| /| + 2 -------- 3 . + | | | | + . 
5 -------- 4 + |/ |/ + 6 -------- 7 + ''' + + corners3d = corners3d.astype(np.int32) + for k in range(0, 4): + i, j = k, (k + 1) % 4 + cv2.line(image, (corners3d[i, 0], corners3d[i, 1]), (corners3d[j, 0], corners3d[j, 1]), color, thickness, lineType=cv2.LINE_AA) + i, j = k + 4, (k + 1) % 4 + 4 + cv2.line(image, (corners3d[i, 0], corners3d[i, 1]), (corners3d[j, 0], corners3d[j, 1]), color, thickness, lineType=cv2.LINE_AA) + i, j = k, k + 4 + cv2.line(image, (corners3d[i, 0], corners3d[i, 1]), (corners3d[j, 0], corners3d[j, 1]), color, thickness, lineType=cv2.LINE_AA) + return image \ No newline at end of file From 5335caa299ae84c430291414eabfa89a30a6ded3 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 16:48:46 -0700 Subject: [PATCH 07/22] dimension and angle --- datasets/coco.py | 22 ++++++++++++-- kittidim.py | 2 +- kittihbins.py | 4 +-- main.py | 2 ++ models/detr.py | 75 ++++++++++++++++++++++++++++++++++++++++++------ 5 files changed, 92 insertions(+), 13 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index bdd39196f..18b486ccd 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -17,12 +17,18 @@ class CocoDetection(torchvision.datasets.CocoDetection): - def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = None): + def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = None, dim_data = None, heading_bin_data = None, heading_res_data = None): super(CocoDetection, self).__init__(img_folder, ann_file) self._transforms = transforms self.prepare = ConvertCocoPolysToMask(return_masks) if bev_data is not None: self.bev_data = json.load(open(bev_data)) + if dim_data is not None: + self.dim_data = json.load(open(dim_data)) + if heading_bin_data is not None: + self.heading_bin_data = json.load(open(heading_bin_data)) + if heading_res_data is not None: + self.heading_res_data = json.load(open(heading_res_data)) def __getitem__(self, idx): img, target = super(CocoDetection, self).__getitem__(idx) @@ -31,8 +37,17 @@ def __getitem__(self, idx): img, target = self.prepare(img, target) if self._transforms is not None: img, target = self._transforms(img, target) + # load the bev data target['bev'] = torch.tensor(self.bev_data[str(image_id)]) assert target['bev'].size()[0] == target['boxes'].size()[0] + # load the dimension data + target['dim'] = torch.tensor(self.dim_data[str(image_id)]) + assert target['dim'].size()[0] == target['boxes'].size()[0] + # load the angle data + target['heading_bin'] = torch.tensor(self.heading_bin_data[str(image_id)]) + assert target['heading_bin'].size()[0] == target['boxes'].size()[0] + target['heading_res'] = torch.tensor(self.heading_res_data[str(image_id)]) + assert target['heading_res'].size()[0] == target['boxes'].size()[0] return img, target @@ -175,5 +190,8 @@ def build_kitti_coco(image_set, args): "val": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), } BEV_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_%s.json"%(image_set) + DIM_DATA = "/srip-vol/datasets/KITTI3D/coco/dim_%s.json"%(image_set) + HEADING_BIN_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json"%(image_set) + HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] - dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA) \ No newline at end of file + dataset = 
CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) \ No newline at end of file diff --git a/kittidim.py b/kittidim.py index 006e44494..91def6cf3 100644 --- a/kittidim.py +++ b/kittidim.py @@ -37,6 +37,6 @@ bevdim_data[i+1] = bevdim #torch.tensor(depth) # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/bevdim_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_dim_%s.json' %(split) with open(output_path, "w") as outfile: json.dump(bevdim_data, outfile) diff --git a/kittihbins.py b/kittihbins.py index 0d0cd3d9a..8766091e6 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -92,8 +92,8 @@ def angle2class(angle): # x_c = float(label_data[11]) # z_c = float(label_data[13]) - heading_bins.append([heading_bin]) - heading_ress.append([heading_res]) + heading_bins.append(heading_bin) + heading_ress.append(heading_res) # Save in dict heading_bins_data[i+1] = heading_bins #torch.tensor(depth) diff --git a/main.py b/main.py index d61f229a2..317ffb485 100644 --- a/main.py +++ b/main.py @@ -82,6 +82,8 @@ def get_args_parser(): parser.add_argument('--bbox_loss_coef', default=5, type=float) parser.add_argument('--giou_loss_coef', default=2, type=float) parser.add_argument('--bev_loss_coef', default=2, type=float) + parser.add_argument('--dim_loss_coef', default=2, type=float) + parser.add_argument('--angle_loss_coef', default=1, type=float) parser.add_argument('--eos_coef', default=0.1, type=float, help="Relative classification weight of the no-object class") diff --git a/models/detr.py b/models/detr.py index 9011bcc20..e7360a09f 100644 --- a/models/detr.py +++ b/models/detr.py @@ -40,7 +40,9 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.input_proj = nn.Conv2d(backbone.num_channels, hidden_dim, kernel_size=1) self.backbone = backbone self.aux_loss = aux_loss - self.bev_embed = nn.Linear(hidden_dim, 2) + self.bev_embed = MLP(hidden_dim, hidden_dim, 2, 2) + self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2) + self.dim_embed = MLP(hidden_dim, hidden_dim, 2, 2) def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: @@ -68,18 +70,20 @@ def forward(self, samples: NestedTensor): outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() outputs_bev = self.bev_embed(hs) - out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'pred_bev': outputs_bev[-1]} + outputs_dim = self.dim_embed(hs) + outputs_angle = self.angle_embed(hs) + out = {'pred_logits': outputs_class[-1], 'pred_boxes': outputs_coord[-1], 'pred_bev': outputs_bev[-1], 'pred_dim': outputs_dim[-1], 'pred_angle': outputs_angle[-1]} if self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_bev) + out['aux_outputs'] = self._set_aux_loss(outputs_class, outputs_coord, outputs_bev, outputs_dim, outputs_angle) return out @torch.jit.unused - def _set_aux_loss(self, outputs_class, outputs_coord, outputs_bev): + def _set_aux_loss(self, outputs_class, outputs_coord, outputs_bev, outputs_dim, outputs_angle): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. 
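# one dict per intermediate decoder layer, so the auxiliary losses also supervise the new bev/dim/angle heads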
- return [{'pred_logits': a, 'pred_boxes': b, 'pred_bev': c} - for a, b, c in zip(outputs_class[:-1], outputs_coord[:-1], outputs_bev[:-1])] + return [{'pred_logits': a, 'pred_boxes': b, 'pred_bev': c, 'pred_dim': d, 'pred_angle': e} + for a, b, c, d, e in zip(outputs_class[:-1], outputs_coord[:-1], outputs_bev[:-1], outputs_dim[:-1], outputs_angle[:-1])] class SetCriterion(nn.Module): @@ -201,6 +205,57 @@ def loss_bev(self, outputs, targets, indices, num_boxes): losses = {'loss_bev' : loss} return losses + def loss_dims(self, outputs, targets, indices, num_boxes): + assert 'pred_dim' in outputs + # idx = self._get_src_permutation_idx(indices) + # src_dim = outputs['pred_dim'][idx].squeeze() + # target_dim = torch.cat([t['dim'][i] for t, (_, i) in zip(targets, indices)]) + # loss = F.mse_loss(src_dim, target_dim) + # losses = {'loss_bev' : loss} + + idx = self._get_src_permutation_idx(indices) + src_dims = outputs['pred_dim'][idx] + target_dims = torch.cat([t['dim'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + dimension = target_dims.clone().detach() + dim_loss = torch.abs(src_dims - target_dims) + dim_loss /= dimension + with torch.no_grad(): + compensation_weight = F.l1_loss(src_dims, target_dims) / dim_loss.mean() + dim_loss *= compensation_weight + losses = {} + losses['loss_dim'] = dim_loss.sum() / num_boxes + + return losses + + def loss_angles(self, outputs, targets, indices, num_boxes): + + idx = self._get_src_permutation_idx(indices) + heading_input = outputs['pred_angle'][idx] + target_heading_cls = torch.cat([t['heading_bin'][i] for t, (_, i) in zip(targets, indices)], dim=0) + target_heading_res = torch.cat([t['heading_res'][i] for t, (_, i) in zip(targets, indices)], dim=0) + + heading_input = heading_input.view(-1, 24) + heading_target_cls = target_heading_cls.view(-1).long() + heading_target_res = target_heading_res.view(-1) + + # classification loss + heading_input_cls = heading_input[:, 0:12] + cls_loss = F.cross_entropy(heading_input_cls, heading_target_cls, reduction='none') + + # regression loss + heading_input_res = heading_input[:, 12:24] + cls_onehot = torch.zeros(heading_target_cls.shape[0], 12).cuda().scatter_(dim=1, index=heading_target_cls.view(-1, 1), value=1) + heading_input_res = torch.sum(heading_input_res * cls_onehot, 1) + reg_loss = F.l1_loss(heading_input_res, heading_target_res, reduction='none') + + angle_loss = cls_loss + reg_loss + losses = {} + losses['loss_angle'] = angle_loss.sum() / num_boxes + return losses + + + def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) @@ -225,7 +280,9 @@ def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): 'cardinality': self.loss_cardinality, 'boxes': self.loss_boxes, 'masks': self.loss_masks, - 'bev': self.loss_bev + 'bev': self.loss_bev, + 'dim': self.loss_dims, + 'angle': self.loss_angles } assert loss in loss_map, f'do you really want to compute {loss} loss?' 
return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) @@ -353,6 +410,8 @@ def build(args): weight_dict = {'loss_ce': 1, 'loss_bbox': args.bbox_loss_coef} weight_dict['loss_giou'] = args.giou_loss_coef weight_dict['loss_bev'] = args.bev_loss_coef + weight_dict['loss_dim'] = args.dim_loss_coef + weight_dict['loss_angle'] = args.angle_loss_coef if args.masks: weight_dict["loss_mask"] = args.mask_loss_coef weight_dict["loss_dice"] = args.dice_loss_coef @@ -363,7 +422,7 @@ def build(args): aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) - losses = ['labels', 'boxes', 'cardinality', 'bev'] + losses = ['labels', 'boxes', 'cardinality', 'bev', 'dim', 'angle'] if args.masks: losses += ["masks"] criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, From 415e91a5f514fcbeac423a7038bf877aa90dcd6e Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 17:26:09 -0700 Subject: [PATCH 08/22] fix some small error --- kittihbins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kittihbins.py b/kittihbins.py index 8766091e6..7d5dbdf45 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -63,7 +63,7 @@ def angle2class(angle): # obj = lines_calib[5].strip().split(' ')[1:] # Tr_velo_to_cam = np.array(obj, dtype=np.float32) - P2.reshape(3, 4) + P2 = P2.reshape(3, 4) # P3.reshape(3, 4) # R0.reshape(3, 3) # Tr_velo_to_cam.reshape(3, 4) From 1a82b436cc89823fba68c574264bd59517d52e43 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Wed, 13 Jul 2022 22:06:22 -0700 Subject: [PATCH 09/22] fix small error --- datasets/coco.py | 3 ++- models/detr.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index 18b486ccd..d817cde1d 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -194,4 +194,5 @@ def build_kitti_coco(image_set, args): HEADING_BIN_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json"%(image_set) HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] - dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) \ No newline at end of file + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) + return dataset \ No newline at end of file diff --git a/models/detr.py b/models/detr.py index e7360a09f..2250b9356 100644 --- a/models/detr.py +++ b/models/detr.py @@ -386,7 +386,7 @@ def build(args): # For more details on this, check the following discussion # https://github.com/facebookresearch/detr/issues/108#issuecomment-650269223 # num_classes = 20 if args.dataset_file != 'coco' else 91 - num_classes = arg.num_classes+1 if args.dataset_file != 'coco' else 91 + num_classes = args.num_classes+1 if args.dataset_file != 'coco' else 91 if args.dataset_file == "coco_panoptic": # for panoptic, we just add a num_classes that is large enough to hold # max_obj_id + 1, but the exact value doesn't really matter From dcb579a58dc2535cef3ae861958e3f276311f85c Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 14 Jul 2022 21:18:44 
-0700 Subject: [PATCH 10/22] fix small error --- kittihbins.py | 85 +++++++++------------------------------------------ 1 file changed, 14 insertions(+), 71 deletions(-) diff --git a/kittihbins.py b/kittihbins.py index 7d5dbdf45..54d80ac00 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -1,109 +1,52 @@ # ----------------------------------------------------------------------------------- -# To generate bev +# To generate bev heading bins # ----------------------------------------------------------------------------------- import os import os.path as osp import json # import torch from tqdm import tqdm -import numpy as np -import cv2 -def ry2alpha(ry, u, cu, fu): - alpha = ry - np.arctan2(u - cu, fu) - - if alpha > np.pi: - alpha -= 2 * np.pi - if alpha < -np.pi: - alpha += 2 * np.pi +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +bev_data = {} - return alpha +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} def angle2class(angle): ''' Convert continuous angle to discrete class and residual. ''' angle = angle % (2 * np.pi) assert (angle >= 0 and angle <= 2 * np.pi) - angle_per_class = 2 * np.pi / float(12) + angle_per_class = 2 * np.pi / float(num_heading_bin) shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) class_id = int(shifted_angle / angle_per_class) residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) return class_id, residual_angle -split = 'val' -data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) -data = json.load(open(data_path)) -heading_bins_data = {} -heading_ress_data = {} - -KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} - for i in tqdm(range(len(data['images']))): # Label path img_path = data['images'][i]['file_name'].split('/') img_path[-2] = 'label_2' img_path[-1] = img_path[-1].split('.')[0] + '.txt' label_path = '/' + osp.join(*img_path) - - # Calibration path - img_path = data['images'][i]['file_name'].split('/') - img_path[-2] = 'calib' - img_path[-1] = img_path[-1].split('.')[0] + '.txt' - calib_path = '/' + osp.join(*img_path) # Read annotations and assembler point depth value lines = open(label_path).readlines() - lines_calib = open(calib_path).readlines() - - obj = lines_calib[2].strip().split(' ')[1:] - P2 = np.array(obj, dtype=np.float32) - # obj = lines_calib[3].strip().split(' ')[1:] - # P3 = np.array(obj, dtype=np.float32) - # obj = lines_calib[4].strip().split(' ')[1:] - # R0 = np.array(obj, dtype=np.float32) - # obj = lines_calib[5].strip().split(' ')[1:] - # Tr_velo_to_cam = np.array(obj, dtype=np.float32) - - P2 = P2.reshape(3, 4) - # P3.reshape(3, 4) - # R0.reshape(3, 3) - # Tr_velo_to_cam.reshape(3, 4) - - # cv = P2[1, 2] - # fv = P2[1, 1] - # tx = P2[0, 3] / (-fu) - # ty = P2[1, 3] / (-fv) - cu = P2[0, 2] - fu = P2[0, 0] - - heading_bins = [] - heading_ress = [] + bev = [] for line in lines: label_data = line.split(' ') if(KITTI_CLASS.get(label_data[0],-1) == -1): continue - bbx0 = float(label_data[4]) - bbx2 = float(label_data[6]) - ry = float(label_data[14]) - - heading_angle = ry2alpha(ry, (bbx0 + bbx2) / 2, cu, fu) - if heading_angle > np.pi: heading_angle -= 2 * np.pi # check range - if heading_angle < -np.pi: heading_angle += 2 * np.pi - heading_bin, heading_res = angle2class(heading_angle) - # x_c = float(label_data[11]) - # z_c = float(label_data[13]) + x_c = float(label_data[11]) + z_c = float(label_data[13]) - heading_bins.append(heading_bin) - heading_ress.append(heading_res) + bev.append([x_c, 
z_c]) # Save in dict - heading_bins_data[i+1] = heading_bins #torch.tensor(depth) - heading_ress_data[i+1] = heading_ress + bev_data[i+1] = bev #torch.tensor(depth) # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json' %(split) -with open(output_path, "w") as outfile: - json.dump(heading_bins_data, outfile) - -output_path = '/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/bev_hbins_%s.json' %(split) with open(output_path, "w") as outfile: - json.dump(heading_ress_data, outfile) + json.dump(bev_data, outfile) From 08a26e7e366b988dd9e9e9cbe7b00f9be62572aa Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 14 Jul 2022 21:20:54 -0700 Subject: [PATCH 11/22] kittibins --- kittihbins.py | 85 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/kittihbins.py b/kittihbins.py index 54d80ac00..a1bdf3a7f 100644 --- a/kittihbins.py +++ b/kittihbins.py @@ -1,52 +1,109 @@ # ----------------------------------------------------------------------------------- -# To generate bev heading bins +# To generate bev bins # ----------------------------------------------------------------------------------- import os import os.path as osp import json # import torch from tqdm import tqdm +import numpy as np +import cv2 -split = 'val' -data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) -data = json.load(open(data_path)) -bev_data = {} +def ry2alpha(ry, u, cu, fu): + alpha = ry - np.arctan2(u - cu, fu) -KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + if alpha > np.pi: + alpha -= 2 * np.pi + if alpha < -np.pi: + alpha += 2 * np.pi + + return alpha def angle2class(angle): ''' Convert continuous angle to discrete class and residual. 
''' angle = angle % (2 * np.pi) assert (angle >= 0 and angle <= 2 * np.pi) - angle_per_class = 2 * np.pi / float(num_heading_bin) + angle_per_class = 2 * np.pi / float(12) shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) class_id = int(shifted_angle / angle_per_class) residual_angle = shifted_angle - (class_id * angle_per_class + angle_per_class / 2) return class_id, residual_angle +split = 'val' +data_path = '/srip-vol/datasets/KITTI3D/coco/kitti_%s.json' %(split) +data = json.load(open(data_path)) +heading_bins_data = {} +heading_ress_data = {} + +KITTI_CLASS = {'Car': 0, 'Pedestrian': 1, 'Cyclist' : 2} + for i in tqdm(range(len(data['images']))): # Label path img_path = data['images'][i]['file_name'].split('/') img_path[-2] = 'label_2' img_path[-1] = img_path[-1].split('.')[0] + '.txt' label_path = '/' + osp.join(*img_path) + + # Calibration path + img_path = data['images'][i]['file_name'].split('/') + img_path[-2] = 'calib' + img_path[-1] = img_path[-1].split('.')[0] + '.txt' + calib_path = '/' + osp.join(*img_path) # Read annotations and assembler point depth value lines = open(label_path).readlines() - bev = [] + lines_calib = open(calib_path).readlines() + + obj = lines_calib[2].strip().split(' ')[1:] + P2 = np.array(obj, dtype=np.float32) + # obj = lines_calib[3].strip().split(' ')[1:] + # P3 = np.array(obj, dtype=np.float32) + # obj = lines_calib[4].strip().split(' ')[1:] + # R0 = np.array(obj, dtype=np.float32) + # obj = lines_calib[5].strip().split(' ')[1:] + # Tr_velo_to_cam = np.array(obj, dtype=np.float32) + + P2 = P2.reshape(3, 4) + # P3.reshape(3, 4) + # R0.reshape(3, 3) + # Tr_velo_to_cam.reshape(3, 4) + + # cv = P2[1, 2] + # fv = P2[1, 1] + # tx = P2[0, 3] / (-fu) + # ty = P2[1, 3] / (-fv) + cu = P2[0, 2] + fu = P2[0, 0] + + heading_bins = [] + heading_ress = [] for line in lines: label_data = line.split(' ') if(KITTI_CLASS.get(label_data[0],-1) == -1): continue - x_c = float(label_data[11]) - z_c = float(label_data[13]) + bbx0 = float(label_data[4]) + bbx2 = float(label_data[6]) + ry = float(label_data[14]) + + heading_angle = ry2alpha(ry, (bbx0 + bbx2) / 2, cu, fu) + if heading_angle > np.pi: heading_angle -= 2 * np.pi # check range + if heading_angle < -np.pi: heading_angle += 2 * np.pi + heading_bin, heading_res = angle2class(heading_angle) + # x_c = float(label_data[11]) + # z_c = float(label_data[13]) - bev.append([x_c, z_c]) + heading_bins.append(heading_bin) + heading_ress.append(heading_res) # Save in dict - bev_data[i+1] = bev #torch.tensor(depth) + heading_bins_data[i+1] = heading_bins #torch.tensor(depth) + heading_ress_data[i+1] = heading_ress # Save bev data as json file -output_path = '/srip-vol/datasets/KITTI3D/coco/bev_hbins_%s.json' %(split) +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json' %(split) +with open(output_path, "w") as outfile: + json.dump(heading_bins_data, outfile) + +output_path = '/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json' %(split) with open(output_path, "w") as outfile: - json.dump(bev_data, outfile) + json.dump(heading_ress_data, outfile) From 952eae385119716dc7bf14b5ef873c4f56dfb60c Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:03:32 -0700 Subject: [PATCH 12/22] Update coco.py for the ground truth query part --- datasets/coco.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index d817cde1d..50e7df8d0 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -23,6 
+23,7 @@ def __init__(self, img_folder, ann_file, transforms, return_masks, bev_data = No self.prepare = ConvertCocoPolysToMask(return_masks) if bev_data is not None: self.bev_data = json.load(open(bev_data)) + self.bev_coor = json.load(open(bev_data)) if dim_data is not None: self.dim_data = json.load(open(dim_data)) if heading_bin_data is not None: @@ -48,7 +49,10 @@ def __getitem__(self, idx): assert target['heading_bin'].size()[0] == target['boxes'].size()[0] target['heading_res'] = torch.tensor(self.heading_res_data[str(image_id)]) assert target['heading_res'].size()[0] == target['boxes'].size()[0] - return img, target + b_coordinate = torch.tensor([self.bev_coor[str(image_id)]]) + # b_coordinate = self.bev_coor[str(image_id)] + return img, target, b_coordinate + def convert_coco_poly_to_mask(segmentations, height, width): @@ -195,4 +199,4 @@ def build_kitti_coco(image_set, args): HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) - return dataset \ No newline at end of file + return dataset From 34b51ea34429f04a1fc94a27bfdb99c2b48f757a Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:05:48 -0700 Subject: [PATCH 13/22] Update engine.py for ground truth query part --- engine.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/engine.py b/engine.py index ac5ea6ff4..0180bad8f 100644 --- a/engine.py +++ b/engine.py @@ -25,11 +25,25 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, header = 'Epoch: [{}]'.format(epoch) print_freq = 10 - for samples, targets in metric_logger.log_every(data_loader, print_freq, header): + for samples, targets, b_coordinates in metric_logger.log_every(data_loader, print_freq, header): + # print(samples) samples = samples.to(device) + + # size + # print(b_coordinates[0].size()) + # print(b_coordinates[3]) + b_m1 = torch.mean(b_coordinates[0], 1, True) + # print(b_m1.size()) + # print(b_coordinates[3]) + b_m2 = torch.mean(b_coordinates[1], 1, True) + temp = torch.cat((b_m1, b_m2), 0) + temp = temp.squeeze(1) + temp = temp.to(device) + + # print(temp.size()) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - outputs = model(samples) + outputs = model(samples, temp) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) @@ -85,11 +99,26 @@ def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, out output_dir=os.path.join(output_dir, "panoptic_eval"), ) - for samples, targets in metric_logger.log_every(data_loader, 10, header): + for samples, targets, b_coordinates in metric_logger.log_every(data_loader, 10, header): samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - outputs = model(samples) + # print(b_coordinates) + if len(b_coordinates) == 0: + print(samples) + temp = torch.zeros([1, 2], dtype=torch.float64) + elif len(b_coordinates) == 1: + b_m1 = torch.mean(b_coordinates[0], 1, True) + temp = b_m1.squeeze(1) + else: + b_m1 = torch.mean(b_coordinates[0], 1, True) + b_m2 = torch.mean(b_coordinates[1], 1, True) + temp = torch.cat((b_m1, b_m2), 
0) + temp = temp.squeeze(1) + # temp = b_m1.squeeze(1) + temp = temp.to(device) + + outputs = model(samples, temp) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict From f7dfbb480734df903abab65b35273d7404b6ff76 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:07:13 -0700 Subject: [PATCH 14/22] Update detr.py for ground truth query part --- models/detr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/detr.py b/models/detr.py index 2250b9356..89009c9da 100644 --- a/models/detr.py +++ b/models/detr.py @@ -44,7 +44,7 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2) self.dim_embed = MLP(hidden_dim, hidden_dim, 2, 2) - def forward(self, samples: NestedTensor): + def forward(self, samples: NestedTensor, b_coordinate): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels @@ -65,7 +65,7 @@ def forward(self, samples: NestedTensor): src, mask = features[-1].decompose() assert mask is not None - hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] + hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1], b_coordinate)[0] outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() From 4cdaf63a2818c3e96a43603a1d8da5698ebefb17 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Mon, 25 Jul 2022 15:08:31 -0700 Subject: [PATCH 15/22] Update transformer.py for query part --- models/transformer.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/models/transformer.py b/models/transformer.py index dcd536750..02a1940ea 100644 --- a/models/transformer.py +++ b/models/transformer.py @@ -38,20 +38,34 @@ def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, self.d_model = d_model self.nhead = nhead + + self.linear_b = nn.Linear(2, 100) + self.conv_b = nn.Conv1d(in_channels=1, out_channels=256,kernel_size=1) def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward(self, src, mask, query_embed, pos_embed): + def forward(self, src, mask, query_embed, pos_embed, b_coordinate): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) mask = mask.flatten(1) - + + # From N*2 to N*num_queries + b_coordinate = self.linear_b(b_coordinate) + # From N*num_queries to N*1*num_queries + b_coordinate = b_coordinate.unsqueeze(1) + # From N*1*num_quries to N*hidden_dim*num_queries + b_coordinate = self.conv_b(b_coordinate) + # From N*hidden_dim*num_queries to num_quries*N*hidden_dim + b_coordinate = b_coordinate.permute(2, 0, 1) + + query_embed = query_embed + b_coordinate + tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, From 3a7615e9ae3f5a89426cb0c089e448d605981055 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 28 Jul 2022 15:40:48 -0700 Subject: [PATCH 16/22] Let the output of the image view transformer be the query of the BEV 
transformer --- datasets/coco.py | 5 +- engine.py | 36 +---- models/detr.py | 12 +- models/transformer.py | 13 +- models/transformer_BEV.py | 313 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 335 insertions(+), 44 deletions(-) create mode 100644 models/transformer_BEV.py diff --git a/datasets/coco.py b/datasets/coco.py index 50e7df8d0..b1ce6201a 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -49,9 +49,10 @@ def __getitem__(self, idx): assert target['heading_bin'].size()[0] == target['boxes'].size()[0] target['heading_res'] = torch.tensor(self.heading_res_data[str(image_id)]) assert target['heading_res'].size()[0] == target['boxes'].size()[0] - b_coordinate = torch.tensor([self.bev_coor[str(image_id)]]) + # b_coordinate = torch.tensor([self.bev_coor[str(image_id)]]) # b_coordinate = self.bev_coor[str(image_id)] - return img, target, b_coordinate + return img, target + diff --git a/engine.py b/engine.py index 0180bad8f..1a8cc03f7 100644 --- a/engine.py +++ b/engine.py @@ -25,25 +25,12 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, header = 'Epoch: [{}]'.format(epoch) print_freq = 10 - for samples, targets, b_coordinates in metric_logger.log_every(data_loader, print_freq, header): - # print(samples) + for samples, targets in metric_logger.log_every(data_loader, print_freq, header): samples = samples.to(device) - # size - # print(b_coordinates[0].size()) - # print(b_coordinates[3]) - b_m1 = torch.mean(b_coordinates[0], 1, True) - # print(b_m1.size()) - # print(b_coordinates[3]) - b_m2 = torch.mean(b_coordinates[1], 1, True) - temp = torch.cat((b_m1, b_m2), 0) - temp = temp.squeeze(1) - temp = temp.to(device) - - # print(temp.size()) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - outputs = model(samples, temp) + outputs = model(samples) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) @@ -99,26 +86,11 @@ def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, out output_dir=os.path.join(output_dir, "panoptic_eval"), ) - for samples, targets, b_coordinates in metric_logger.log_every(data_loader, 10, header): + for samples, targets in metric_logger.log_every(data_loader, 10, header): samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] - # print(b_coordinates) - if len(b_coordinates) == 0: - print(samples) - temp = torch.zeros([1, 2], dtype=torch.float64) - elif len(b_coordinates) == 1: - b_m1 = torch.mean(b_coordinates[0], 1, True) - temp = b_m1.squeeze(1) - else: - b_m1 = torch.mean(b_coordinates[0], 1, True) - b_m2 = torch.mean(b_coordinates[1], 1, True) - temp = torch.cat((b_m1, b_m2), 0) - temp = temp.squeeze(1) - # temp = b_m1.squeeze(1) - temp = temp.to(device) - - outputs = model(samples, temp) + outputs = model(samples) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict diff --git a/models/detr.py b/models/detr.py index 89009c9da..b4d497215 100644 --- a/models/detr.py +++ b/models/detr.py @@ -16,11 +16,12 @@ from .segmentation import (DETRsegm, PostProcessPanoptic, PostProcessSegm, dice_loss, sigmoid_focal_loss) from .transformer import build_transformer +from .transformer_BEV import build_transformer_BEV class DETR(nn.Module): """ This is the DETR module that performs object detection """ - def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=False): + def __init__(self, backbone, 
transformer, transformer_BEV, num_classes, num_queries, aux_loss=False): """ Initializes the model. Parameters: backbone: torch module of the backbone to be used. See backbone.py @@ -33,6 +34,7 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal super().__init__() self.num_queries = num_queries self.transformer = transformer + self.transformer_BEV = transformer_BEV hidden_dim = transformer.d_model self.class_embed = nn.Linear(hidden_dim, num_classes + 1) self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) @@ -44,7 +46,7 @@ def __init__(self, backbone, transformer, num_classes, num_queries, aux_loss=Fal self.angle_embed = MLP(hidden_dim, hidden_dim, 24, 2) self.dim_embed = MLP(hidden_dim, hidden_dim, 2, 2) - def forward(self, samples: NestedTensor, b_coordinate): + def forward(self, samples: NestedTensor): """ The forward expects a NestedTensor, which consists of: - samples.tensor: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels @@ -65,7 +67,9 @@ def forward(self, samples: NestedTensor, b_coordinate): src, mask = features[-1].decompose() assert mask is not None - hs = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1], b_coordinate)[0] + query_B = self.transformer(self.input_proj(src), mask, self.query_embed.weight, pos[-1])[0] + print(query_B.size()) + hs = self.transformer_BEV(self.input_proj(src), mask, self.query_embed.weight, pos[-1], query_B)[0] outputs_class = self.class_embed(hs) outputs_coord = self.bbox_embed(hs).sigmoid() @@ -396,10 +400,12 @@ def build(args): backbone = build_backbone(args) transformer = build_transformer(args) + transformer_BEV = build_transformer_BEV(args) model = DETR( backbone, transformer, + transformer_BEV, num_classes=num_classes, num_queries=args.num_queries, aux_loss=args.aux_loss, diff --git a/models/transformer.py b/models/transformer.py index 02a1940ea..15fadde4b 100644 --- a/models/transformer.py +++ b/models/transformer.py @@ -47,7 +47,7 @@ def _reset_parameters(self): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward(self, src, mask, query_embed, pos_embed, b_coordinate): + def forward(self, src, mask, query_embed, pos_embed): # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape src = src.flatten(2).permute(2, 0, 1) @@ -56,15 +56,14 @@ def forward(self, src, mask, query_embed, pos_embed, b_coordinate): mask = mask.flatten(1) # From N*2 to N*num_queries - b_coordinate = self.linear_b(b_coordinate) + # b_coordinate = self.linear_b(b_coordinate) # From N*num_queries to N*1*num_queries - b_coordinate = b_coordinate.unsqueeze(1) + # b_coordinate = b_coordinate.unsqueeze(1) # From N*1*num_quries to N*hidden_dim*num_queries - b_coordinate = self.conv_b(b_coordinate) + # b_coordinate = self.conv_b(b_coordinate) # From N*hidden_dim*num_queries to num_quries*N*hidden_dim - b_coordinate = b_coordinate.permute(2, 0, 1) - - query_embed = query_embed + b_coordinate + # b_coordinate = b_coordinate.permute(2, 0, 1) + # query_embed = query_embed + b_coordinate tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) diff --git a/models/transformer_BEV.py b/models/transformer_BEV.py new file mode 100644 index 000000000..82a2a380f --- /dev/null +++ b/models/transformer_BEV.py @@ -0,0 +1,313 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. 
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import Optional, List + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + + +class Transformer(nn.Module): + + def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, + num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False, + return_intermediate_dec=False): + super().__init__() + + encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, + dropout, activation, normalize_before) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, + return_intermediate=return_intermediate_dec) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + self.linear_b = nn.Linear(2, 100) + self.conv_b = nn.Conv1d(in_channels=1, out_channels=256,kernel_size=1) + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed, query_B): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + mask = mask.flatten(1) + + # From N*2 to N*num_queries + # b_coordinate = self.linear_b(b_coordinate) + # From N*num_queries to N*1*num_queries + # b_coordinate = b_coordinate.unsqueeze(1) + # From N*1*num_quries to N*hidden_dim*num_queries + # b_coordinate = self.conv_b(b_coordinate) + # From N*hidden_dim*num_queries to num_quries*N*hidden_dim + # b_coordinate = b_coordinate.permute(2, 0, 1) + # query_embed = query_embed + b_coordinate + query_B = query_B.permute(1, 0, 2) + query_embed = query_embed + query_B + + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, + pos=pos_embed, query_pos=query_embed) + return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + output = src + + for layer in self.layers: + output = layer(output, src_mask=mask, + src_key_padding_mask=src_key_padding_mask, pos=pos) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = 
None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer(output, memory, tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, query_pos=query_pos) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, + key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + + def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + 
self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward(self, tgt, memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, tgt_mask, memory_mask, + tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def build_transformer_BEV(args): + return Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + return_intermediate_dec=True, + ) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") From 235510bf10e16d9e121ff1568aa979618e9b3d17 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 28 Jul 2022 16:04:24 -0700 Subject: [PATCH 17/22] 
fix small bugs --- datasets/coco.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/coco.py b/datasets/coco.py index b1ce6201a..fbcdba863 100644 --- a/datasets/coco.py +++ b/datasets/coco.py @@ -195,9 +195,9 @@ def build_kitti_coco(image_set, args): "val": (Path("/srip-vol/datasets/KITTI3D/training/image_2"), anno_root / f'kitti_{image_set}.json'), } BEV_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_%s.json"%(image_set) - DIM_DATA = "/srip-vol/datasets/KITTI3D/coco/dim_%s.json"%(image_set) + DIM_DATA = "/srip-vol/datasets/KITTI3D/coco/bev_dim_%s.json"%(image_set) HEADING_BIN_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_bins_%s.json"%(image_set) HEADING_RES_DATA = "/srip-vol/datasets/KITTI3D/coco/heading_ress_%s.json"%(image_set) img_folder, ann_file = PATHS[image_set] - dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) + dataset = CocoDetection(img_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks, bev_data = BEV_DATA, dim_data = DIM_DATA, heading_bin_data = HEADING_BIN_DATA, heading_res_data = HEADING_RES_DATA) return dataset From d718fc3ddb07217aec39eafdbb92a602e7ecb13b Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Thu, 28 Jul 2022 16:22:21 -0700 Subject: [PATCH 18/22] change the dimension of the query to fit the model --- models/transformer_BEV.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/models/transformer_BEV.py b/models/transformer_BEV.py index 82a2a380f..8742adc62 100644 --- a/models/transformer_BEV.py +++ b/models/transformer_BEV.py @@ -64,10 +64,17 @@ def forward(self, src, mask, query_embed, pos_embed, query_B): # From N*hidden_dim*num_queries to num_quries*N*hidden_dim # b_coordinate = b_coordinate.permute(2, 0, 1) # query_embed = query_embed + b_coordinate - query_B = query_B.permute(1, 0, 2) + + # 6 * N * num_queries * hidden_dim to num_queries * N * hidden_dim * 6 + query_B = query_B.permute(2, 1, 3, 0) + # num_queries * N * hidden_dim * 6 to num_queries * N * hidden_dim * 1 + query_B = self.linear_Q(query_B) + # num_queries * N * hidden_dim * 1 to num_queries * N * 1 * hidden_dim + query_B = query_B.permute(0, 1, 3, 2) + # num_queries * N * 1 * hidden_dim to num_queries * N * hidden_dim + query_B = query_B.squeeze(2) query_embed = query_embed + query_B - tgt = torch.zeros_like(query_embed) memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, From c8f5132737a728a6da8e873677d183ff3b1614e7 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 2 Aug 2022 15:21:25 -0700 Subject: [PATCH 19/22] loss function for the bev center --- models/detr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/models/detr.py b/models/detr.py index b4d497215..0df42a636 100644 --- a/models/detr.py +++ b/models/detr.py @@ -205,8 +205,9 @@ def loss_bev(self, outputs, targets, indices, num_boxes): idx = self._get_src_permutation_idx(indices) src_bev = outputs['pred_bev'][idx].squeeze() target_bev = torch.cat([t['bev'][i] for t, (_, i) in zip(targets, indices)]) - loss = F.mse_loss(src_bev, target_bev) - losses = {'loss_bev' : loss} + loss = F.l1_loss(src_bev, target_bev, reduction='none') + losses = {} + losses['loss_center'] = loss.sum() / num_boxes
return losses From 8394e82d6af73554d401cec7ab1c6c41972741d3 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 2 Aug 2022 18:32:10 -0700 Subject: [PATCH 20/22] code to get the output of 1 batch images --- engine.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/engine.py b/engine.py index 1a8cc03f7..7f24f4753 100644 --- a/engine.py +++ b/engine.py @@ -31,6 +31,19 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, targets = [{k: v.to(device) for k, v in t.items()} for t in targets] outputs = model(samples) + + # print(image_id) + # outputs = model(samples, temp) + # torch.save(outputs, 'outputs.pt') + # # print(outputs) + # outputs_json = {k: v.item() for k, v in outputs.items()} + # tf = open("data.json", "w") + # json.dump(outputs_json, tf) + # tf.close() + + # with open('data.txt','w') as f: + # f.write(outputs) + loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) From bb10227e870afaf86dcd215809b924450e41c473 Mon Sep 17 00:00:00 2001 From: ESONG1999 <100745711+ESONG1999@users.noreply.github.com> Date: Tue, 2 Aug 2022 22:25:29 -0700 Subject: [PATCH 21/22] fix small bugs --- models/detr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/detr.py b/models/detr.py index 0df42a636..66a316fe0 100644 --- a/models/detr.py +++ b/models/detr.py @@ -207,7 +207,7 @@ def loss_bev(self, outputs, targets, indices, num_boxes): target_bev = torch.cat([t['bev'][i] for t, (_, i) in zip(targets, indices)]) loss = F.l1_loss(src_bev, target_bev, reduction='none') losses = {} - losses['loss_center'] = loss.sum() / num_boxes + losses['loss_bev'] = loss.sum() / num_boxes return losses def loss_dims(self, outputs, targets, indices, num_boxes): From 03fd5f2d694a6c77916eb08b638afacddfbaa4ce Mon Sep 17 00:00:00 2001 From: ESONG1999 Date: Fri, 12 Aug 2022 03:52:55 +0000 Subject: [PATCH 22/22] whole model (available for pretrained weight) --- main.py | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/main.py b/main.py index 317ffb485..6af451592 100644 --- a/main.py +++ b/main.py @@ -37,6 +37,8 @@ def get_args_parser(): help="Path to the pretrained model. 
If set, only the mask head will be trained") # parser.add_argument('--bev_regression', action='store_false', # help="Add flag to regress bev directly") + # parser.add_argument('--depth_regression', action='store_true', + # help="Add flag to regress depth directly else use multi bin approach") # * Backbone parser.add_argument('--backbone', default='resnet50', type=str, help="Name of the convolutional backbone to use") @@ -65,6 +67,11 @@ def get_args_parser(): # * Segmentation parser.add_argument('--masks', action='store_true', help="Train segmentation head if the flag is provided") + # Depth + parser.add_argument('--num_depth_bins', type=int , default = 9, + help="Number of depth bins") + parser.add_argument('--depth_bin_res', type=int , default = 10, + help="Width of each depth bin") # Loss parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false', @@ -81,7 +88,10 @@ def get_args_parser(): parser.add_argument('--dice_loss_coef', default=1, type=float) parser.add_argument('--bbox_loss_coef', default=5, type=float) parser.add_argument('--giou_loss_coef', default=2, type=float) + parser.add_argument('--depth_loss_coef', default=1, type=float) parser.add_argument('--bev_loss_coef', default=2, type=float) + parser.add_argument('--head_loss_coef', default=2, type=float) + parser.add_argument('--feet_loss_coef', default=2, type=float) parser.add_argument('--dim_loss_coef', default=2, type=float) parser.add_argument('--angle_loss_coef', default=1, type=float) parser.add_argument('--eos_coef', default=0.1, type=float, @@ -102,7 +112,7 @@ def get_args_parser(): help='device to use for training / testing') parser.add_argument('--seed', default=42, type=int) # parser.add_argument('--resume', default='', help='resume from checkpoint') - parser.add_argument('--resume', default='pretrained/detr-r101-dc5-a2e86def.pth', help='resume from checkpoint') + parser.add_argument('--resume', default='pretrained/checkpoint0299.pth', help='resume from checkpoint') parser.add_argument('--start_epoch', default=0, type=int, metavar='N', help='start epoch') parser.add_argument('--eval', action='store_true') @@ -138,6 +148,23 @@ def main(args): model, criterion, postprocessors = build_model(args) model.to(device) + for name, param in model.transformer.named_parameters(): + param.requires_grad = False + + for name, param in model.bbox_embed.named_parameters(): + param.requires_grad = False + + for name, param in model.class_embed.named_parameters(): + param.requires_grad = False + + # for name, param in model.depth_delta.named_parameters(): + # param.requires_grad = False + + # for name, param in model.depth_bin.named_parameters(): + # param.requires_grad = False + + # seed = seed + model + model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) @@ -194,21 +221,21 @@ def main(args): print('loading pretrianed weights.....') checkpoint = torch.load(args.resume, map_location='cpu') # model_without_ddp.load_state_dict(checkpoint['model']) - del checkpoint["model"]["class_embed.weight"] - del checkpoint["model"]["class_embed.bias"] + # del checkpoint["model"]["class_embed.weight"] + # del checkpoint["model"]["class_embed.bias"] # Remove box weights - keys_to_delete = [] - for key in checkpoint["model"]: - if 'box_embed' in key: - print(key) - keys_to_delete.append(key) + # keys_to_delete = [] + # for key in checkpoint["model"]: + # if 'box_embed' in key: + # print(key) + # keys_to_delete.append(key) - for key in keys_to_delete: - del 
checkpoint["model"][key] + # for key in keys_to_delete: + # del checkpoint["model"][key] model_without_ddp.load_state_dict(checkpoint['model'], strict = False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: - optimizer.load_state_dict(checkpoint['optimizer']) + # optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1