diff --git a/LISA_annotation_to_VOC.py b/LISA_annotation_to_VOC.py index ff8374c..be8d323 100644 --- a/LISA_annotation_to_VOC.py +++ b/LISA_annotation_to_VOC.py @@ -80,6 +80,11 @@ def generate_xml(name,img_size): title_text = doc.createTextNode(str(img_size[2])) title.appendChild(title_text) size.appendChild(title) + + title = doc.createElement('segmented') + title_text = doc.createTextNode('0') + title.appendChild(title_text) + annotation.appendChild(title) # A loop for several objects to be detected #The bounding boxes are described using the top left point, a width, and a height [x y w h] in the 2D image plane.=>[xmin,ymin,xmax,ymax] @@ -87,7 +92,7 @@ def generate_xml(name,img_size): data=lines[i].strip().split(" ") name=data[0] x,y,w,h=int(data[1]),int(data[2]),int(data[3]),int(data[4]) - xmin,ymin,xmax,ymax=x,y-h,x+w,y + xmin,ymin,xmax,ymax=x,y,x+w,y+h object = doc.createElement('object') @@ -96,7 +101,24 @@ def generate_xml(name,img_size): title_text = doc.createTextNode(name) title.appendChild(title_text) object.appendChild(title) - + + + title = doc.createElement('pose') + title_text = doc.createTextNode('Unspecified') + title.appendChild(title_text) + object.appendChild(title) + + title = doc.createElement('truncated') + title_text = doc.createTextNode('0') + title.appendChild(title_text) + object.appendChild(title) + + title = doc.createElement('difficult') + title_text = doc.createTextNode('0') + title.appendChild(title_text) + object.appendChild(title) + + bndbox = doc.createElement('bndbox') object.appendChild(bndbox) title = doc.createElement('xmin') @@ -132,6 +154,7 @@ def generate_xml(name,img_size): generate_xml(name,img_size) + diff --git a/LISA_posGt_to_VOC_main.py b/LISA_posGt_to_VOC_main.py new file mode 100644 index 0000000..e368b7f --- /dev/null +++ b/LISA_posGt_to_VOC_main.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +""" +Created on Sat Jun 24 19:26:54 2017 + + +""" +# -------------------------------------------------------- +#Transform 
posGt to VOC2007/imagesets/main +#Used for create train.txt,val.txt,trainval.txt under main folder in VOC2007 +#Written by Shaoshen Wang +# -------------------------------------------------------- +#Usage: +#Put this script under train folder +#Create a new folder named "Main" in train folder +#Run this script + + +import os + +annotation_path = "./posGt/" +result_path = "./ImageSets/Main/" + +ratio_trainval = 0.5 #(trainval/total) +ratio_train = 0.5 #(train/trainval) + +def create_train_val_trainval(): + files = os.listdir(annotation_path) + total_cases = len(files) + t2 = int(ratio_trainval*total_cases) + t1 = int(ratio_train*t2) + train_cases=files[:t1] + val_cases=files[t1:t2] + test_cases=files[t2:] + + train_txt = "" + val_txt = "" + trainval_txt = "" + test_txt="" + + for file in train_cases: + train_txt += file[:-4] + "\n" #Delete ".txt" + for file in val_cases: + val_txt += file[:-4] + "\n" + trainval_txt = train_txt+val_txt + for file in test_cases: + test_txt += file[:-4] + "\n" + + f = open(result_path+"train.txt","w") + f.write(train_txt) + f.close() + f = open(result_path+"val.txt","w") + f.write(val_txt) + f.close() + f = open(result_path+"trainval.txt","w") + f.write(trainval_txt) + f.close() + f = open(result_path+"test.txt","w") + f.write(test_txt) + f.close() + +def create_train_for_classes(): #Not being used so far + files = os.listdir(annotation_path) + total_cases = len(files) + total_train = 3 + total_test = 0 + record = [[],[],[],[]] + names = ["leftHand_driver","rightHand_driver","leftHand_passenger","rightHand_passenger"] + + + train_cases = files[:total_train] + for case in train_cases: + file = open(annotation_path+case) + lines = file.readlines() + lines = lines[1:] #ignore first line + indicator = [-1,-1,-1,-1] + + for line in lines: + line = line.strip().split(" ") + name = line[0] + if name == "leftHand_driver": + indicator[0] = 1 + elif name == "rightHand_driver": + indicator[1] = 1 + elif name == "leftHand_passenger": + 
indicator[2] = 1 + elif name == "rightHand_passenger": + indicator[3] = 1 + else: + pass + for i in range(4): + record[i].append((case,indicator[i])) + + for i in range(4): + file_path=result_path+names[i]+"_train"+".txt" + content="" + for k in record[i]: + content+=k[0]+" "+str(k[1])+"\n" + f=open(file_path,"w") + f.write(content) + f.close() + #print(record) + +if __name__ == '__main__': + create_train_val_trainval() + + diff --git a/Modification Points b/Modification Points new file mode 100644 index 0000000..f7e86fd --- /dev/null +++ b/Modification Points @@ -0,0 +1,49 @@ + +Generate annotations +Generate 4 txt file train.txt val.txt trainval.txt test.txt under Main + +Error: overlaps = entry['max_overlaps']: +Delete data/cache folder,因为里面保存了上一次数据集的roidb。因为错误显示加载了以前的文件。 + +Config.py: +暂时去掉使用flip扩增数据集的方法 + +Pascal_voc.py: +1)修改大小写obj.find('name').text.lower() delete lower() + +2)Delete -1 in +x1 = float(bbox.find('xmin').text)-1… +y2 = float(bbox.find('ymax').text)-1 + +因为原坐标位置起始是(1,1),现在是(0,0) + +3)修改分类class 为4+1类 +4)修改jpg为png,因为新数据集图像格式改变了 + +Vgg16: +1)修改网络 +2)修改load pretrain model时需要加载的参数 + +Error: Train loss 出现NAN: +重新制作数据集,问题消失,怀疑之前数据集有损坏。 + +Error: rpn_cls_score与 label不匹配,reshape无法完成: +Label长度代表了anchor数量 +通过查找anchor产生过程发现产生anchor的数量是根据input(224*224) resize得到的,resize的ratio被写死了,需要修改。 +修改network.py self._feat_stride, self._feat_compress +从16改为4. +Change this ratio to 4 = input width/conv5 width = 224/56 = 4 in modified case + + +Testing: +Vgg16.py: +修改concate 维度为-1,即连接channel的维度 + +lib/datasets/voc_eval.py: + +注释掉部分evaluation的代码,把结果改成正确格式output到txt里面. 
+ +mAP低可能由于train不充分 +需要调整thresh +testing得到很多bbox的坐标heconfidence,取 +得到所有testing的结果之后,把预测的box 通过 pascal_voc 的_write_voc_results_file写入了result 文件 diff --git a/README.md b/README.md index 419d890..19c9280 100644 --- a/README.md +++ b/README.md @@ -1 +1,51 @@ -# HandDetection \ No newline at end of file +# HandDetection +This is a modified Faster R-CNN hand detection project, developed during my time as a research assistant at the Centre of Artificial Intelligence (CAI) at UTS.
+This project achieves Top 10 performance in the VIVA hand detection competition. + +![](pic/arch.png) + + + + +Setup via [https://github.com/endernewton/tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn) + +Modified the code via [Robust Hand Detection in Vehicles](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7899695) for hand detection. + +This project is a collaboration with my colleague Yunqiu Xu (https://github.com/YunqiuXu). + +# Preprocessing +~/tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata$ python LISA_posGt_to_VOC_Annotations.py
+~/tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata$ python LISA_posGt_to_VOC_Main.py
+ +# Train +~/tf-faster-rcnn-endernewton$ ./experiments/scripts/train_faster_rcnn.sh 0 pascal_voc vgg16
+ +# Test +Modifiy the iter times in test_faster_rcnn.sh
+~/tf-faster-rcnn-endernewton$ ./experiments/scripts/test_faster_rcnn.sh 0 pascal_voc vgg16
+ +# How to do prediction on your own dataset + +cd tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata/ImageSets/Main
+mv test.txt test_for_train.txt
+mv test5500.txt test.txt
+ +cd tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata
+mv JPEGImages JPEGImages_train
+mv JPEGImages_test JPEGImages
+ +Open tf-faster-rcnn-endernewton/experiments/scripts/test_faster_rcnn.sh
+Set line 21 to the number of iterations of the model you trained. For example, if you trained a model for 10000 iterations, set this line to "ITERS=10000".
+ +cd tf-faster-rcnn-endernewton
+./experiments/scripts/test_faster_rcnn.sh 0 pascal_voc vgg16
+ +# How to stop the training + +tmux attach
+ctrl+c + + + + + diff --git a/checkpoint_params.py b/checkpoint_params.py new file mode 100644 index 0000000..f94e076 --- /dev/null +++ b/checkpoint_params.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jun 25 17:00:50 2017 + +@author: Shaoshen Wang +""" +#Used for show the variables in a checkpoint file +#Usage: Put this code under tf-faster-rcnn-master + +import os +import tensorflow as tf +from tensorflow.python import pywrap_tensorflow +from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file + +def get_variables_in_checkpoint_file(file_name): + try: + reader = pywrap_tensorflow.NewCheckpointReader(file_name) + var_to_shape_map = reader.get_variable_to_shape_map() + return var_to_shape_map + except Exception as e: + print(str(e)) + + +model_dir=".\data\imagenet_weights" +checkpoint_path = os.path.join(model_dir, "vgg16.ckpt") + +#print(type(file_name)) + +var_to_shape_map=get_variables_in_checkpoint_file(checkpoint_path) + +for var in var_to_shape_map: + print(var,var_to_shape_map[var]) + + +# List ALL tensors example output: v0/Adam (DT_FLOAT) [3,3,1,80] +#print_tensors_in_checkpoint_file(file_name=checkpoint_path, tensor_name='',all_tensors='') diff --git a/network.py b/network.py new file mode 100644 index 0000000..8552146 --- /dev/null +++ b/network.py @@ -0,0 +1,403 @@ +# -------------------------------------------------------- +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Xinlei Chen +# Modified by Shaoshen Wang +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorflow.contrib.slim as slim +from tensorflow.contrib.slim import losses +from tensorflow.contrib.slim import arg_scope + +import numpy as np + +from layer_utils.snippets import generate_anchors_pre +from layer_utils.proposal_layer 
import proposal_layer +from layer_utils.proposal_top_layer import proposal_top_layer +from layer_utils.anchor_target_layer import anchor_target_layer +from layer_utils.proposal_target_layer import proposal_target_layer + +from model.config import cfg + +class Network(object): + def __init__(self, batch_size=1): + #Change this ratio to 4 = input width/conv5 width = 224/56 = 4 in modified case + #Old:self._feat_stride = [16, ] + self._feat_stride = [4, ] + #Change this ratio to 4 ,same as above + #Old:self._feat_compress = [1. / 16., ] + self._feat_compress = [1. / 4., ] + + self._batch_size = batch_size + self._predictions = {} + self._losses = {} + self._anchor_targets = {} + self._proposal_targets = {} + self._layers = {} + self._act_summaries = [] + self._score_summaries = {} + self._train_summaries = [] + self._event_summaries = {} + + def _add_image_summary(self, image, boxes): + # add back mean + image += cfg.PIXEL_MEANS + # bgr to rgb (opencv uses bgr) + channels = tf.unstack (image, axis=-1) + image = tf.stack ([channels[2], channels[1], channels[0]], axis=-1) + # dims for normalization + width = tf.to_float(tf.shape(image)[2]) + height = tf.to_float(tf.shape(image)[1]) + # from [x1, y1, x2, y2, cls] to normalized [y1, x1, y1, x1] + cols = tf.unstack(boxes, axis=1) + boxes = tf.stack([cols[1] / height, + cols[0] / width, + cols[3] / height, + cols[2] / width], axis=1) + # add batch dimension (assume batch_size==1) + assert image.get_shape()[0] == 1 + boxes = tf.expand_dims(boxes, dim=0) + image = tf.image.draw_bounding_boxes(image, boxes) + + return tf.summary.image('ground_truth', image) + + def _add_act_summary(self, tensor): + tf.summary.histogram('ACT/' + tensor.op.name + '/activations', tensor) + tf.summary.scalar('ACT/' + tensor.op.name + '/zero_fraction', + tf.nn.zero_fraction(tensor)) + + def _add_score_summary(self, key, tensor): + tf.summary.histogram('SCORE/' + tensor.op.name + '/' + key + '/scores', tensor) + + def _add_train_summary(self, var): 
+ tf.summary.histogram('TRAIN/' + var.op.name, var) + + def _reshape_layer(self, bottom, num_dim, name): + input_shape = tf.shape(bottom) + with tf.variable_scope(name) as scope: + # change the channel to the caffe format + to_caffe = tf.transpose(bottom, [0, 3, 1, 2]) + # then force it to have channel 2 + reshaped = tf.reshape(to_caffe, + tf.concat(axis=0, values=[[self._batch_size], [num_dim, -1], [input_shape[2]]])) + # then swap the channel back + to_tf = tf.transpose(reshaped, [0, 2, 3, 1]) + return to_tf + + def _softmax_layer(self, bottom, name): + if name == 'rpn_cls_prob_reshape': + input_shape = tf.shape(bottom) + bottom_reshaped = tf.reshape(bottom, [-1, input_shape[-1]]) + reshaped_score = tf.nn.softmax(bottom_reshaped, name=name) + return tf.reshape(reshaped_score, input_shape) + return tf.nn.softmax(bottom, name=name) + + def _proposal_top_layer(self, rpn_cls_prob, rpn_bbox_pred, name): + with tf.variable_scope(name) as scope: + rois, rpn_scores = tf.py_func(proposal_top_layer, + [rpn_cls_prob, rpn_bbox_pred, self._im_info, + self._feat_stride, self._anchors, self._num_anchors], + [tf.float32, tf.float32]) + rois.set_shape([cfg.TEST.RPN_TOP_N, 5]) + rpn_scores.set_shape([cfg.TEST.RPN_TOP_N, 1]) + + return rois, rpn_scores + + def _proposal_layer(self, rpn_cls_prob, rpn_bbox_pred, name): + with tf.variable_scope(name) as scope: + rois, rpn_scores = tf.py_func(proposal_layer, + [rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode, + self._feat_stride, self._anchors, self._num_anchors], + [tf.float32, tf.float32]) + rois.set_shape([None, 5]) + rpn_scores.set_shape([None, 1]) + + return rois, rpn_scores + + # Only use it if you have roi_pooling op written in tf.image + def _roi_pool_layer(self, bootom, rois, name): + with tf.variable_scope(name) as scope: + return tf.image.roi_pooling(bootom, rois, + pooled_height=cfg.POOLING_SIZE, + pooled_width=cfg.POOLING_SIZE, + spatial_scale=1. 
/ 16.)[0] + + def _crop_pool_layer(self, bottom, rois, name): + with tf.variable_scope(name) as scope: + batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1]) + # Get the normalized coordinates of bboxes + bottom_shape = tf.shape(bottom) + height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0]) + width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0]) + x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width + y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height + x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width + y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height + # Won't be backpropagated to rois anyway, but to save time + bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1)) + pre_pool_size = cfg.POOLING_SIZE * 2 + crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size], name="crops") + + return slim.max_pool2d(crops, [2, 2], padding='SAME') + + def _dropout_layer(self, bottom, name, ratio=0.5): + return tf.nn.dropout(bottom, ratio, name=name) + + def _anchor_target_layer(self, rpn_cls_score, name): + with tf.variable_scope(name) as scope: + rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func( + anchor_target_layer, + [rpn_cls_score, self._gt_boxes, self._im_info, self._feat_stride, self._anchors, self._num_anchors], + [tf.float32, tf.float32, tf.float32, tf.float32]) + + rpn_labels.set_shape([1, 1, None, None]) + rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4]) + rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors * 4]) + rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors * 4]) + + rpn_labels = tf.to_int32(rpn_labels, name="to_int32") + self._anchor_targets['rpn_labels'] = rpn_labels + self._anchor_targets['rpn_bbox_targets'] = rpn_bbox_targets + self._anchor_targets['rpn_bbox_inside_weights'] = rpn_bbox_inside_weights + 
self._anchor_targets['rpn_bbox_outside_weights'] = rpn_bbox_outside_weights + + self._score_summaries.update(self._anchor_targets) + + return rpn_labels + + def _proposal_target_layer(self, rois, roi_scores, name): + with tf.variable_scope(name) as scope: + rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func( + proposal_target_layer, + [rois, roi_scores, self._gt_boxes, self._num_classes], + [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32]) + + rois.set_shape([cfg.TRAIN.BATCH_SIZE, 5]) + roi_scores.set_shape([cfg.TRAIN.BATCH_SIZE]) + labels.set_shape([cfg.TRAIN.BATCH_SIZE, 1]) + bbox_targets.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4]) + bbox_inside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4]) + bbox_outside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4]) + + self._proposal_targets['rois'] = rois + self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32") + self._proposal_targets['bbox_targets'] = bbox_targets + self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights + self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights + + self._score_summaries.update(self._proposal_targets) + + return rois, roi_scores + + def _anchor_component(self): + with tf.variable_scope('ANCHOR_' + self._tag) as scope: + # just to get the shape right + + height = tf.to_int32(tf.ceil(self._im_info[0, 0] / np.float32(self._feat_stride[0]))) + width = tf.to_int32(tf.ceil(self._im_info[0, 1] / np.float32(self._feat_stride[0]))) + + anchors, anchor_length = tf.py_func(generate_anchors_pre, + [height, width, + self._feat_stride, self._anchor_scales, self._anchor_ratios], + [tf.float32, tf.int32], name="generate_anchors") + anchors.set_shape([None, 4]) + + anchor_length.set_shape([]) + self._anchors = anchors + self._anchor_length = anchor_length + + def build_network(self, sess, is_training=True): + raise NotImplementedError + + def 
_smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]): + sigma_2 = sigma ** 2 + box_diff = bbox_pred - bbox_targets + in_box_diff = bbox_inside_weights * box_diff + abs_in_box_diff = tf.abs(in_box_diff) + smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2))) + in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \ + + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign) + out_loss_box = bbox_outside_weights * in_loss_box + loss_box = tf.reduce_mean(tf.reduce_sum( + out_loss_box, + axis=dim + )) + return loss_box + + def _add_losses(self, sigma_rpn=3.0): + with tf.variable_scope('loss_' + self._tag) as scope: + # RPN, class loss + rpn_cls_score = tf.reshape(self._predictions['rpn_cls_score_reshape'], [-1, 2]) + rpn_label = tf.reshape(self._anchor_targets['rpn_labels'], [-1]) + rpn_select = tf.where(tf.not_equal(rpn_label, -1)) + rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_select), [-1, 2]) + rpn_label = tf.reshape(tf.gather(rpn_label, rpn_select), [-1]) + rpn_cross_entropy = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label)) + + # RPN, bbox loss + rpn_bbox_pred = self._predictions['rpn_bbox_pred'] + rpn_bbox_targets = self._anchor_targets['rpn_bbox_targets'] + rpn_bbox_inside_weights = self._anchor_targets['rpn_bbox_inside_weights'] + rpn_bbox_outside_weights = self._anchor_targets['rpn_bbox_outside_weights'] + + rpn_loss_box = self._smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights, + rpn_bbox_outside_weights, sigma=sigma_rpn, dim=[1, 2, 3]) + + # RCNN, class loss + cls_score = self._predictions["cls_score"] + label = tf.reshape(self._proposal_targets["labels"], [-1]) + + cross_entropy = tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.reshape(cls_score, [-1, self._num_classes]), labels=label)) + + # RCNN, bbox loss + bbox_pred = 
self._predictions['bbox_pred'] + bbox_targets = self._proposal_targets['bbox_targets'] + bbox_inside_weights = self._proposal_targets['bbox_inside_weights'] + bbox_outside_weights = self._proposal_targets['bbox_outside_weights'] + + loss_box = self._smooth_l1_loss(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights) + + self._losses['cross_entropy'] = cross_entropy + self._losses['loss_box'] = loss_box + self._losses['rpn_cross_entropy'] = rpn_cross_entropy + self._losses['rpn_loss_box'] = rpn_loss_box + + loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box + self._losses['total_loss'] = loss + + self._event_summaries.update(self._losses) + + return loss + + def create_architecture(self, sess, mode, num_classes, tag=None, + anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)): + self._image = tf.placeholder(tf.float32, shape=[self._batch_size, None, None, 3]) + self._im_info = tf.placeholder(tf.float32, shape=[self._batch_size, 3]) + self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5]) + self._tag = tag + + self._num_classes = num_classes + self._mode = mode + self._anchor_scales = anchor_scales + self._num_scales = len(anchor_scales) + + self._anchor_ratios = anchor_ratios + self._num_ratios = len(anchor_ratios) + + self._num_anchors = self._num_scales * self._num_ratios + + training = mode == 'TRAIN' + testing = mode == 'TEST' + + assert tag != None + + # handle most of the regularizers here + weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY) + if cfg.TRAIN.BIAS_DECAY: + biases_regularizer = weights_regularizer + else: + biases_regularizer = tf.no_regularizer + + # list as many types of layers as possible, even if they are not used now + with arg_scope([slim.conv2d, slim.conv2d_in_plane, \ + slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected], + weights_regularizer=weights_regularizer, + biases_regularizer=biases_regularizer, + biases_initializer=tf.constant_initializer(0.0)): + 
rois, cls_prob, bbox_pred = self.build_network(sess, training) + + layers_to_output = {'rois': rois} + layers_to_output.update(self._predictions) + + for var in tf.trainable_variables(): + self._train_summaries.append(var) + + if mode == 'TEST': + stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (self._num_classes)) + means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (self._num_classes)) + self._predictions["bbox_pred"] *= stds + self._predictions["bbox_pred"] += means + else: + self._add_losses() + layers_to_output.update(self._losses) + + val_summaries = [] + with tf.device("/cpu:0"): + val_summaries.append(self._add_image_summary(self._image, self._gt_boxes)) + for key, var in self._event_summaries.items(): + val_summaries.append(tf.summary.scalar(key, var)) + for key, var in self._score_summaries.items(): + self._add_score_summary(key, var) + for var in self._act_summaries: + self._add_act_summary(var) + for var in self._train_summaries: + self._add_train_summary(var) + + self._summary_op = tf.summary.merge_all() + if not testing: + self._summary_op_val = tf.summary.merge(val_summaries) + + return layers_to_output + + # Extract the head feature maps, for example for vgg16 it is conv5_3 + # only useful during testing mode + def extract_head(self, sess, image): + feed_dict = {self._image: image} + feat = sess.run(self._layers["head"], feed_dict=feed_dict) + return feat + + # only useful during testing mode + def test_image(self, sess, image, im_info): + feed_dict = {self._image: image, + self._im_info: im_info} + cls_score, cls_prob, bbox_pred, rois = sess.run([self._predictions["cls_score"], + self._predictions['cls_prob'], + self._predictions['bbox_pred'], + self._predictions['rois']], + feed_dict=feed_dict) + return cls_score, cls_prob, bbox_pred, rois + + def get_summary(self, sess, blobs): + feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'], + self._gt_boxes: blobs['gt_boxes']} + summary = sess.run(self._summary_op_val, 
feed_dict=feed_dict) + + return summary + + def train_step(self, sess, blobs, train_op): + feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'], + self._gt_boxes: blobs['gt_boxes']} + rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, _ = sess.run([self._losses["rpn_cross_entropy"], + self._losses['rpn_loss_box'], + self._losses['cross_entropy'], + self._losses['loss_box'], + self._losses['total_loss'], + train_op], + feed_dict=feed_dict) + return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss + + def train_step_with_summary(self, sess, blobs, train_op): + feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'], + self._gt_boxes: blobs['gt_boxes']} + rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary, _ = sess.run([self._losses["rpn_cross_entropy"], + self._losses['rpn_loss_box'], + self._losses['cross_entropy'], + self._losses['loss_box'], + self._losses['total_loss'], + self._summary_op, + train_op], + feed_dict=feed_dict) + return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary + + def train_step_no_return(self, sess, blobs, train_op): + feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'], + self._gt_boxes: blobs['gt_boxes']} + sess.run([train_op], feed_dict=feed_dict) + diff --git a/pascal_voc.py b/pascal_voc.py new file mode 100644 index 0000000..be6c81f --- /dev/null +++ b/pascal_voc.py @@ -0,0 +1,317 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick and Xinlei Chen +# Modified by Shaoshen Wang +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from datasets.imdb import imdb +import datasets.ds_utils as ds_utils +import xml.etree.ElementTree as ET +import numpy as np +import scipy.sparse 
+import scipy.io as sio +import utils.cython_bbox +import pickle +import subprocess +import uuid +from .voc_eval import voc_eval +from model.config import cfg + + +class pascal_voc(imdb): + def __init__(self, image_set, year, devkit_path=None): + imdb.__init__(self, 'voc_' + year + '_' + image_set) + self._year = year + self._image_set = image_set + self._devkit_path = self._get_default_path() if devkit_path is None \ + else devkit_path + #[Hand Detection] + self._hand_path = cfg.DATA_DIR #./data/ + self._data_path = os.path.join(self._hand_path, 'LISA_HD_Static','detectiondata')#Same as VOC2007/ + + #self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) + + #Substitute original _classes with 1+4 classes + #["leftHand_driver","rightHand_driver","leftHand_passenger","rightHand_passenger"]+background + + self._classes = ('__background__', # always index 0 + 'leftHand_driver', 'rightHand_driver', 'leftHand_passenger', 'rightHand_passenger' + ) + self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes))))) + self._image_ext = '.png' + self._image_index = self._load_image_set_index() + # Default to roidb handler + self._roidb_handler = self.gt_roidb + self._salt = str(uuid.uuid4()) + self._comp_id = 'comp4' + + # PASCAL specific config options + self.config = {'cleanup': True, + 'use_salt': True, + 'use_diff': False, + 'matlab_eval': False, + 'rpn_file': None} + + assert os.path.exists(self._devkit_path), \ + 'VOCdevkit path does not exist: {}'.format(self._devkit_path) + assert os.path.exists(self._data_path), \ + 'Path does not exist: {}'.format(self._data_path) + + def image_path_at(self, i): + """ + Return the absolute path to image i in the image sequence. + """ + return self.image_path_from_index(self._image_index[i]) + + def image_path_from_index(self, index): + """ + Construct an image path from the image's "index" identifier. 
+ """ + image_path = os.path.join(self._data_path, 'JPEGImages', + index + self._image_ext) + assert os.path.exists(image_path), \ + 'Path does not exist: {}'.format(image_path) + return image_path + + def _load_image_set_index(self): + """ + Load the indexes listed in this dataset's image set file. + """ + # Example path to image set file: + # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt + image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', + self._image_set + '.txt') + assert os.path.exists(image_set_file), \ + 'Path does not exist: {}'.format(image_set_file) + with open(image_set_file) as f: + image_index = [x.strip() for x in f.readlines()] + return image_index + + def _get_default_path(self): + """ + Return the default path where PASCAL VOC is expected to be installed. + """ + return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) + + def gt_roidb(self): + """ + Return the database of ground-truth regions of interest. + + This function loads/saves from/to a cache file to speed up future calls. 
+ """ + cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') + if os.path.exists(cache_file): + with open(cache_file, 'rb') as fid: + try: + roidb = pickle.load(fid) + except: + roidb = pickle.load(fid, encoding='bytes') + print('{} gt roidb loaded from {}'.format(self.name, cache_file)) + return roidb + + gt_roidb = [self._load_pascal_annotation(index) + for index in self.image_index] + with open(cache_file, 'wb') as fid: + pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) + print('wrote gt roidb to {}'.format(cache_file)) + + return gt_roidb + + def rpn_roidb(self): + if int(self._year) == 2007 or self._image_set != 'test': + gt_roidb = self.gt_roidb() + rpn_roidb = self._load_rpn_roidb(gt_roidb) + roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) + else: + roidb = self._load_rpn_roidb(None) + + return roidb + + def _load_rpn_roidb(self, gt_roidb): + filename = self.config['rpn_file'] + print('loading {}'.format(filename)) + assert os.path.exists(filename), \ + 'rpn data not found at: {}'.format(filename) + with open(filename, 'rb') as f: + box_list = pickle.load(f) + return self.create_roidb_from_box_list(box_list, gt_roidb) + + def _load_pascal_annotation(self, index): + """ + Load image and bounding boxes info from XML file in the PASCAL VOC + format. 
+ """ + filename = os.path.join(self._data_path, 'Annotations', index + '.xml') + tree = ET.parse(filename) + objs = tree.findall('object') + if not self.config['use_diff']: + # Exclude the samples labeled as difficult + non_diff_objs = [ + obj for obj in objs if int(obj.find('difficult').text) == 0] + # if len(non_diff_objs) != len(objs): + # print 'Removed {} difficult objects'.format( + # len(objs) - len(non_diff_objs)) + objs = non_diff_objs + num_objs = len(objs) + + boxes = np.zeros((num_objs, 4), dtype=np.uint16) + gt_classes = np.zeros((num_objs), dtype=np.int32) + overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) + # "Seg" area for pascal is just the box area + seg_areas = np.zeros((num_objs), dtype=np.float32) + + # Load object bounding boxes into a data frame. + for ix, obj in enumerate(objs): + bbox = obj.find('bndbox') + # Make pixel indexes 0-based + #Delete -1 + x1 = float(bbox.find('xmin').text) + y1 = float(bbox.find('ymin').text) + x2 = float(bbox.find('xmax').text) + y2 = float(bbox.find('ymax').text) + #[Hand Detection] + #cls = self._class_to_ind[obj.find('name').text.lower().strip()] + cls = self._class_to_ind[obj.find('name').text.strip()] #Delete lower(), cause the annotation class has upper case. 
+ + boxes[ix, :] = [x1, y1, x2, y2] + gt_classes[ix] = cls + overlaps[ix, cls] = 1.0 + seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) + + overlaps = scipy.sparse.csr_matrix(overlaps) + + return {'boxes': boxes, + 'gt_classes': gt_classes, + 'gt_overlaps': overlaps, + 'flipped': False, + 'seg_areas': seg_areas} + + def _get_comp_id(self): + comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] + else self._comp_id) + return comp_id + + def _get_voc_results_file_template(self): + # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt + filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' + path = os.path.join( + self._devkit_path, + 'results', + 'VOC' + self._year, + 'Main', + filename) + return path + + def _write_voc_results_file(self, all_boxes): + for cls_ind, cls in enumerate(self.classes): + if cls == '__background__': + continue + print('Writing {} VOC results file'.format(cls)) + filename = self._get_voc_results_file_template().format(cls) + with open(filename, 'wt') as f: + for im_ind, index in enumerate(self.image_index): + dets = all_boxes[cls_ind][im_ind] + if dets == []: + continue + # the VOCdevkit expects 1-based indices + for k in range(dets.shape[0]): + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. + format(index, dets[k, -1], + dets[k, 0] + 1, dets[k, 1] + 1, + dets[k, 2] + 1, dets[k, 3] + 1)) + + def _do_python_eval(self, output_dir='output'): + annopath = os.path.join( + self._devkit_path, + 'VOC' + self._year, + 'Annotations', + '{:s}.xml') + imagesetfile = os.path.join( + self._devkit_path, + 'VOC' + self._year, + 'ImageSets', + 'Main', + self._image_set + '.txt') + cachedir = os.path.join(self._devkit_path, 'annotations_cache') + aps = [] + # The PASCAL VOC metric changed in 2010 + use_07_metric = True if int(self._year) < 2010 else False + print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) + if not os.path.isdir(output_dir): + os.mkdir(output_dir) + for i, cls in enumerate(self._classes): + if cls == '__background__': + continue + filename = self._get_voc_results_file_template().format(cls) + rec, prec, ap = voc_eval( + filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, + use_07_metric=use_07_metric) + aps += [ap] + print(('AP for {} = {:.4f}'.format(cls, ap))) + with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: + pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) + print(('Mean AP = {:.4f}'.format(np.mean(aps)))) + print('~~~~~~~~') + print('Results:') + for ap in aps: + print(('{:.3f}'.format(ap))) + print(('{:.3f}'.format(np.mean(aps)))) + print('~~~~~~~~') + print('') + print('--------------------------------------------------------------') + print('Results computed with the **unofficial** Python eval code.') + print('Results should be very close to the official MATLAB eval code.') + print('Recompute with `./tools/reval.py --matlab ...` for your paper.') + print('-- Thanks, The Management') + print('--------------------------------------------------------------') + + def _do_matlab_eval(self, output_dir='output'): + print('-----------------------------------------------------') + print('Computing results with the official MATLAB eval code.') + print('-----------------------------------------------------') + path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', + 'VOCdevkit-matlab-wrapper') + cmd = 'cd {} && '.format(path) + cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) + cmd += '-r "dbstop if error; ' + cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ + .format(self._devkit_path, self._get_comp_id(), + self._image_set, output_dir) + print(('Running:\n{}'.format(cmd))) + status = subprocess.call(cmd, shell=True) + + def evaluate_detections(self, all_boxes, output_dir): + self._write_voc_results_file(all_boxes) + self._do_python_eval(output_dir) + if 
self.config['matlab_eval']: + self._do_matlab_eval(output_dir) + if self.config['cleanup']: + for cls in self._classes: + if cls == '__background__': + continue + filename = self._get_voc_results_file_template().format(cls) + os.remove(filename) + + def competition_mode(self, on): + if on: + self.config['use_salt'] = False + self.config['cleanup'] = False + else: + self.config['use_salt'] = True + self.config['cleanup'] = True + + +if __name__ == '__main__': + from datasets.pascal_voc import pascal_voc + + d = pascal_voc('trainval', '2007') + res = d.roidb + from IPython import embed; + + embed() diff --git a/pic/arch.png b/pic/arch.png new file mode 100644 index 0000000..68c096b Binary files /dev/null and b/pic/arch.png differ diff --git a/vgg16.py b/vgg16.py new file mode 100644 index 0000000..e3e7eb3 --- /dev/null +++ b/vgg16.py @@ -0,0 +1,367 @@ +# _______________________________________________________ +# Tensorflow Faster R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Xinlei Chen +# Modified by Yunqiu Xu +# -------------------------------------------------------- + + +# A revision of VGG16 (tensorflow backend) +# Input : 224 * 224 * 3 +# (after) conv1 : 224 * 224 * 64 +# maxpool : 112 * 112 * 64 +# conv2 : 112 * 112 * 128 +# maxpool : 56 * 56 * 128 +# conv3 : 56 * 56 * 256 +# maxpool : 28 * 28 * 256 +# conv4 : 28 * 28 * 512 +# maxpool : 14 * 14 * 512 +# conv5 : 14 * 14 * 512 +# maxpool : 7 * 7 * 512 +# fc6 : 4096 +# fc7 : 4096 +# --------------------------------------------------------- + + +# Why Faster RCNN is faster : RPN +# RCNN: +# get proposal --> get features(CNN) --> SVM --> bbox regression +# Fast RCNN: +# send proposal and features to ROI pooling --> combine bbox and SVM together +# Faster RCNN: +# get features first --> get proposals from RPN --> send proposal and features to ROI pooling +# --------------------------------------------------------- + + +# ------------- To do 1: Multiple Scale Faster-RCNN ------ +# 
Combine both global and local features --> improve hand detection in an image +# Collect features not only from conv5 but also from conv3 and conv4, then merge them +# Implementation: +# 1. For conv3, conv4 and conv5, each conv is followed only by ReLU; the max-pooling layer is removed. +# 2. Take their outputs as the inputs of 3 corresponding ROI pooling layers and normalization layers +# 3. Concatenate the normalized layers and shrink them to form the input of the fc layers +# 4. ROI pooling feeds the fc layers, which predict class and position +# -------------------------------------------------------- +# ------------- To do 2: Weight Normalization ------------ +# Features in shallower layers: larger-scaled values +# Features in deeper layers: smaller-scaled values +# To combine the features of the 3 conv layers, we need to normalize them +# Implementation: +# 1. Put each feature map into a normalization layer (see the equations) +# 2. Each pixel xi is normalized, then multiplied by a scaling factor ri +# 3. Use backpropagation to learn ri in the training step; we need to build a loop here +# 4. After normalization, the features are concatenated +# -------------------------------------------------------- +# ------------- To do 3: Add New Layers ------------------ +# 1. Each RPN needs a normalization layer +# 2. Add two more ROI pooling layers in the detector part +# 3. Each ROI pooling layer needs a normalization layer +# 4. 
After each concatenation(2 positions in total), we need a 1*1 conv layer +# -------------------------------------------------------- + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorflow.contrib.slim as slim +from tensorflow.contrib.slim import losses +from tensorflow.contrib.slim import arg_scope +import numpy as np + +from nets.network import Network +from model.config import cfg + +class vgg16(Network): + def __init__(self, batch_size=1): + Network.__init__(self, batch_size=batch_size) + self._arch = 'vgg16' + + # [Hand Detection] Batch normalization + # http://stackoverflow.com/a/34634291/2267819 + # Note that this is different from the paper(they use another method) + def batch_norm_layer(self, to_be_normalized, is_training): + if is_training: + train_phase = tf.constant(1) + else: + train_phase = tf.constant(-1) + beta = tf.Variable(tf.constant(0.0, shape=[to_be_normalized.shape[-1]]), name='beta', trainable=True) + gamma = tf.Variable(tf.constant(1.0, shape=[to_be_normalized.shape[-1]]), name='gamma', trainable=True) + axises = np.arange(len(to_be_normalized.shape) - 1) + batch_mean, batch_var = tf.nn.moments(to_be_normalized, axises, name='moments') + ema = tf.train.ExponentialMovingAverage(decay=0.5) + + def mean_var_with_update(): + ema_apply_op = ema.apply([batch_mean, batch_var]) + with tf.control_dependencies([ema_apply_op]): + return tf.identity(batch_mean), tf.identity(batch_var) + + mean, var = tf.cond(train_phase > 0, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var))) # if is training --> update + normed = tf.nn.batch_normalization(to_be_normalized, mean, var, beta, gamma, 1e-3) + return normed + + + def build_network(self, sess, is_training=True): + with tf.variable_scope('vgg_16', 'vgg_16'): + # select initializers + if cfg.TRAIN.TRUNCATED: + initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01) 
+ initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001) + else: + initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01) + initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001) + + # [VGG16] conv1 + # input shape : 224 * 224 * 3 + # output shape : 112 * 112 * 64 + net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], + trainable=False, scope='conv1') + net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1') + + # [VGG16] conv2 + # input shape : 112 * 112 * 64 + # output shape : 56 * 56 * 128 + net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], + trainable=False, scope='conv2') + net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2') + + + # [Hand Detection] REMOVE net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') + # [Hand Detection] conv3 + # input shape : 56 * 56 * 128 + # output shape : 56 * 56 * 256 + net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], + trainable=is_training, scope='conv3') + to_be_normalized_1 = net + # [Hand Detection] conv4 + # input shape : 56 * 56 * 256 + # output shape : 56 * 56 * 256 + net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], + trainable=is_training, scope='conv4') + to_be_normalized_2 = net + # [Hand Detection] conv5 + # input shape : 56 * 56 * 256 + # output shape : 56 * 56 * 256 + net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], + trainable=is_training, scope='conv5') + to_be_normalized_3 = net + +# ------------- Take a break ----------------------------- +# Now as we get to_be_normalized_1 / to_be_normalized_2 / to_be_normalized_3, each is 56 * 56 * 256 +# For RPN , we need to: +# 1. normalize each to_be_normalized layer +# 2. concat 3 normalized layers +# 3. change the dimension using 1 * 1 conv +# 3. Then the modified net can be used in RPN +# +# For ROI pooling, we need to: +# 1. put each conv output into its ROI pooling (so there should be 3 ROI pooling layers) +# 2. normalize each layer +# 3. concat them +# 4. 
change the dimension using 1 * 1 conv +# -------------------------------------------------------- + + # ------------- Normalization for RPN -------------------- + # old version + # normed_1_rpn = tf.nn.l2_normalize(to_be_normalized_1, dim = [0, 1]) + # normed_2_rpn = tf.nn.l2_normalize(to_be_normalized_2, dim = [0, 1]) + # normed_3_rpn = tf.nn.l2_normalize(to_be_normalized_3, dim = [0, 1]) + normed_1_rpn = self.batch_norm_layer(to_be_normalized_1, is_training) + normed_2_rpn = self.batch_norm_layer(to_be_normalized_2, is_training) + normed_3_rpn = self.batch_norm_layer(to_be_normalized_3, is_training) + + # ------------- Concatation for RPN (56 * 56 * 768) ------ + # old version + # concated_rpn = tf.concat([normed_1_rpn, normed_2_rpn, normed_3_rpn], 2) + #batch *length*width*channel + #concate in the channel + concated_rpn = tf.concat([normed_1_rpn, normed_2_rpn, normed_3_rpn], -1) + + # ------------- 1 * 1 conv ------------------------------- + scaled_rpn = slim.conv2d(concated_rpn, 512, [1, 1], trainable=is_training, weights_initializer=initializer, scope="scaled_rpn/1x1") + # Then we can get 56 * 56 * 512 + + + # [Faster RCNN] summary and anchor + self._act_summaries.append(scaled_rpn) + self._layers['head'] = scaled_rpn + self._anchor_component() + + # ------------- RPN Begin -------------------------------- + + rpn = slim.conv2d(scaled_rpn, 512, [3, 3], trainable=is_training, weights_initializer=initializer, scope="rpn_conv/3x3") + self.show_variables("rpn",rpn.get_shape()) + + print("rpn",rpn.get_shape()) + self._act_summaries.append(rpn) + rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training, + weights_initializer=initializer, + padding='VALID', activation_fn=None, scope='rpn_cls_score') + # [Hand Detection] change it so that the score has 2 as its channel size + rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape') + rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, 
"rpn_cls_prob_reshape") + rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob") + rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training, + weights_initializer=initializer, + padding='VALID', activation_fn=None, scope='rpn_bbox_pred') + print("rpn_cls_score",rpn_cls_score.get_shape()) + if is_training: + print("Compute rois,roi_scores") + print("training:rpn_cls_score",rpn_cls_score.get_shape()) + rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois") + + print("Compute rpn_labels") + self.show_variables("rpn_cls_score",rpn_cls_score.get_shape()) + + rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor") + # Try to have a determinestic order for the computing graph, for reproducibility + with tf.control_dependencies([rpn_labels]): + rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois") + else: + if cfg.TEST.MODE == 'nms': + rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois") + elif cfg.TEST.MODE == 'top': + rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois") + else: + raise NotImplementedError + # ------------- RPN End ---------------------------------- + + print("vgg16_rois",str(rois.shape)) + + # ------------- ROI Pooling Begin ------------------------ + if cfg.POOLING_MODE == 'crop': + # get roi layers + roi1 = self._crop_pool_layer(to_be_normalized_1, rois, "roi1") # 28 * 28 * 256 + #print("vgg16_roi1",str(roi1.shape)) + roi2 = self._crop_pool_layer(to_be_normalized_2, rois, "roi2") # 28 * 28 * 256 + roi3 = self._crop_pool_layer(to_be_normalized_3, rois, "roi3") # 28 * 28 * 256 + # normalization + normed_1_roi = self.batch_norm_layer(roi1, is_training) + normed_2_roi = self.batch_norm_layer(roi2, is_training) + normed_3_roi = self.batch_norm_layer(roi3, is_training) + # concat + concated_roi = tf.concat([normed_1_roi, normed_2_roi, normed_3_roi], -1) # 28 * 28 * 768 + + #concated_roi = 
tf.slice(concated_roi,[0,0,0,0],[channel1,-1,-1,-1])#train 256 testing 300 + #print("concated_roi",concated_roi.get_shape()) + + # scale + #with tf.variable_scope("rois") as scope: + # out = rois.shape[0] + + pool5 = slim.conv2d(concated_roi,512, [1, 1], trainable=is_training, weights_initializer=initializer, scope="pool5/1x1") # 28 * 28 * 512 + + #print("pool5",pool5.get_shape()) + #pool5 = tf.reshape(pool5,[-1,]) + #pool5 = tf.slice(pool5,[0,0,0,0],[self._anchor_length,-1,-1,-1]) + + else: + raise NotImplementedError + # old version + # if cfg.POOLING_MODE == 'crop': + # roi_pool_1 = self._crop_pool_layer(to_be_normalized_1, rois, "roi_pool_1") + # roi_pool_2 = self._crop_pool_layer(to_be_normalized_2, rois, "roi_pool_2") + # roi_pool_3 = self._crop_pool_layer(to_be_normalized_3, rois, "roi_pool_3") + + # roi_pool_1_normalized = tf.nn.l2_normalize(roi_pool_1, dim = [0, 1]) + # roi_pool_2_normalized = tf.nn.l2_normalize(roi_pool_2, dim = [0, 1]) + # roi_pool_3_normalized = tf.nn.l2_normalize(roi_pool_3, dim = [0, 1]) + # pool5 = tf.concat([roi_pool_1_normalized, roi_pool_1_normalized, roi_pool_1_normalized], 2) + # ------------- ROI Pooling End -------------------------- + + + # [VGG16] flatten + pool5_flat = slim.flatten(pool5, scope='flatten') + # [VGG16] dense 4096 + dropout + fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6') + if is_training: + fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6') + # [VGG16] dense 4096 + dropout + fc7 = slim.fully_connected(fc6, 4096, scope='fc7') + if is_training: + fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7') + + # [Faster RCNN] get cls_score(class) and bbox_predict(position) + cls_score = slim.fully_connected(fc7, self._num_classes, + weights_initializer=initializer, + trainable=is_training, + activation_fn=None, scope='cls_score') + cls_prob = self._softmax_layer(cls_score, "cls_prob") + bbox_pred = slim.fully_connected(fc7, self._num_classes * 4, + 
weights_initializer=initializer_bbox, + trainable=is_training, + activation_fn=None, scope='bbox_pred') + #print("cls_score",cls_score.get_shape()) + #if not is_training and len(rois.shape)==2: + # bbox_pred = tf.slice(bbox_pred,[0,0,0,0],[rois.shape[0],-1,-1,-1]) + + print("vgg16_bbox_pred",str(bbox_pred.shape)) + print("vgg16_rois",str(rois.shape)) + + self._predictions["rpn_cls_score"] = rpn_cls_score + self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape + self._predictions["rpn_cls_prob"] = rpn_cls_prob + self._predictions["rpn_bbox_pred"] = rpn_bbox_pred + self._predictions["cls_score"] = cls_score + self._predictions["cls_prob"] = cls_prob + self._predictions["bbox_pred"] = bbox_pred + self._predictions["rois"] = rois # not original rois + + self._score_summaries.update(self._predictions) + + return rois, cls_prob, bbox_pred + + + def get_variables_to_restore(self, variables, var_keep_dic): + variables_to_restore = [] + #[Hand Detection] + var_modified=['vgg_16/conv3/conv3_1/weights:0','vgg_16/conv3/conv3_2/weights:0','vgg_16/conv3/conv3_3/weights:0', + 'vgg_16/conv4/conv4_1/weights:0','vgg_16/conv4/conv4_2/weights:0','vgg_16/conv4/conv4_3/weights:0', + 'vgg_16/conv5/conv5_1/weights:0','vgg_16/conv5/conv5_2/weights:0','vgg_16/conv5/conv5_3/weights:0', + 'vgg_16/conv3/conv3_1/biases:0','vgg_16/conv3/conv3_2/biases:0','vgg_16/conv3/conv3_3/biases:0', + 'vgg_16/conv4/conv4_1/biases:0','vgg_16/conv4/conv4_2/biases:0','vgg_16/conv4/conv4_3/biases:0', + 'vgg_16/conv5/conv5_1/biases:0','vgg_16/conv5/conv5_2/biases:0','vgg_16/conv5/conv5_3/biases:0'] + #/[Hand Detection] + + for v in variables: + # exclude the conv weights that are fc weights in vgg16 + if v.name == 'vgg_16/fc6/weights:0' or v.name == 'vgg_16/fc7/weights:0': + self._variables_to_fix[v.name] = v + continue + # exclude the first conv layer to swap RGB to BGR + if v.name == 'vgg_16/conv1/conv1_1/weights:0': + self._variables_to_fix[v.name] = v + continue + + # [Hand Detection] + if v.name 
in var_modified: + continue + # /[Hand Detection] + + if v.name.split(':')[0] in var_keep_dic: + print('Varibles restored: %s' % v.name) + variables_to_restore.append(v) + + return variables_to_restore + + def fix_variables(self, sess, pretrained_model): + print('Fix VGG16 layers..') + with tf.variable_scope('Fix_VGG16') as scope: + with tf.device("/cpu:0"): + # fix the vgg16 issue from conv weights to fc weights + # fix RGB to BGR + #fc6_conv = tf.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False) + #fc7_conv = tf.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False) + conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False) + restorer_fc = tf.train.Saver({ "vgg_16/conv1/conv1_1/weights": conv1_rgb}) + restorer_fc.restore(sess, pretrained_model) + + #sess.run(tf.assign(self._variables_to_fix['vgg_16/fc6/weights:0'], tf.reshape(fc6_conv, + # self._variables_to_fix['vgg_16/fc6/weights:0'].get_shape()))) + #sess.run(tf.assign(self._variables_to_fix['vgg_16/fc7/weights:0'], tf.reshape(fc7_conv, + # self._variables_to_fix['vgg_16/fc7/weights:0'].get_shape()))) + sess.run(tf.assign(self._variables_to_fix['vgg_16/conv1/conv1_1/weights:0'],tf.reverse(conv1_rgb, [2]))) + + def show_variables(self,var_name,var): + print(var_name,var) diff --git a/vgg16_modified1.py b/vgg16_modified1.py deleted file mode 100644 index c9d5a06..0000000 --- a/vgg16_modified1.py +++ /dev/null @@ -1,215 +0,0 @@ -# -------------------------------------------------------- -# Tensorflow Faster R-CNN -# Licensed under The MIT License [see LICENSE for details] -# Written by Xinlei Chen -# -------------------------------------------------------- - -# -------------------------------------------------------- -# [Modified by Yunqiu Xu] -# Ref: -# https://leonardoaraujosantos.gitbooks.io/artificial-inteligence/content/object_localization_and_detection.html -# http://blog.csdn.net/shenxiaolu1984/article/details/51152614 -# 
http://closure11.com/rcnn-fast-rcnn-faster-rcnn%E7%9A%84%E4%B8%80%E4%BA%9B%E4%BA%8B/ -# http://blog.csdn.net/lanran2/article/details/60143861 - -# Why Faster RCNN is faster : RPN -# RCNN: -# get proposal --> get features(CNN) --> SVM --> bbox regression -# Fast RCNN: -# send proposal and features to ROI pooling --> combine bbox and SVM together -# Faster RCNN: -# get features first --> get proposals from RPN --> send proposal and features to ROI pooling -# --------------------------------------------------------- - -# ------------- To do 1: Multiple Scale Faster-RCNN ------ -# Combine both global and local features --> enhance hand detecting in an image -# Collect features not only conv5, but also conv3 and conv4, then incorporate them -# Implementation: -# 1. For conv3, conv4, conv5, each conv is only followed with ReLU, remove Max-pooling layer. -# 2. Take their output as the input of 3 corresponding ROI pooling layers and normalization layers -# 3. Concat and shrink normalization layers as input of fc layers -# 4. roi pooling in fc layers: make prediction of class and position -# -------------------------------------------------------- -# ------------- To do 2: Weight Normalization ------------ -# Features in shallower layers: larger-scaled values -# Features in deeper layers: smaller-scaled values -# To combine the features of 3 conv layers, we need to normalize them -# Implementation: -# 1. Put each feature into normalization layer(see the equations) -# 2. Each pixel xi is normalized, then multiply scaling factor ri -# 3. Use backpropagation to get ri in training step, we need to build loop here -# 4. After normalization, the features will be concated -# -------------------------------------------------------- -# ------------- To do 3 Add New Layer -------------------- -# 1. Each RPN needs a normalization layer -# 2. Add two more ROI pooling layers in detector part -# 3. Each ROI pooling layer needs a normalization layer -# 4. 
After each concatenation(2 positions in total), we need a 1*1 conv layer -# -------------------------------------------------------- - - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -import tensorflow.contrib.slim as slim -from tensorflow.contrib.slim import losses -from tensorflow.contrib.slim import arg_scope -import numpy as np - -from nets.network import Network -from model.config import cfg - -class vgg16(Network): - def __init__(self, batch_size=1): - Network.__init__(self, batch_size=batch_size) - self._arch = 'vgg16' - - def build_network(self, sess, is_training=True): - with tf.variable_scope('vgg_16', 'vgg_16'): - # select initializers - if cfg.TRAIN.TRUNCATED: - initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01) - initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001) - else: - initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01) - initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001) - - # [VGG16] conv1 - # input shape : 224 * 224 * 3 - # conv 64 * 3 * 3 - # conv 64 * 3 * 3 - # maxpool 2 * 2 - net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], - trainable=False, scope='conv1') - net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1') - - # [VGG16] conv2 - # conv 128 * 3 * 3 - # conv 128 * 3 * 3 - # maxpool 2 * 2 - net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], - trainable=False, scope='conv2') - net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2') - - - # [VGG16] conv3 - # conv 256 * 3 * 3 - # conv 256 * 3 * 3 - # conv 256 * 3 * 3 - # maxpool 2 * 2 - net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], - trainable=is_training, scope='conv3') - to_be_normalized_1 = net - # [Hand Detection] REMOVE net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') - - # [VGG16] conv4 - # conv 512 * 3 * 3 - # conv 512 * 3 * 3 - # conv 512 * 3 * 3 - # maxpool 2 * 2 
- net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], - trainable=is_training, scope='conv4') - to_be_normalized_2 = net - # [Hand Detection] REMOVE net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4') - - # [VGG16] conv5 - # conv 512 * 3 * 3 - # conv 512 * 3 * 3 - # conv 512 * 3 * 3 - # maxpool 2 * 2 - net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], - trainable=is_training, scope='conv5') - to_be_normalized_3 = net - - # [Hand detection] - # Use the result of conv3, conv4 and conv5 - # normalize and concat them, then use 1*1 conv, then use RPN - # use the result of 3 layers and RPN in training loop - - # // self._act_summaries.append(net) - # // self._layers['head'] = net - # // self._anchor_component() # Yunqiu Xu: generate anchors? - -# ------------- RPN Begin -------------------------------- -## %%%%%%%%%% RPN Begin %%%%%%%%%% ## - # [Faster RCNN] RPN: put features into RPN layer --> get proposals - # input features(or anchors?), output rois(proposals) - # [Hand Detection] Normalize , concat, then use 1*1 conv, finally the data will be treated as the input here - rpn = slim.conv2d(net, 512, [3, 3], trainable=is_training, weights_initializer=initializer, scope="rpn_conv/3x3") - self._act_summaries.append(rpn) - rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training, - weights_initializer=initializer, - padding='VALID', activation_fn=None, scope='rpn_cls_score') - # [Hand Detection] change it so that the score has 2 as its channel size - rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape') - rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape") - rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob") - rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training, - weights_initializer=initializer, - padding='VALID', activation_fn=None, scope='rpn_bbox_pred') - if is_training: - 
rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois") - rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor") - # Try to have a determinestic order for the computing graph, for reproducibility - with tf.control_dependencies([rpn_labels]): - rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois") - else: - if cfg.TEST.MODE == 'nms': - rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois") - elif cfg.TEST.MODE == 'top': - rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois") - else: - raise NotImplementedError -# ------------- RPN End ---------------------------------- - -# ------------- ROI Pooling Begin ------------------------ - # [Faster RCNN] build roi pooling layer(here is same with RCNN) - # [Hand Detection] add another 2 roi pooling layer - # Input: proposals(rois) from RPN and features from CNN - if cfg.POOLING_MODE == 'crop': - pool5 = self._crop_pool_layer(net, rois, "pool5") - else: - raise NotImplementedError -# ------------- ROI Pooling End -------------------------- - - # [Hand Detection] Then we use 3 normalize layers - # [Hand Detection] Then we concat them - # [Hand Detection] Then we use 1*1 conv to return the channel size - - - # [VGG16] flatten - pool5_flat = slim.flatten(pool5, scope='flatten') - # [VGG16] dense 4096 + dropout - fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6') - if is_training: - fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6') - # [VGG16] dense 4096 + dropout - fc7 = slim.fully_connected(fc6, 4096, scope='fc7') - if is_training: - fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7') - - # [Faster RCNN] get cls_score(class) and bbox_predict(position) - cls_score = slim.fully_connected(fc7, self._num_classes, - weights_initializer=initializer, - trainable=is_training, - activation_fn=None, scope='cls_score') - cls_prob = self._softmax_layer(cls_score, "cls_prob") - bbox_pred = 
slim.fully_connected(fc7, self._num_classes * 4, - weights_initializer=initializer_bbox, - trainable=is_training, - activation_fn=None, scope='bbox_pred') - - self._predictions["rpn_cls_score"] = rpn_cls_score - self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape - self._predictions["rpn_cls_prob"] = rpn_cls_prob - self._predictions["rpn_bbox_pred"] = rpn_bbox_pred - self._predictions["cls_score"] = cls_score - self._predictions["cls_prob"] = cls_prob - self._predictions["bbox_pred"] = bbox_pred - self._predictions["rois"] = rois - - self._score_summaries.update(self._predictions) - - return rois, cls_prob, bbox_pred diff --git a/voc_eval.py b/voc_eval.py new file mode 100644 index 0000000..bd51d65 --- /dev/null +++ b/voc_eval.py @@ -0,0 +1,282 @@ +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import xml.etree.ElementTree as ET +import os +import pickle +import numpy as np + + +#['2L_0000080_X_7_4_3 0.294 34.6 202.3 97.7 264.7\n',...] 
# => [imagename x y w h score left/right driver/passenger number_hands_on_wheel];


def transform(line, hand, person):
    """Convert one VOC detection line into the hand-detection result format.

    Args:
        line: Whitespace-separated detection record
            ``"<imagename> <score> <xmin> <ymin> <xmax> <ymax>"``.
        hand: Label written into the "left/right" column.
        person: Label written into the "driver/passenger" column.

    Returns:
        The string
        ``"[imagename x y w h score hand person number_hands_on_wheel];\\n"``.

    Raises:
        IndexError: If the line has fewer than six fields.
        ValueError: If a coordinate field is not numeric.
    """
    fields = line.strip().split()
    imagename = fields[0]
    score = fields[1]  # kept as text so its formatting is passed through as-is
    # Corners are truncated to whole pixels first; width and height are then
    # the difference of the truncated corners.
    x = int(float(fields[2]))
    y = int(float(fields[3]))
    w = int(float(fields[4])) - x
    h = int(float(fields[5])) - y
    number_hands_on_wheel = "-1"  # Not implemented
    entry = [imagename, str(x), str(y), str(w), str(h), score,
             hand, person, number_hands_on_wheel]
    return "[" + " ".join(entry) + "];\n"


def _child_text(node, tag, default):
    """Return the text of child ``tag`` of ``node``, or ``default`` if absent."""
    child = node.find(tag)
    return default if child is None else child.text


def parse_rec(filename):
    """Parse a PASCAL VOC xml file.

    Args:
        filename: Path of the annotation xml file.

    Returns:
        A list with one dict per ``<object>`` element, with the keys
        ``name``, ``pose``, ``truncated``, ``difficult`` and ``bbox``
        (``[xmin, ymin, xmax, ymax]`` as ints).

    Missing ``pose`` / ``truncated`` / ``difficult`` tags fall back to
    ``'Unspecified'`` / ``0`` / ``0`` instead of raising, so annotation files
    written without those tags can still be read.
    """
    tree = ET.parse(filename)
    objects = []
    for obj in tree.findall('object'):
        bbox = obj.find('bndbox')
        objects.append({
            'name': obj.find('name').text,
            'pose': _child_text(obj, 'pose', 'Unspecified'),
            'truncated': int(_child_text(obj, 'truncated', '0')),
            'difficult': int(_child_text(obj, 'difficult', '0')),
            # Go through float() so "34.0"-style coordinates do not raise.
            'bbox': [int(float(bbox.find(tag).text))
                     for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
        })
    return objects


def voc_ap(rec, prec, use_07_metric=False):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:False).

    ``rec`` and ``prec`` may be any equal-length 1-D sequences; they are
    converted to float arrays before use.
    """
    rec = np.asarray(rec, dtype=float)
    prec = np.asarray(prec, dtype=float)

    if use_07_metric:
        # 11 point metric: average the best precision reachable at each of
        # the recall levels 0.0, 0.1, ..., 1.0.
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            reached = rec >= t
            p = np.max(prec[reached]) if np.any(reached) else 0
            ap = ap + p / 11.
        return ap

    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], rec, [1.]))
    mpre = np.concatenate(([0.], prec, [0.]))

    # compute the precision envelope (make precision non-increasing in recall)
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    return np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])


def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             cachedir,
             ovthresh=0.5,
             use_07_metric=False,
             result_dir="./result/"):
    """rec, prec, ap = voc_eval(detpath,
                                annopath,
                                imagesetfile,
                                classname,
                                cachedir,
                                [ovthresh],
                                [use_07_metric],
                                [result_dir])

    Convert the detections of one class into the hand-detection result format.

    NOTE: this version does not score detections against ground truth. It
    only rewrites the detection file of ``classname`` with ``transform`` and
    returns placeholder metrics.

    detpath: Path to detections
        detpath.format(classname) should produce the detection results file.
    annopath: Accepted for interface compatibility; not used.
    imagesetfile: Accepted for interface compatibility; not read.
    classname: Category name. A name starting with "left" is reported as the
        left hand, anything else as the right hand; a name ending with
        "driver" is reported as the driver, anything else as the passenger.
    cachedir: Accepted for interface compatibility; not used.
    [ovthresh]: Accepted for interface compatibility; not used.
    [use_07_metric]: Accepted for interface compatibility; not used.
    [result_dir]: Directory that receives "<classname>.txt"; created if it
        does not exist (default "./result/").

    Returns (rec, prec, ap), currently always (0, 0, 0).
    """
    # read dets
    detfile = detpath.format(classname)
    print("detfile", detfile, classname)

    with open(detfile, 'r') as f:
        lines = f.readlines()

    # [Hand detection] [imagename x y w h score left/right driver/passenger number_hands_on_wheel];
    hand = "left" if classname.startswith("left") else "right"
    person = "driver" if classname.endswith("driver") else "passenger"

    # Make sure the output directory exists instead of failing on open().
    if result_dir and not os.path.isdir(result_dir):
        os.makedirs(result_dir)
    file_path = os.path.join(result_dir, classname + ".txt")

    # Blank lines carry no detection and would make transform() raise.
    content = "".join(transform(line, hand, person)
                      for line in lines if line.strip())
    with open(file_path, "w") as out:
        out.write(content)
    # [/hand detection]

    # Placeholder metrics: no precision/recall is computed here.
    rec = 0
    prec = 0
    ap = 0
    return rec, prec, ap