diff --git a/LISA_annotation_to_VOC.py b/LISA_annotation_to_VOC.py
index ff8374c..be8d323 100644
--- a/LISA_annotation_to_VOC.py
+++ b/LISA_annotation_to_VOC.py
@@ -80,6 +80,11 @@ def generate_xml(name,img_size):
         title_text = doc.createTextNode(str(img_size[2]))
         title.appendChild(title_text)
         size.appendChild(title)
+        
+        title = doc.createElement('segmented')
+        title_text = doc.createTextNode('0')
+        title.appendChild(title_text)
+        annotation.appendChild(title)
 
         # A loop for several objects to be detected
         #The bounding boxes are described using the top left point, a width, and a height [x y w h] in the 2D image plane.=>[xmin,ymin,xmax,ymax]
@@ -87,7 +92,7 @@ def generate_xml(name,img_size):
             data=lines[i].strip().split(" ")
             name=data[0]
             x,y,w,h=int(data[1]),int(data[2]),int(data[3]),int(data[4])
-            xmin,ymin,xmax,ymax=x,y-h,x+w,y
+            xmin,ymin,xmax,ymax=x,y,x+w,y+h
             
         
             object = doc.createElement('object')
@@ -96,7 +101,24 @@ def generate_xml(name,img_size):
             title_text = doc.createTextNode(name)
             title.appendChild(title_text)
             object.appendChild(title)
-
+            
+                        
+            title = doc.createElement('pose')
+            title_text = doc.createTextNode('Unspecified')
+            title.appendChild(title_text)
+            object.appendChild(title)
+            
+            title = doc.createElement('truncated')
+            title_text = doc.createTextNode('0')
+            title.appendChild(title_text)
+            object.appendChild(title)
+            
+            title = doc.createElement('difficult')
+            title_text = doc.createTextNode('0')
+            title.appendChild(title_text)
+            object.appendChild(title)
+            
+            
             bndbox = doc.createElement('bndbox')
             object.appendChild(bndbox)
             title = doc.createElement('xmin')
@@ -132,6 +154,7 @@ def generate_xml(name,img_size):
         generate_xml(name,img_size)
     
     
+    
 
 
 
diff --git a/LISA_posGt_to_VOC_main.py b/LISA_posGt_to_VOC_main.py
new file mode 100644
index 0000000..e368b7f
--- /dev/null
+++ b/LISA_posGt_to_VOC_main.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jun 24 19:26:54 2017
+
+
+"""
+# --------------------------------------------------------
+#Transform posGt to VOC2007/imagesets/main
+#Used for create train.txt,val.txt,trainval.txt under main folder in VOC2007
+#Written by Shaoshen Wang
+# --------------------------------------------------------
+#Usage:
+#Put this script under the train folder (next to the "posGt" folder)
+#Create the folder "ImageSets/Main" in the train folder
+#Run this script
+
+
+import os
+
+annotation_path = "./posGt/"  # folder whose entries are listed and split (relative to the working directory)
+result_path = "./ImageSets/Main/"  # folder the generated .txt files are written to
+
+ratio_trainval = 0.5  # share of all cases that goes to trainval (trainval/total)
+ratio_train = 0.5    # share of trainval that goes to train (train/trainval)
+
+def create_train_val_trainval():
+    """Write train.txt, val.txt, trainval.txt and test.txt under ``result_path``.
+
+    Reads the module-level settings ``annotation_path``, ``result_path``,
+    ``ratio_trainval`` and ``ratio_train``.
+
+    The entries of ``annotation_path`` are sorted first, because
+    ``os.listdir`` returns them in arbitrary order and the split would
+    otherwise not be reproducible.  The first ``ratio_trainval`` share of
+    the entries forms trainval (its first ``ratio_train`` share is train,
+    the remainder val); everything after trainval is test.  Each output
+    file holds one entry name per line with the extension removed.
+    """
+    files = sorted(os.listdir(annotation_path))
+    # splitext drops ".txt" without assuming a 4-character extension
+    stems = [os.path.splitext(name)[0] for name in files]
+
+    t2 = int(ratio_trainval * len(stems))  # number of trainval cases
+    t1 = int(ratio_train * t2)             # number of train cases
+    train_cases = stems[:t1]
+    val_cases = stems[t1:t2]
+    test_cases = stems[t2:]
+
+    outputs = [
+        ("train.txt", train_cases),
+        ("val.txt", val_cases),
+        ("trainval.txt", train_cases + val_cases),
+        ("test.txt", test_cases),
+    ]
+    for file_name, cases in outputs:
+        # every name is newline-terminated, as before
+        content = "".join(stem + "\n" for stem in cases)
+        # ``with`` closes the handle even when the write fails
+        with open(result_path + file_name, "w") as f:
+            f.write(content)
+
+def create_train_for_classes():               #Not being used so far
+    """Write one ``<class name>_train.txt`` file per hand class under ``result_path``.
+
+    Reads the module-level ``annotation_path`` and ``result_path``.
+
+    Only the first ``total_train`` entries returned by
+    ``os.listdir(annotation_path)`` are processed.  The first line of each
+    annotation file is skipped; on the remaining lines the first
+    space-separated token is taken as the class name.  For every class a
+    line ``"<file name> <flag>"`` is written, where the flag is 1 if the
+    class occurs in that file and -1 otherwise.
+    """
+    # NOTE(review): the file name is written with its extension, whereas
+    # train.txt & co. hold extension-less names -- confirm which is wanted.
+
+    names = ["leftHand_driver","rightHand_driver","leftHand_passenger","rightHand_passenger"]
+    total_train = 3
+    record = [[] for _ in names]     # record[i]: (file name, flag) pairs for names[i]
+
+    train_cases = os.listdir(annotation_path)[:total_train]
+    for case in train_cases:
+        # ``with`` closes the annotation file (the handle used to leak)
+        with open(annotation_path+case) as annotation:
+            lines = annotation.readlines()[1:]     #ignore first line
+
+        # class names that occur in this file
+        present = set()
+        for line in lines:
+            present.add(line.strip().split(" ")[0])
+
+        for i, class_name in enumerate(names):
+            indicator = 1 if class_name in present else -1
+            record[i].append((case, indicator))
+
+    # one output file per class, one line per processed annotation file
+    for i, class_name in enumerate(names):
+        file_path = result_path+class_name+"_train"+".txt"
+        content = "".join(k[0]+" "+str(k[1])+"\n" for k in record[i])
+        with open(file_path,"w") as f:
+            f.write(content)
+    #print(record)   
+        
+if __name__ == '__main__':  # script entry point
+    create_train_val_trainval()  # only the train/val/trainval/test split is generated
+    
+    
diff --git a/Modification Points b/Modification Points
new file mode 100644
index 0000000..f7e86fd
--- /dev/null
+++ b/Modification Points	
@@ -0,0 +1,49 @@
+
+Generate annotations
+Generate 4 txt files (train.txt, val.txt, trainval.txt, test.txt) under ImageSets/Main
+
+Error: overlaps = entry['max_overlaps']:
+Delete data/cache folder，因为里面保存了上一次数据集的roidb。因为错误显示加载了以前的文件。
+
+Config.py:
+暂时去掉使用flip扩增数据集的方法
+
+Pascal_voc.py:
+1）修改大小写obj.find('name').text.lower() delete lower()
+
+2）Delete -1 in 
+x1 = float(bbox.find('xmin').text)-1…
+y2 = float(bbox.find('ymax').text)-1
+
+因为原坐标位置起始是(1,1)，现在是(0,0)
+
+3）修改分类class 为4+1类
+4）修改jpg为png，因为新数据集图像格式改变了
+
+Vgg16:
+1）修改网络
+2）修改load pretrain model时需要加载的参数
+
+Error: Train loss 出现NAN：
+重新制作数据集，问题消失，怀疑之前数据集有损坏。
+
+Error:  rpn_cls_score与 label不匹配，reshape无法完成：
+Label长度代表了anchor数量
+通过查找anchor产生过程发现产生anchor的数量是根据input(224*224) resize得到的，resize的ratio被写死了，需要修改。
+修改network.py self._feat_stride, self._feat_compress 
+从16改为4.
+Change this ratio to 4 = input width/conv5 width = 224/56 = 4 in modified case
+
+
+Testing:
+Vgg16.py:
+修改concate 维度为-1，即连接channel的维度
+
+lib/datasets/voc_eval.py:
+
+注释掉部分evaluation的代码，把结果改成正确格式output到txt里面.
+
+mAP低可能由于train不充分
+需要调整thresh
+testing得到很多bbox的坐标和confidence，取
+得到所有testing的结果之后，把预测的box 通过 pascal_voc 的_write_voc_results_file写入了result 文件
diff --git a/README.md b/README.md
index 419d890..19c9280 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,51 @@
-# HandDetection
\ No newline at end of file
+# HandDetection
+This is a modified Faster R-CNN hand detection project, developed while I was a research assistant at the Centre of Artificial Intelligence (CAI) at UTS. </br>
+This project achieved a top-10 result in the VIVA hand detection competition.
+
+![](pic/arch.png)
+
+
+
+
+Setup via [https://github.com/endernewton/tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn)
+
+The code was modified following [Robust Hand Detection in Vehicles](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7899695) for hand detection.
+
+This project is a collaboration with my colleague Yunqiu Xu (https://github.com/YunqiuXu).
+
+# Preprocessing
+~/tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata$ python LISA_posGt_to_VOC_Annotations.py </br>
+~/tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata$ python LISA_posGt_to_VOC_main.py </br>
+
+# Train
+~/tf-faster-rcnn-endernewton$ ./experiments/scripts/train_faster_rcnn.sh 0 pascal_voc vgg16 </br>
+
+# Test
+Modify the iteration count (ITERS) in test_faster_rcnn.sh </br>
+~/tf-faster-rcnn-endernewton$ ./experiments/scripts/test_faster_rcnn.sh 0 pascal_voc vgg16 </br>
+
+# How to do prediction on your own dataset
+
+cd tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata/ImageSets/Main </br>
+mv test.txt test_for_train.txt </br>
+mv test5500.txt test.txt </br>
+
+cd tf-faster-rcnn-endernewton/data/LISA_HD_Static/detectiondata </br>
+mv JPEGImages JPEGImages_train </br>
+mv JPEGImages_test JPEGImages </br>
+
+Open tf-faster-rcnn-endernewton/experiments/scripts/test_faster_rcnn.sh </br>
+On line 21, set ITERS to the number of iterations of the model you trained. For example, if you trained a model for 10000 iterations, set this line to "ITERS=10000" (no spaces around "=" in a shell script). </br>
+
+cd tf-faster-rcnn-endernewton </br>
+./experiments/scripts/test_faster_rcnn.sh 0 pascal_voc vgg16 </br>
+
+# How to stop the training
+
+tmux attach </br>
+ctrl+c
+
+
+
+
+
diff --git a/checkpoint_params.py b/checkpoint_params.py
new file mode 100644
index 0000000..f94e076
--- /dev/null
+++ b/checkpoint_params.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sun Jun 25 17:00:50 2017
+
+@author: Shaoshen Wang
+"""
+#Used to show the variables stored in a checkpoint file
+#Usage: Put this code under tf-faster-rcnn-master
+
+import os
+import tensorflow as tf
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.tools.inspect_checkpoint import print_tensors_in_checkpoint_file
+
+def get_variables_in_checkpoint_file(file_name):
+    """Return the {variable name: shape} map of checkpoint *file_name*; {} (after printing the error) if it cannot be read."""
+    try:
+        return pywrap_tensorflow.NewCheckpointReader(file_name).get_variable_to_shape_map()
+    except Exception as e:  # best effort: report the problem and hand back an empty, iterable map
+        print(str(e))
+        return {}
+
+
+# Build the path portably: the old literal ".\data\imagenet_weights" only worked
+# on Windows and relied on the invalid escape sequences "\d" and "\i".
+model_dir = os.path.join(".", "data", "imagenet_weights")
+checkpoint_path = os.path.join(model_dir, "vgg16.ckpt")
+
+var_to_shape_map = get_variables_in_checkpoint_file(checkpoint_path)
+# Tolerate a missing map: nothing is printed when the checkpoint could not be read.
+for var in var_to_shape_map or {}:
+    print(var, var_to_shape_map[var])
+
+
+# List ALL tensors example output: v0/Adam (DT_FLOAT) [3,3,1,80]
+#print_tensors_in_checkpoint_file(file_name=checkpoint_path, tensor_name='',all_tensors='')
diff --git a/network.py b/network.py
new file mode 100644
index 0000000..8552146
--- /dev/null
+++ b/network.py
@@ -0,0 +1,403 @@
+# --------------------------------------------------------
+# Tensorflow Faster R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xinlei Chen
+# Modified by Shaoshen Wang
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+from tensorflow.contrib.slim import losses
+from tensorflow.contrib.slim import arg_scope
+
+import numpy as np
+
+from layer_utils.snippets import generate_anchors_pre
+from layer_utils.proposal_layer import proposal_layer
+from layer_utils.proposal_top_layer import proposal_top_layer
+from layer_utils.anchor_target_layer import anchor_target_layer
+from layer_utils.proposal_target_layer import proposal_target_layer
+
+from model.config import cfg
+
+class Network(object):
+  def __init__(self, batch_size=1):
+    # Stride of the head feature map w.r.t. the input image:
+    # input width / conv5 width = 224 / 56 = 4 in the modified case (was 16).
+    stride = 4
+    self._feat_stride = [stride, ]
+    # Reciprocal of the stride above (was 1. / 16.).
+    self._feat_compress = [1. / stride, ]
+
+    self._batch_size = batch_size
+
+    # Dictionaries used as registries while the graph is assembled.
+    self._predictions, self._losses = {}, {}
+    self._anchor_targets, self._proposal_targets = {}, {}
+    self._layers = {}
+    self._score_summaries, self._event_summaries = {}, {}
+
+    # Lists of tensors / variables that get histogram summaries.
+    self._act_summaries, self._train_summaries = [], []
+
+  def _add_image_summary(self, image, boxes):  # image summary with the given boxes drawn on the image
+    # add back mean
+    image += cfg.PIXEL_MEANS
+    # bgr to rgb (opencv uses bgr)
+    channels = tf.unstack (image, axis=-1)
+    image    = tf.stack ([channels[2], channels[1], channels[0]], axis=-1)
+    # dims for normalization
+    width  = tf.to_float(tf.shape(image)[2])
+    height = tf.to_float(tf.shape(image)[1])
+    # from [x1, y1, x2, y2, cls] to normalized [y1, x1, y2, x2]
+    cols = tf.unstack(boxes, axis=1)
+    boxes = tf.stack([cols[1] / height,
+                      cols[0] / width,
+                      cols[3] / height,
+                      cols[2] / width], axis=1)
+    # add batch dimension (assume batch_size==1)
+    assert image.get_shape()[0] == 1
+    boxes = tf.expand_dims(boxes, dim=0)
+    image = tf.image.draw_bounding_boxes(image, boxes)
+    # the result is the tf.summary.image op tagged 'ground_truth'
+    return tf.summary.image('ground_truth', image)
+
+  def _add_act_summary(self, tensor):  # register two summaries for one activation tensor
+    tf.summary.histogram('ACT/' + tensor.op.name + '/activations', tensor)  # value histogram
+    tf.summary.scalar('ACT/' + tensor.op.name + '/zero_fraction',
+                      tf.nn.zero_fraction(tensor))  # fraction of entries that are zero
+
+  def _add_score_summary(self, key, tensor):  # histogram summary named after the tensor's op and `key`
+    tf.summary.histogram('SCORE/' + tensor.op.name + '/' + key + '/scores', tensor)
+
+  def _add_train_summary(self, var):  # histogram summary of one variable under the 'TRAIN/' prefix
+    tf.summary.histogram('TRAIN/' + var.op.name, var)
+
+  def _reshape_layer(self, bottom, num_dim, name):  # regroup the last (channel) axis of `bottom` into num_dim channels
+    input_shape = tf.shape(bottom)
+    with tf.variable_scope(name) as scope:
+      # change the channel to the caffe format, i.e. move axis 3 to position 1
+      to_caffe = tf.transpose(bottom, [0, 3, 1, 2])
+      # then force it to have num_dim channels; the -1 axis absorbs the remaining elements
+      reshaped = tf.reshape(to_caffe,
+                            tf.concat(axis=0, values=[[self._batch_size], [num_dim, -1], [input_shape[2]]]))
+      # then swap the channel back to the last axis
+      to_tf = tf.transpose(reshaped, [0, 2, 3, 1])
+      return to_tf
+
+  def _softmax_layer(self, bottom, name):  # softmax over the last axis of `bottom`
+    if name == 'rpn_cls_prob_reshape':  # special case: flatten to 2-D first, restore the shape afterwards
+      input_shape = tf.shape(bottom)
+      bottom_reshaped = tf.reshape(bottom, [-1, input_shape[-1]])
+      reshaped_score = tf.nn.softmax(bottom_reshaped, name=name)
+      return tf.reshape(reshaped_score, input_shape)
+    return tf.nn.softmax(bottom, name=name)  # every other name: softmax applied directly
+
+  def _proposal_top_layer(self, rpn_cls_prob, rpn_bbox_pred, name):  # returns (rois, rpn_scores)
+    with tf.variable_scope(name) as scope:
+      # run the imported Python function proposal_top_layer inside the graph via tf.py_func
+      rois, rpn_scores = tf.py_func(proposal_top_layer,
+                                    [rpn_cls_prob, rpn_bbox_pred, self._im_info,
+                                     self._feat_stride, self._anchors, self._num_anchors],
+                                    [tf.float32, tf.float32])
+      rois.set_shape([cfg.TEST.RPN_TOP_N, 5])  # declare static shapes: RPN_TOP_N rows of 5 values
+      rpn_scores.set_shape([cfg.TEST.RPN_TOP_N, 1])
+
+    return rois, rpn_scores
+
+  def _proposal_layer(self, rpn_cls_prob, rpn_bbox_pred, name):  # returns (rois, rpn_scores)
+    with tf.variable_scope(name) as scope:
+      # run the imported Python function proposal_layer via tf.py_func; unlike the top-N variant it also gets self._mode
+      rois, rpn_scores = tf.py_func(proposal_layer,
+                                    [rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode,
+                                     self._feat_stride, self._anchors, self._num_anchors],
+                                    [tf.float32, tf.float32])
+      rois.set_shape([None, 5])  # number of rois is left dynamic; 5 values per roi
+      rpn_scores.set_shape([None, 1])
+
+    return rois, rpn_scores
+
+  # RoI-pool `bootom` over `rois`. Only use it if you have roi_pooling op written in tf.image
+  def _roi_pool_layer(self, bootom, rois, name):
+    with tf.variable_scope(name) as scope:
+      # Scale by the instance's _feat_compress; the old literal 1/16 ignored the stride change to 4.
+      return tf.image.roi_pooling(bootom, rois,
+                                  pooled_height=cfg.POOLING_SIZE, pooled_width=cfg.POOLING_SIZE,
+                                  spatial_scale=self._feat_compress[0])[0]
+
+  def _crop_pool_layer(self, bottom, rois, name):  # crop each roi out of `bottom`, resize it, then max-pool
+    with tf.variable_scope(name) as scope:
+      batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1])  # column 0 of rois
+      # Get the normalized coordinates of bboxes
+      bottom_shape = tf.shape(bottom)
+      height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0])  # (feature rows - 1) * stride
+      width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0])  # (feature cols - 1) * stride
+      x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width  # columns 1..4 of rois are x1, y1, x2, y2
+      y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height
+      x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width
+      y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height
+      # Won't be backpropagated to rois anyway, but to save time
+      bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))
+      pre_pool_size = cfg.POOLING_SIZE * 2  # crops are taken at twice the pooling size
+      crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size], name="crops")
+
+    return slim.max_pool2d(crops, [2, 2], padding='SAME')
+
+  def _dropout_layer(self, bottom, name, ratio=0.5):  # NOTE(review): `ratio` is tf.nn.dropout's 2nd positional arg (keep_prob in TF 1.x) -- confirm
+    return tf.nn.dropout(bottom, ratio, name=name)
+
+  def _anchor_target_layer(self, rpn_cls_score, name):  # fills self._anchor_targets, returns the int32 rpn_labels
+    with tf.variable_scope(name) as scope:
+      rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func(
+        anchor_target_layer,  # imported Python function, executed through tf.py_func
+        [rpn_cls_score, self._gt_boxes, self._im_info, self._feat_stride, self._anchors, self._num_anchors],
+        [tf.float32, tf.float32, tf.float32, tf.float32])
+
+      rpn_labels.set_shape([1, 1, None, None])  # declare the static shapes of the py_func outputs
+      rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4])
+      rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors * 4])
+      rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors * 4])
+
+      rpn_labels = tf.to_int32(rpn_labels, name="to_int32")  # py_func returned the labels as float32
+      self._anchor_targets['rpn_labels'] = rpn_labels
+      self._anchor_targets['rpn_bbox_targets'] = rpn_bbox_targets
+      self._anchor_targets['rpn_bbox_inside_weights'] = rpn_bbox_inside_weights
+      self._anchor_targets['rpn_bbox_outside_weights'] = rpn_bbox_outside_weights
+
+      self._score_summaries.update(self._anchor_targets)  # every target also gets a score summary
+
+    return rpn_labels
+
+  def _proposal_target_layer(self, rois, roi_scores, name):  # fills self._proposal_targets, returns (rois, roi_scores)
+    with tf.variable_scope(name) as scope:
+      rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func(
+        proposal_target_layer,  # imported Python function, executed through tf.py_func
+        [rois, roi_scores, self._gt_boxes, self._num_classes],
+        [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32])
+
+      rois.set_shape([cfg.TRAIN.BATCH_SIZE, 5])  # static shapes: one row per sampled roi
+      roi_scores.set_shape([cfg.TRAIN.BATCH_SIZE])
+      labels.set_shape([cfg.TRAIN.BATCH_SIZE, 1])
+      bbox_targets.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])  # 4 values per class
+      bbox_inside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])
+      bbox_outside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])
+
+      self._proposal_targets['rois'] = rois
+      self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32")  # labels arrive as float32
+      self._proposal_targets['bbox_targets'] = bbox_targets
+      self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights
+      self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights
+
+      self._score_summaries.update(self._proposal_targets)  # every target also gets a score summary
+
+      return rois, roi_scores
+
+  def _anchor_component(self):  # sets self._anchors and self._anchor_length
+    with tf.variable_scope('ANCHOR_' + self._tag) as scope:
+      # just to get the shape right
+      # grid size = ceil(im_info[0, 0] / stride) x ceil(im_info[0, 1] / stride)
+      height = tf.to_int32(tf.ceil(self._im_info[0, 0] / np.float32(self._feat_stride[0])))
+      width = tf.to_int32(tf.ceil(self._im_info[0, 1] / np.float32(self._feat_stride[0])))
+
+      anchors, anchor_length = tf.py_func(generate_anchors_pre,
+                                          [height, width,
+                                           self._feat_stride, self._anchor_scales, self._anchor_ratios],
+                                          [tf.float32, tf.int32], name="generate_anchors")
+      anchors.set_shape([None, 4])  # 4 values per anchor, anchor count left dynamic
+      # anchor_length is declared as a scalar
+      anchor_length.set_shape([])
+      self._anchors = anchors
+      self._anchor_length = anchor_length
+
+  def build_network(self, sess, is_training=True):  # to be overridden; create_architecture unpacks (rois, cls_prob, bbox_pred) from it
+    raise NotImplementedError
+
+  def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights, sigma=1.0, dim=[1]):
+    sigma_2 = sigma ** 2  # smooth-L1 loss: quadratic below 1/sigma^2, linear above
+    box_diff = bbox_pred - bbox_targets
+    in_box_diff = bbox_inside_weights * box_diff  # inside weights scale the raw difference
+    abs_in_box_diff = tf.abs(in_box_diff)
+    smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))  # 1.0 where |diff| < 1/sigma^2, else 0.0
+    in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
+                  + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
+    out_loss_box = bbox_outside_weights * in_loss_box  # outside weights scale the per-element loss
+    loss_box = tf.reduce_mean(tf.reduce_sum(  # sum over the axes in `dim`, then mean over what remains
+      out_loss_box,
+      axis=dim
+    ))
+    return loss_box
+
+  def _add_losses(self, sigma_rpn=3.0):  # builds the four loss terms, stores them in self._losses, returns their sum
+    with tf.variable_scope('loss_' + self._tag) as scope:
+      # RPN, class loss
+      rpn_cls_score = tf.reshape(self._predictions['rpn_cls_score_reshape'], [-1, 2])
+      rpn_label = tf.reshape(self._anchor_targets['rpn_labels'], [-1])
+      rpn_select = tf.where(tf.not_equal(rpn_label, -1))  # entries labelled -1 are left out of the loss
+      rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_select), [-1, 2])
+      rpn_label = tf.reshape(tf.gather(rpn_label, rpn_select), [-1])
+      rpn_cross_entropy = tf.reduce_mean(
+        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label))
+
+      # RPN, bbox loss
+      rpn_bbox_pred = self._predictions['rpn_bbox_pred']
+      rpn_bbox_targets = self._anchor_targets['rpn_bbox_targets']
+      rpn_bbox_inside_weights = self._anchor_targets['rpn_bbox_inside_weights']
+      rpn_bbox_outside_weights = self._anchor_targets['rpn_bbox_outside_weights']
+
+      rpn_loss_box = self._smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, rpn_bbox_inside_weights,
+                                          rpn_bbox_outside_weights, sigma=sigma_rpn, dim=[1, 2, 3])
+
+      # RCNN, class loss
+      cls_score = self._predictions["cls_score"]
+      label = tf.reshape(self._proposal_targets["labels"], [-1])
+
+      cross_entropy = tf.reduce_mean(
+        tf.nn.sparse_softmax_cross_entropy_with_logits(
+          logits=tf.reshape(cls_score, [-1, self._num_classes]), labels=label))
+
+      # RCNN, bbox loss
+      bbox_pred = self._predictions['bbox_pred']
+      bbox_targets = self._proposal_targets['bbox_targets']
+      bbox_inside_weights = self._proposal_targets['bbox_inside_weights']
+      bbox_outside_weights = self._proposal_targets['bbox_outside_weights']
+
+      loss_box = self._smooth_l1_loss(bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights)  # default sigma=1.0, dim=[1]
+
+      self._losses['cross_entropy'] = cross_entropy
+      self._losses['loss_box'] = loss_box
+      self._losses['rpn_cross_entropy'] = rpn_cross_entropy
+      self._losses['rpn_loss_box'] = rpn_loss_box
+
+      loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box  # unweighted sum of the four terms
+      self._losses['total_loss'] = loss
+
+      self._event_summaries.update(self._losses)  # every loss also becomes a scalar summary
+
+    return loss
+
+  def create_architecture(self, sess, mode, num_classes, tag=None,
+                          anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
+    self._image = tf.placeholder(tf.float32, shape=[self._batch_size, None, None, 3])  # input placeholders
+    self._im_info = tf.placeholder(tf.float32, shape=[self._batch_size, 3])
+    self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5])
+    self._tag = tag
+
+    self._num_classes = num_classes
+    self._mode = mode
+    self._anchor_scales = anchor_scales
+    self._num_scales = len(anchor_scales)
+
+    self._anchor_ratios = anchor_ratios
+    self._num_ratios = len(anchor_ratios)
+
+    self._num_anchors = self._num_scales * self._num_ratios  # one anchor per (scale, ratio) pair
+
+    training = mode == 'TRAIN'
+    testing = mode == 'TEST'
+
+    assert tag != None  # tag is required: it is concatenated into variable-scope names
+
+    # handle most of the regularizers here
+    weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)
+    if cfg.TRAIN.BIAS_DECAY:
+      biases_regularizer = weights_regularizer
+    else:
+      biases_regularizer = tf.no_regularizer
+
+    # list as many types of layers as possible, even if they are not used now
+    with arg_scope([slim.conv2d, slim.conv2d_in_plane, \
+                    slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected], 
+                    weights_regularizer=weights_regularizer,
+                    biases_regularizer=biases_regularizer, 
+                    biases_initializer=tf.constant_initializer(0.0)): 
+      rois, cls_prob, bbox_pred = self.build_network(sess, training)
+
+    layers_to_output = {'rois': rois}  # returned dict: rois + predictions (+ losses when not testing)
+    layers_to_output.update(self._predictions)
+
+    for var in tf.trainable_variables():
+      self._train_summaries.append(var)
+
+    if mode == 'TEST':
+      stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (self._num_classes))  # repeated once per class
+      means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (self._num_classes))
+      self._predictions["bbox_pred"] *= stds  # rescale and shift the predictions with the configured stds / means
+      self._predictions["bbox_pred"] += means
+    else:
+      self._add_losses()
+      layers_to_output.update(self._losses)
+
+    val_summaries = []  # summaries merged into self._summary_op_val below
+    with tf.device("/cpu:0"):
+      val_summaries.append(self._add_image_summary(self._image, self._gt_boxes))
+      for key, var in self._event_summaries.items():
+        val_summaries.append(tf.summary.scalar(key, var))
+      for key, var in self._score_summaries.items():
+        self._add_score_summary(key, var)
+      for var in self._act_summaries:
+        self._add_act_summary(var)
+      for var in self._train_summaries:
+        self._add_train_summary(var)
+
+    self._summary_op = tf.summary.merge_all()
+    if not testing:  # self._summary_op_val is only created outside TEST mode
+      self._summary_op_val = tf.summary.merge(val_summaries)
+
+    return layers_to_output
+
+  # Extract the head feature maps, for example for vgg16 it is conv5_3
+  # only useful during testing mode
+  def extract_head(self, sess, image):  # evaluate self._layers["head"] for `image`
+    feed_dict = {self._image: image}  # only the image placeholder is fed
+    feat = sess.run(self._layers["head"], feed_dict=feed_dict)
+    return feat
+
+  # only useful during testing mode
+  def test_image(self, sess, image, im_info):  # returns (cls_score, cls_prob, bbox_pred, rois)
+    feed_dict = {self._image: image,
+                 self._im_info: im_info}  # the gt_boxes placeholder is not fed here
+    cls_score, cls_prob, bbox_pred, rois = sess.run([self._predictions["cls_score"],
+                                                     self._predictions['cls_prob'],
+                                                     self._predictions['bbox_pred'],
+                                                     self._predictions['rois']],
+                                                    feed_dict=feed_dict)
+    return cls_score, cls_prob, bbox_pred, rois
+
+  def get_summary(self, sess, blobs):  # evaluate self._summary_op_val on one batch
+    feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'],
+                 self._gt_boxes: blobs['gt_boxes']}  # blobs must provide 'data', 'im_info' and 'gt_boxes'
+    summary = sess.run(self._summary_op_val, feed_dict=feed_dict)
+
+    return summary
+
+  def train_step(self, sess, blobs, train_op):  # run train_op once and return the five loss values
+    feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'],
+                 self._gt_boxes: blobs['gt_boxes']}  # blobs must provide 'data', 'im_info' and 'gt_boxes'
+    rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, _ = sess.run([self._losses["rpn_cross_entropy"],
+                                                                        self._losses['rpn_loss_box'],
+                                                                        self._losses['cross_entropy'],
+                                                                        self._losses['loss_box'],
+                                                                        self._losses['total_loss'],
+                                                                        train_op],
+                                                                       feed_dict=feed_dict)
+    return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss  # the train_op result itself is discarded
+
+  def train_step_with_summary(self, sess, blobs, train_op):  # like train_step, but also evaluates self._summary_op
+    feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'],
+                 self._gt_boxes: blobs['gt_boxes']}  # blobs must provide 'data', 'im_info' and 'gt_boxes'
+    rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary, _ = sess.run([self._losses["rpn_cross_entropy"],
+                                                                                 self._losses['rpn_loss_box'],
+                                                                                 self._losses['cross_entropy'],
+                                                                                 self._losses['loss_box'],
+                                                                                 self._losses['total_loss'],
+                                                                                 self._summary_op,
+                                                                                 train_op],
+                                                                                feed_dict=feed_dict)
+    return rpn_loss_cls, rpn_loss_box, loss_cls, loss_box, loss, summary  # five losses plus the summary result
+
+  def train_step_no_return(self, sess, blobs, train_op):  # run train_op once without fetching any loss
+    feed_dict = {self._image: blobs['data'], self._im_info: blobs['im_info'],
+                 self._gt_boxes: blobs['gt_boxes']}  # blobs must provide 'data', 'im_info' and 'gt_boxes'
+    sess.run([train_op], feed_dict=feed_dict)
+
diff --git a/pascal_voc.py b/pascal_voc.py
new file mode 100644
index 0000000..be6c81f
--- /dev/null
+++ b/pascal_voc.py
@@ -0,0 +1,317 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Xinlei Chen
+# Modified by Shaoshen Wang
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from datasets.imdb import imdb
+import datasets.ds_utils as ds_utils
+import xml.etree.ElementTree as ET
+import numpy as np
+import scipy.sparse
+import scipy.io as sio
+import utils.cython_bbox
+import pickle
+import subprocess
+import uuid
+from .voc_eval import voc_eval
+from model.config import cfg
+
+
+class pascal_voc(imdb):
+  def __init__(self, image_set, year, devkit_path=None):
+    imdb.__init__(self, 'voc_' + year + '_' + image_set)
+    self._year = year
+    self._image_set = image_set
+    if devkit_path is None:
+      devkit_path = self._get_default_path()
+    self._devkit_path = devkit_path
+
+    # [Hand Detection] Images and annotations are read from the LISA hand
+    # dataset under cfg.DATA_DIR rather than from <devkit>/VOC<year>.
+    self._hand_path = cfg.DATA_DIR
+    self._data_path = os.path.join(self._hand_path, 'LISA_HD_Static', 'detectiondata')
+
+    # [Hand Detection] Background plus the four hand classes; the names are
+    # looked up case-sensitively when the annotations are loaded.
+    self._classes = ('__background__',  # always index 0
+                     'leftHand_driver',
+                     'rightHand_driver',
+                     'leftHand_passenger',
+                     'rightHand_passenger')
+    self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
+    self._image_ext = '.png'
+    self._image_index = self._load_image_set_index()
+    self._roidb_handler = self.gt_roidb  # default roidb handler
+    self._salt = str(uuid.uuid4())
+    self._comp_id = 'comp4'
+
+    # PASCAL specific config options
+    self.config = dict(cleanup=True,
+                       use_salt=True,
+                       use_diff=False,
+                       matlab_eval=False,
+                       rpn_file=None)
+
+    # Both directories must already exist on disk.
+    assert os.path.exists(self._devkit_path), 'VOCdevkit path does not exist: {}'.format(self._devkit_path)
+    assert os.path.exists(self._data_path), 'Path does not exist: {}'.format(self._data_path)
+
+  def image_path_at(self, i):
+    """Return the path to image i in the image sequence."""
+    # Resolve position -> identifier first, then identifier -> path.
+    identifier = self._image_index[i]
+    return self.image_path_from_index(identifier)
+
+  def image_path_from_index(self, index):
+    """
+    Build <data_path>/JPEGImages/<index><image_ext> and check that it exists.
+    """
+    file_name = index + self._image_ext
+    image_path = os.path.join(self._data_path, 'JPEGImages', file_name)
+    # The assertion message carries the offending path.
+    assert os.path.exists(image_path), 'Path does not exist: {}'.format(image_path)
+    return image_path
+
+  def _load_image_set_index(self):
+    """
+    Read the image identifiers of this image set, one per line, from
+    <data_path>/ImageSets/Main/<image_set>.txt.
+    """
+    list_path = os.path.join(self._data_path, 'ImageSets', 'Main',
+                             self._image_set + '.txt')
+    assert os.path.exists(list_path), \
+      'Path does not exist: {}'.format(list_path)
+    with open(list_path) as listing:
+      # Strip the trailing newline (and any surrounding whitespace) per line.
+      identifiers = [entry.strip() for entry in listing]
+    return identifiers
+
+  def _get_default_path(self):
+    """Return the default devkit location: <cfg.DATA_DIR>/VOCdevkit<year>."""
+    # The year string is appended without a separator, e.g. 'VOCdevkit2007'.
+    devkit_dir = 'VOCdevkit' + self._year
+    return os.path.join(cfg.DATA_DIR, devkit_dir)
+
+  def gt_roidb(self):
+    """
+    Return the database of ground-truth regions of interest.
+
+    The result is cached as a pickle under self.cache_path and reused on later calls.
+    """
+    cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
+    if os.path.exists(cache_file):
+      with open(cache_file, 'rb') as fid:
+        try:
+          roidb = pickle.load(fid)
+        except Exception:
+          fid.seek(0)  # the failed attempt consumed part of the stream; restart
+          roidb = pickle.load(fid, encoding='bytes')  # retry for Python 2 pickles
+      print('{} gt roidb loaded from {}'.format(self.name, cache_file))
+      return roidb
+
+    gt_roidb = [self._load_pascal_annotation(index) for index in self.image_index]
+    with open(cache_file, 'wb') as fid:
+      pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
+    print('wrote gt roidb to {}'.format(cache_file))
+
+    return gt_roidb
+
+  def rpn_roidb(self):
+    # Ground truth is merged in for year 2007 and for every non-test image set.
+    use_gt = int(self._year) == 2007 or self._image_set != 'test'
+    if not use_gt:
+      return self._load_rpn_roidb(None)
+
+    gt_roidb = self.gt_roidb()
+    rpn_roidb = self._load_rpn_roidb(gt_roidb)
+    return imdb.merge_roidbs(gt_roidb, rpn_roidb)
+
+  def _load_rpn_roidb(self, gt_roidb):
+    # The box list is unpickled from the file named by config['rpn_file'].
+    rpn_path = self.config['rpn_file']
+    print('loading {}'.format(rpn_path))
+    assert os.path.exists(rpn_path), 'rpn data not found at: {}'.format(rpn_path)
+    with open(rpn_path, 'rb') as handle:
+      proposals = pickle.load(handle)
+    return self.create_roidb_from_box_list(proposals, gt_roidb)
+
+  def _load_pascal_annotation(self, index):
+    """
+    Build the roidb entry for one image from its PASCAL VOC style XML file
+    (<data_path>/Annotations/<index>.xml).
+
+    Objects flagged as difficult are dropped unless config['use_diff'] is set.
+    """
+    xml_path = os.path.join(self._data_path, 'Annotations', index + '.xml')
+    tree = ET.parse(xml_path)
+    objs = tree.findall('object')
+    if not self.config['use_diff']:
+      # Keep only the objects whose <difficult> flag is 0
+      objs = [obj for obj in objs
+              if int(obj.find('difficult').text) == 0]
+    num_objs = len(objs)
+
+    # One row per kept object.
+    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
+    gt_classes = np.zeros((num_objs), dtype=np.int32)
+    overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
+    # "Seg" area for pascal is just the box area
+    seg_areas = np.zeros((num_objs), dtype=np.float32)
+
+    # Fill one row per object.
+    corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
+    for ix, obj in enumerate(objs):
+      bbox = obj.find('bndbox')
+      # [Hand Detection] Coordinates are used as written in the XML; the
+      # usual "- 1" shift to 0-based pixel indexes is not applied here.
+      x1, y1, x2, y2 = [float(bbox.find(tag).text) for tag in corner_tags]
+      # [Hand Detection] No lower(): the annotation class names contain
+      # upper-case letters, so the lookup is case-sensitive.
+      class_name = obj.find('name').text.strip()
+      cls = self._class_to_ind[class_name]
+
+      boxes[ix, :] = [x1, y1, x2, y2]
+      gt_classes[ix] = cls
+      overlaps[ix, cls] = 1.0  # overlap 1.0 with the object's own class only
+      seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
+
+    # Convert the dense overlap table to a CSR sparse matrix.
+    overlaps = scipy.sparse.csr_matrix(overlaps)
+
+    roi_entry = {'boxes': boxes,
+                 'gt_classes': gt_classes,
+                 'gt_overlaps': overlaps,
+                 'flipped': False,
+                 'seg_areas': seg_areas}
+    return roi_entry
+
+  def _get_comp_id(self):
+    if not self.config['use_salt']:
+      return self._comp_id
+    return self._comp_id + '_' + self._salt  # salted form: <comp_id>_<salt>
+
+  def _get_voc_results_file_template(self):
+    # Layout: <devkit>/results/VOC<year>/Main/<comp_id>_det_<image_set>_{:s}.txt
+    # The '{:s}' placeholder is left in place for callers to fill with a class name.
+    pieces = [self._devkit_path,
+              'results',
+              'VOC' + self._year,
+              'Main',
+              self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt']
+    template = os.path.join(*pieces)
+    return template
+
+  def _write_voc_results_file(self, all_boxes):
+    # Writes one result file per non-background class; all_boxes[cls][img] holds
+    # that image's detections, with the score read from the last column.
+    for cls_ind, cls in enumerate(self.classes):
+      if cls == '__background__':
+        continue
+      print('Writing {} VOC results file'.format(cls))
+      filename = self._get_voc_results_file_template().format(cls)
+      with open(filename, 'wt') as f:
+        for im_ind, index in enumerate(self.image_index):
+          dets = all_boxes[cls_ind][im_ind]
+          if len(dets) == 0:  # works for [] and empty arrays; `dets == []` fails on numpy arrays
+            continue
+          # the VOCdevkit expects 1-based indices
+          for k in range(dets.shape[0]):
+            f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.format(
+              index, dets[k, -1], dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1))
+
+  def _do_python_eval(self, output_dir='output'):
+    """
+    Score the written result files per class with voc_eval, print per-class
+    and mean AP, and pickle each class's rec/prec/ap into output_dir.
+    """
+    # [Hand Detection] Ground truth lives under self._data_path (the same
+    # place the image index and roidb are loaded from), not <devkit>/VOC<year>.
+    annopath = os.path.join(self._data_path, 'Annotations', '{:s}.xml')
+    imagesetfile = os.path.join(self._data_path, 'ImageSets', 'Main',
+                                self._image_set + '.txt')
+    cachedir = os.path.join(self._devkit_path, 'annotations_cache')
+    aps = []
+    # The PASCAL VOC metric changed in 2010
+    use_07_metric = int(self._year) < 2010
+    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
+    if not os.path.isdir(output_dir):
+      os.mkdir(output_dir)
+    # One AP per foreground class, in self._classes order.
+    for cls in self._classes:
+      if cls == '__background__':
+        continue
+      filename = self._get_voc_results_file_template().format(cls)
+      rec, prec, ap = voc_eval(
+        filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
+        use_07_metric=use_07_metric)
+      aps.append(ap)
+      print(('AP for {} = {:.4f}'.format(cls, ap)))
+      with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f:
+        pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
+
+    print(('Mean AP = {:.4f}'.format(np.mean(aps))))
+    print('~~~~~~~~')
+    print('Results:')
+    for ap in aps:
+      print(('{:.3f}'.format(ap)))
+    print(('{:.3f}'.format(np.mean(aps))))
+    print('~~~~~~~~')
+    print('')
+    print('--------------------------------------------------------------')
+    print('Results computed with the **unofficial** Python eval code.')
+    print('Results should be very close to the official MATLAB eval code.')
+    print('Recompute with `./tools/reval.py --matlab ...` for your paper.')
+    print('-- Thanks, The Management')
+    print('--------------------------------------------------------------')
+
+  def _do_matlab_eval(self, output_dir='output'):
+    banner = '-----------------------------------------------------'
+    print(banner)
+    print('Computing results with the official MATLAB eval code.')
+    print(banner)
+    wrapper_dir = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', 'VOCdevkit-matlab-wrapper')
+    matlab_call = 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"'.format(
+      self._devkit_path, self._get_comp_id(), self._image_set, output_dir)
+    cmd = ('cd {} && '.format(wrapper_dir)
+           + '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB)
+           + '-r "dbstop if error; '
+           + matlab_call)
+    print(('Running:\n{}'.format(cmd)))
+    subprocess.call(cmd, shell=True)  # the exit status is not inspected
+
+  def evaluate_detections(self, all_boxes, output_dir):
+    # Write per-class result files, score them, then optionally delete them.
+    self._write_voc_results_file(all_boxes)
+    self._do_python_eval(output_dir)
+    if self.config['matlab_eval']:
+      self._do_matlab_eval(output_dir)
+    if not self.config['cleanup']:
+      return
+    result_template = self._get_voc_results_file_template()
+    for cls in (c for c in self._classes if c != '__background__'):
+      os.remove(result_template.format(cls))
+
+  def competition_mode(self, on):
+    # Entering competition mode disables both the filename salt and the
+    # cleanup of result files; leaving it re-enables both.
+    # `on` is only tested for truthiness.
+    enabled = not on
+    self.config['use_salt'] = enabled
+    self.config['cleanup'] = enabled
+
+
+if __name__ == '__main__':
+  # Manual smoke test: build the trainval roidb, then drop into IPython.
+  from datasets.pascal_voc import pascal_voc
+
+  d = pascal_voc('trainval', '2007')
+  res = d.roidb
+  from IPython import embed
+  embed()
diff --git a/pic/arch.png b/pic/arch.png
new file mode 100644
index 0000000..68c096b
Binary files /dev/null and b/pic/arch.png differ
diff --git a/vgg16.py b/vgg16.py
new file mode 100644
index 0000000..e3e7eb3
--- /dev/null
+++ b/vgg16.py
@@ -0,0 +1,367 @@
+# _______________________________________________________
+# Tensorflow Faster R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xinlei Chen
+# Modified by Yunqiu Xu
+# --------------------------------------------------------
+
+
+# A revision of VGG16 (tensorflow backend)
+# Input : 224 * 224 * 3
+# (after) conv1 : 224 * 224 * 64
+# maxpool : 112 * 112 * 64
+# conv2 : 112 * 112 * 128
+# maxpool : 56 * 56 * 128
+# conv3 : 56 * 56 * 256
+# maxpool : 28 * 28 * 256
+# conv4 : 28 * 28 * 512
+# maxpool : 14 * 14 * 512
+# conv5 : 14 * 14 * 512
+# maxpool : 7 * 7 * 512
+# fc6 : 4096
+# fc7 : 4096
+# ---------------------------------------------------------
+
+
+# Why Faster RCNN is faster : RPN
+#   RCNN: 
+#      get proposal --> get features(CNN) --> SVM --> bbox regression
+#   Fast RCNN: 
+#      send proposal and features to ROI pooling --> combine bbox and SVM together
+#   Faster RCNN:
+#      get features first --> get proposals from RPN --> send proposal and features to ROI pooling
+# ---------------------------------------------------------
+
+
+# ------------- To do 1: Multiple Scale Faster-RCNN ------
+# Combine both global and local features --> enhance hand detecting in an image
+# Collect features not only conv5, but also conv3 and conv4, then incorporate them
+# Implementation: 
+#   1. For conv3, conv4, conv5, each conv is only followed with ReLU, remove Max-pooling layer.
+#   2. Take their output as the input of 3 corresponding ROI pooling layers and normalization layers
+#   3. Concat and shrink normalization layers as input of fc layers
+#   4. roi pooling in fc layers: make prediction of class and position
+# --------------------------------------------------------
+# ------------- To do 2: Weight Normalization ------------
+# Features in shallower layers: larger-scaled values
+# Features in deeper layers: smaller-scaled values
+# To combine the features of 3 conv layers, we need to normalize them
+# Implementation:
+#   1. Put each feature into a normalization layer (see the equations)
+#   2. Each pixel xi is normalized, then multiplied by a scaling factor ri
+#   3. Learn ri by backpropagation in the training step (we need to build a loop here)
+#   4. After normalization, the features are concatenated
+# --------------------------------------------------------
+# ------------- To do 3 Add New Layer --------------------
+# 1. Each RPN needs a normalization layer
+# 2. Add two more ROI pooling layers in detector part
+# 3. Each ROI pooling layer needs a normalization layer
+# 4. After each concatenation(2 positions in total), we need a 1*1 conv layer
+# --------------------------------------------------------
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+from tensorflow.contrib.slim import losses
+from tensorflow.contrib.slim import arg_scope
+import numpy as np
+
+from nets.network import Network
+from model.config import cfg
+
+class vgg16(Network):
+  def __init__(self, batch_size=1):  # batch_size is forwarded unchanged to Network
+    Network.__init__(self, batch_size=batch_size)
+    self._arch = 'vgg16'  # architecture tag stored on the instance
+
+  # [Hand Detection] Batch normalization
+  # http://stackoverflow.com/a/34634291/2267819
+  # Note that this is different from the paper(they use another method)
+  def batch_norm_layer(self, to_be_normalized, is_training):  # normalizes with statistics taken over all axes but the last
+    if is_training:
+      train_phase = tf.constant(1)
+    else:
+      train_phase = tf.constant(-1)  # not > 0, so tf.cond below takes the moving-average branch
+    beta = tf.Variable(tf.constant(0.0, shape=[to_be_normalized.shape[-1]]), name='beta', trainable=True)  # offset, sized by the last axis
+    gamma = tf.Variable(tf.constant(1.0, shape=[to_be_normalized.shape[-1]]), name='gamma', trainable=True)  # scale, sized by the last axis
+    axises = np.arange(len(to_be_normalized.shape) - 1)  # every axis except the last one
+    batch_mean, batch_var = tf.nn.moments(to_be_normalized, axises, name='moments')
+    ema = tf.train.ExponentialMovingAverage(decay=0.5)
+
+    def mean_var_with_update():  # training branch: update the moving averages, then return the batch statistics
+        ema_apply_op = ema.apply([batch_mean, batch_var])
+        with tf.control_dependencies([ema_apply_op]):
+            return tf.identity(batch_mean), tf.identity(batch_var)
+
+    mean, var = tf.cond(train_phase > 0, mean_var_with_update, lambda: (ema.average(batch_mean), ema.average(batch_var))) # training: batch statistics plus EMA update; otherwise: the EMA values
+    normed = tf.nn.batch_normalization(to_be_normalized, mean, var, beta, gamma, 1e-3)  # 1e-3 is the variance epsilon
+    return normed
+
+
+  def build_network(self, sess, is_training=True):  # builds the graph; returns (rois, cls_prob, bbox_pred)
+    with tf.variable_scope('vgg_16', 'vgg_16'):
+      # select initializers
+      if cfg.TRAIN.TRUNCATED:
+        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
+        initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
+      else:
+        initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
+        initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001)
+
+      # [VGG16] conv1 (not trainable)
+      # input shape : 224 * 224 * 3
+      # output shape : 112 * 112 * 64
+      net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3],
+                        trainable=False, scope='conv1')
+      net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1')
+
+      # [VGG16] conv2 (not trainable)
+      # input shape : 112 * 112 * 64
+      # output shape : 56 * 56 * 128
+      net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3],
+                        trainable=False, scope='conv2')
+      net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2')
+
+
+      # [Hand Detection] REMOVED: net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3')
+      # [Hand Detection] conv3
+      # input shape : 56 * 56 * 128
+      # output shape : 56 * 56 * 256
+      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
+                        trainable=is_training, scope='conv3')
+      to_be_normalized_1 = net 
+      # [Hand Detection] conv4
+      # input shape : 56 * 56 * 256
+      # output shape : 56 * 56 * 256
+      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
+                        trainable=is_training, scope='conv4')
+      to_be_normalized_2 = net 
+      # [Hand Detection] conv5
+      # input shape : 56 * 56 * 256
+      # output shape : 56 * 56 * 256
+      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
+                        trainable=is_training, scope='conv5')
+      to_be_normalized_3 = net 
+
+# ------------- Take a break -----------------------------
+# We now have to_be_normalized_1 / to_be_normalized_2 / to_be_normalized_3, each 56 * 56 * 256
+# For RPN, we need to:
+# 1. normalize each to_be_normalized layer
+# 2. concatenate the 3 normalized layers
+# 3. change the dimension using a 1 * 1 conv
+# 4. Then the modified net can be used in RPN
+# 
+# For ROI pooling, we need to:
+# 1. put each conv output into its own ROI pooling (so there are 3 ROI pooling layers)
+# 2. normalize each layer
+# 3. concatenate them
+# 4. change the dimension using a 1 * 1 conv
+# --------------------------------------------------------
+
+      # ------------- Normalization for RPN --------------------
+      # old version 
+      # normed_1_rpn = tf.nn.l2_normalize(to_be_normalized_1, dim = [0, 1])
+      # normed_2_rpn = tf.nn.l2_normalize(to_be_normalized_2, dim = [0, 1])
+      # normed_3_rpn = tf.nn.l2_normalize(to_be_normalized_3, dim = [0, 1])
+      normed_1_rpn = self.batch_norm_layer(to_be_normalized_1, is_training)
+      normed_2_rpn = self.batch_norm_layer(to_be_normalized_2, is_training)
+      normed_3_rpn = self.batch_norm_layer(to_be_normalized_3, is_training)
+      
+      # ------------- Concatenation for RPN (56 * 56 * 768) ----
+      # old version
+      # concated_rpn = tf.concat([normed_1_rpn, normed_2_rpn, normed_3_rpn], 2)
+      # tensor layout: batch * length * width * channel
+      # concatenate along the channel (last) axis
+      concated_rpn = tf.concat([normed_1_rpn, normed_2_rpn, normed_3_rpn], -1)
+     
+      # ------------- 1 * 1 conv -------------------------------
+      scaled_rpn = slim.conv2d(concated_rpn, 512, [1, 1], trainable=is_training, weights_initializer=initializer, scope="scaled_rpn/1x1")
+      # Then we can get 56 * 56 * 512
+      
+      
+      # [Faster RCNN] summary and anchor
+      self._act_summaries.append(scaled_rpn)
+      self._layers['head'] = scaled_rpn
+      self._anchor_component()
+
+      # ------------- RPN Begin --------------------------------
+      
+      rpn = slim.conv2d(scaled_rpn, 512, [3, 3], trainable=is_training, weights_initializer=initializer, scope="rpn_conv/3x3")
+      self.show_variables("rpn",rpn.get_shape())
+
+      print("rpn",rpn.get_shape())
+      self._act_summaries.append(rpn)
+      rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
+                                  weights_initializer=initializer,
+                                  padding='VALID', activation_fn=None, scope='rpn_cls_score')
+      # [Hand Detection] change it so that the score has 2 as its channel size
+      rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
+      rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
+      rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
+      rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
+                                  weights_initializer=initializer,
+                                  padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
+      print("rpn_cls_score",rpn_cls_score.get_shape())
+      if is_training:
+        print("Compute rois,roi_scores")
+        print("training:rpn_cls_score",rpn_cls_score.get_shape())
+        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
+    
+        print("Compute rpn_labels")
+        self.show_variables("rpn_cls_score",rpn_cls_score.get_shape())
+
+        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
+        # Try to have a deterministic order for the computing graph, for reproducibility
+        with tf.control_dependencies([rpn_labels]):
+          rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
+      else:
+        if cfg.TEST.MODE == 'nms':
+          rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
+        elif cfg.TEST.MODE == 'top':
+          rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
+        else:
+          raise NotImplementedError
+      # ------------- RPN End ----------------------------------
+
+      print("vgg16_rois",str(rois.shape))
+
+      # ------------- ROI Pooling Begin ------------------------
+      if cfg.POOLING_MODE == 'crop':
+        # get roi layers
+        roi1 = self._crop_pool_layer(to_be_normalized_1, rois, "roi1") # 28 * 28 * 256
+        #print("vgg16_roi1",str(roi1.shape))
+        roi2 = self._crop_pool_layer(to_be_normalized_2, rois, "roi2") # 28 * 28 * 256
+        roi3 = self._crop_pool_layer(to_be_normalized_3, rois, "roi3") # 28 * 28 * 256
+        # normalization
+        normed_1_roi = self.batch_norm_layer(roi1, is_training)
+        normed_2_roi = self.batch_norm_layer(roi2, is_training)
+        normed_3_roi = self.batch_norm_layer(roi3, is_training)
+        # concat
+        concated_roi = tf.concat([normed_1_roi, normed_2_roi, normed_3_roi], -1) # 28 * 28 * 768
+       
+        #concated_roi = tf.slice(concated_roi,[0,0,0,0],[channel1,-1,-1,-1])#train 256 testing 300
+        #print("concated_roi",concated_roi.get_shape())
+        
+      # scale
+        #with tf.variable_scope("rois") as scope:
+        #  out = rois.shape[0]
+
+        pool5 = slim.conv2d(concated_roi,512, [1, 1], trainable=is_training, weights_initializer=initializer, scope="pool5/1x1") # 28 * 28 * 512
+
+        #print("pool5",pool5.get_shape())
+        #pool5 = tf.reshape(pool5,[-1,])
+        #pool5 = tf.slice(pool5,[0,0,0,0],[self._anchor_length,-1,-1,-1])
+
+      else:
+        raise NotImplementedError
+      # old version
+      # if cfg.POOLING_MODE == 'crop':
+      #  roi_pool_1 = self._crop_pool_layer(to_be_normalized_1, rois, "roi_pool_1")
+      #  roi_pool_2 = self._crop_pool_layer(to_be_normalized_2, rois, "roi_pool_2")
+      #  roi_pool_3 = self._crop_pool_layer(to_be_normalized_3, rois, "roi_pool_3")
+
+      #  roi_pool_1_normalized = tf.nn.l2_normalize(roi_pool_1, dim = [0, 1])
+      #  roi_pool_2_normalized = tf.nn.l2_normalize(roi_pool_2, dim = [0, 1])
+      #  roi_pool_3_normalized = tf.nn.l2_normalize(roi_pool_3, dim = [0, 1])
+      #  pool5 = tf.concat([roi_pool_1_normalized, roi_pool_1_normalized, roi_pool_1_normalized], 2)
+      # ------------- ROI Pooling End --------------------------
+
+
+      # [VGG16] flatten
+      pool5_flat = slim.flatten(pool5, scope='flatten')
+      # [VGG16] dense 4096 + dropout (dropout is added only when training)
+      fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6')
+      if is_training:
+        fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6')
+      # [VGG16] dense 4096 + dropout (dropout is added only when training)
+      fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
+      if is_training:
+        fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7')
+      
+      # [Faster RCNN] get cls_score(class) and bbox_predict(position)
+      cls_score = slim.fully_connected(fc7, self._num_classes, 
+                                       weights_initializer=initializer,
+                                       trainable=is_training,
+                                       activation_fn=None, scope='cls_score')
+      cls_prob = self._softmax_layer(cls_score, "cls_prob")
+      bbox_pred = slim.fully_connected(fc7, self._num_classes * 4, 
+                                       weights_initializer=initializer_bbox,
+                                       trainable=is_training,
+                                       activation_fn=None, scope='bbox_pred')
+      #print("cls_score",cls_score.get_shape())
+      #if not is_training and len(rois.shape)==2:
+      #  bbox_pred = tf.slice(bbox_pred,[0,0,0,0],[rois.shape[0],-1,-1,-1])
+
+      print("vgg16_bbox_pred",str(bbox_pred.shape))
+      print("vgg16_rois",str(rois.shape))  
+   
+      self._predictions["rpn_cls_score"] = rpn_cls_score
+      self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
+      self._predictions["rpn_cls_prob"] = rpn_cls_prob
+      self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
+      self._predictions["cls_score"] = cls_score
+      self._predictions["cls_prob"] = cls_prob
+      self._predictions["bbox_pred"] = bbox_pred
+      self._predictions["rois"] = rois # not original rois
+
+      self._score_summaries.update(self._predictions)
+
+      return rois, cls_prob, bbox_pred
+
+
+  def get_variables_to_restore(self, variables, var_keep_dic):
+    """
+    Pick the variables to load from the pretrained checkpoint.
+
+    fc6/fc7 weights and the first conv kernel are recorded in
+    self._variables_to_fix instead; the conv3-conv5 weights and biases are
+    skipped entirely; the rest are restored when present in var_keep_dic.
+    """
+    # exclude the conv weights that are fc weights in vgg16, and
+    # exclude the first conv layer to swap RGB to BGR
+    to_fix = ('vgg_16/fc6/weights:0',
+              'vgg_16/fc7/weights:0',
+              'vgg_16/conv1/conv1_1/weights:0')
+    # [Hand Detection] weights and biases of conv3_x, conv4_x and conv5_x
+    var_modified = set(
+      'vgg_16/conv{0}/conv{0}_{1}/{2}:0'.format(block, layer, kind)
+      for kind in ('weights', 'biases')
+      for block in (3, 4, 5)
+      for layer in (1, 2, 3))
+
+    variables_to_restore = []
+    for v in variables:
+      if v.name in to_fix:
+        self._variables_to_fix[v.name] = v
+      elif v.name in var_modified:
+        pass  # [Hand Detection] skipped, not restored
+      elif v.name.split(':')[0] in var_keep_dic:
+        print('Varibles restored: %s' % v.name)
+        variables_to_restore.append(v)
+
+    return variables_to_restore
+
+  def fix_variables(self, sess, pretrained_model):
+    print('Fix VGG16 layers..')
+    with tf.variable_scope('Fix_VGG16') as scope:
+      with tf.device("/cpu:0"):
+        # The fc6/fc7 conv-to-fc weight conversion is commented out below (disabled);
+        # only the first conv kernel is restored and fixed from RGB to BGR.
+        #fc6_conv = tf.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False)
+        #fc7_conv = tf.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False)
+        conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False)
+        restorer_fc = tf.train.Saver({ "vgg_16/conv1/conv1_1/weights": conv1_rgb})
+        restorer_fc.restore(sess, pretrained_model)
+
+        #sess.run(tf.assign(self._variables_to_fix['vgg_16/fc6/weights:0'], tf.reshape(fc6_conv, 
+        #                    self._variables_to_fix['vgg_16/fc6/weights:0'].get_shape())))
+        #sess.run(tf.assign(self._variables_to_fix['vgg_16/fc7/weights:0'], tf.reshape(fc7_conv, 
+        #                    self._variables_to_fix['vgg_16/fc7/weights:0'].get_shape())))
+        sess.run(tf.assign(self._variables_to_fix['vgg_16/conv1/conv1_1/weights:0'],tf.reverse(conv1_rgb, [2])))  # reverse axis 2 (size 3) of the restored kernel
+  
+  def show_variables(self,var_name,var):  # debug helper: print a label followed by a value
+    print(var_name,var)
diff --git a/vgg16_modified1.py b/vgg16_modified1.py
deleted file mode 100644
index c9d5a06..0000000
--- a/vgg16_modified1.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# --------------------------------------------------------
-# Tensorflow Faster R-CNN
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xinlei Chen
-# --------------------------------------------------------
-
-# --------------------------------------------------------
-# [Modified by Yunqiu Xu]
-# Ref:
-#   https://leonardoaraujosantos.gitbooks.io/artificial-inteligence/content/object_localization_and_detection.html
-#   http://blog.csdn.net/shenxiaolu1984/article/details/51152614
-#   http://closure11.com/rcnn-fast-rcnn-faster-rcnn%E7%9A%84%E4%B8%80%E4%BA%9B%E4%BA%8B/
-#   http://blog.csdn.net/lanran2/article/details/60143861
-
-# Why Faster RCNN is faster : RPN
-#   RCNN: 
-#      get proposal --> get features(CNN) --> SVM --> bbox regression
-#   Fast RCNN: 
-#      send proposal and features to ROI pooling --> combine bbox and SVM together
-#   Faster RCNN:
-#      get features first --> get proposals from RPN --> send proposal and features to ROI pooling
-# ---------------------------------------------------------
-
-# ------------- To do 1: Multiple Scale Faster-RCNN ------
-# Combine both global and local features --> enhance hand detecting in an image
-# Collect features not only conv5, but also conv3 and conv4, then incorporate them
-# Implementation: 
-#   1. For conv3, conv4, conv5, each conv is only followed with ReLU, remove Max-pooling layer.
-#   2. Take their output as the input of 3 corresponding ROI pooling layers and normalization layers
-#   3. Concat and shrink normalization layers as input of fc layers
-#   4. roi pooling in fc layers: make prediction of class and position
-# --------------------------------------------------------
-# ------------- To do 2: Weight Normalization ------------
-# Features in shallower layers: larger-scaled values
-# Features in deeper layers: smaller-scaled values
-# To combine the features of 3 conv layers, we need to normalize them
-# Implementation:
-#   1. Put each feature into normalization layer(see the equations)
-#   2. Each pixel xi is normalized, then multiply scaling factor ri
-#   3. Use backpropagation to get ri in training step, we need to build loop here
-#   4. After normalization, the features will be concated
-# --------------------------------------------------------
-# ------------- To do 3 Add New Layer --------------------
-# 1. Each RPN needs a normalization layer
-# 2. Add two more ROI pooling layers in detector part
-# 3. Each ROI pooling layer needs a normalization layer
-# 4. After each concatenation(2 positions in total), we need a 1*1 conv layer
-# --------------------------------------------------------
-
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-import tensorflow.contrib.slim as slim
-from tensorflow.contrib.slim import losses
-from tensorflow.contrib.slim import arg_scope
-import numpy as np
-
-from nets.network import Network
-from model.config import cfg
-
-class vgg16(Network):
-  def __init__(self, batch_size=1):
-    Network.__init__(self, batch_size=batch_size)
-    self._arch = 'vgg16'
-
-  def build_network(self, sess, is_training=True):
-    with tf.variable_scope('vgg_16', 'vgg_16'):
-      # select initializers
-      if cfg.TRAIN.TRUNCATED:
-        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
-        initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
-      else:
-        initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
-        initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001)
-
-      # [VGG16] conv1
-      # input shape : 224 * 224 * 3
-      # conv 64 * 3 * 3
-      # conv 64 * 3 * 3
-      # maxpool 2 * 2
-      net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3],
-                        trainable=False, scope='conv1')
-      net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1')
-
-      # [VGG16] conv2
-      # conv 128 * 3 * 3
-      # conv 128 * 3 * 3
-      # maxpool 2 * 2
-      net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3],
-                        trainable=False, scope='conv2')
-      net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2')
-
-
-      # [VGG16] conv3
-      # conv 256 * 3 * 3
-      # conv 256 * 3 * 3
-      # conv 256 * 3 * 3
-      # maxpool 2 * 2
-      net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
-                        trainable=is_training, scope='conv3')
-      to_be_normalized_1 = net
-      # [Hand Detection] REMOVE net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') 
-
-      # [VGG16] conv4
-      # conv 512 * 3 * 3
-      # conv 512 * 3 * 3
-      # conv 512 * 3 * 3
-      # maxpool 2 * 2
-      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
-                        trainable=is_training, scope='conv4')
-      to_be_normalized_2 = net
-      # [Hand Detection] REMOVE net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4') 
-
-      # [VGG16] conv5
-      # conv 512 * 3 * 3
-      # conv 512 * 3 * 3
-      # conv 512 * 3 * 3
-      # maxpool 2 * 2
-      net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
-                        trainable=is_training, scope='conv5')
-      to_be_normalized_3 = net
-      
-      # [Hand detection]
-      # Use the result of conv3, conv4 and conv5
-      # normalize and concat them, then use 1*1 conv, then use RPN
-      # use the result of 3 layers and RPN in training loop
-
-      # // self._act_summaries.append(net)
-      # // self._layers['head'] = net
-      # // self._anchor_component() # Yunqiu Xu: generate anchors?
-
-# ------------- RPN Begin --------------------------------
-## %%%%%%%%%% RPN Begin %%%%%%%%%% ##
-      # [Faster RCNN] RPN: put features into RPN layer --> get proposals
-      # input features(or anchors?), output rois(proposals)
-      # [Hand Detection] Normalize , concat, then use 1*1 conv, finally the data will be treated as the input here
-      rpn = slim.conv2d(net, 512, [3, 3], trainable=is_training, weights_initializer=initializer, scope="rpn_conv/3x3")
-      self._act_summaries.append(rpn)
-      rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
-                                  weights_initializer=initializer,
-                                  padding='VALID', activation_fn=None, scope='rpn_cls_score')
-      # [Hand Detection] change it so that the score has 2 as its channel size
-      rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
-      rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
-      rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
-      rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
-                                  weights_initializer=initializer,
-                                  padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
-      if is_training:
-        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
-        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
-        # Try to have a determinestic order for the computing graph, for reproducibility
-        with tf.control_dependencies([rpn_labels]):
-          rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
-      else:
-        if cfg.TEST.MODE == 'nms':
-          rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
-        elif cfg.TEST.MODE == 'top':
-          rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
-        else:
-          raise NotImplementedError
-# ------------- RPN End ----------------------------------
-
-# ------------- ROI Pooling Begin ------------------------
-      # [Faster RCNN] build roi pooling layer(here is same with RCNN)
-      # [Hand Detection] add another 2 roi pooling layer
-      # Input: proposals(rois) from RPN and features from CNN 
-      if cfg.POOLING_MODE == 'crop':
-        pool5 = self._crop_pool_layer(net, rois, "pool5")
-      else:
-        raise NotImplementedError
-# ------------- ROI Pooling End --------------------------
-
-      # [Hand Detection] Then we use 3 normalize layers
-      # [Hand Detection] Then we concat them
-      # [Hand Detection] Then we use 1*1 conv to return the channel size
-
-
-      # [VGG16] flatten
-      pool5_flat = slim.flatten(pool5, scope='flatten')
-      # [VGG16] dense 4096 + dropout
-      fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6')
-      if is_training:
-        fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6')
-      # [VGG16] dense 4096 + dropout
-      fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
-      if is_training:
-        fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7')
-      
-      # [Faster RCNN] get cls_score(class) and bbox_predict(position)
-      cls_score = slim.fully_connected(fc7, self._num_classes, 
-                                       weights_initializer=initializer,
-                                       trainable=is_training,
-                                       activation_fn=None, scope='cls_score')
-      cls_prob = self._softmax_layer(cls_score, "cls_prob")
-      bbox_pred = slim.fully_connected(fc7, self._num_classes * 4, 
-                                       weights_initializer=initializer_bbox,
-                                       trainable=is_training,
-                                       activation_fn=None, scope='bbox_pred')
-
-      self._predictions["rpn_cls_score"] = rpn_cls_score
-      self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
-      self._predictions["rpn_cls_prob"] = rpn_cls_prob
-      self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
-      self._predictions["cls_score"] = cls_score
-      self._predictions["cls_prob"] = cls_prob
-      self._predictions["bbox_pred"] = bbox_pred
-      self._predictions["rois"] = rois
-
-      self._score_summaries.update(self._predictions)
-
-      return rois, cls_prob, bbox_pred
diff --git a/voc_eval.py b/voc_eval.py
new file mode 100644
index 0000000..bd51d65
--- /dev/null
+++ b/voc_eval.py
@@ -0,0 +1,282 @@
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import xml.etree.ElementTree as ET
+import os
+import pickle
+import numpy as np
+
+
+# Each detection line 'imagename score xmin ymin xmax ymax', e.g. '2L_0000080_X_7_4_3 0.294 34.6 202.3 97.7 264.7\n', is rewritten as
+# '[imagename x y w h score left/right driver/passenger number_hands_on_wheel];'
+
+def transform(line,hand,person):
+  """Reformat a detection line 'name score xmin ymin xmax ymax' as the
+  record '[name x y w h score hand person -1];' plus a newline."""
+  parts = line.strip().split()
+  imagename, score = parts[0], parts[1]
+  # Corner coordinates are truncated to ints before the size is derived.
+  left, top = int(float(parts[2])), int(float(parts[3]))
+  width = int(float(parts[4])) - left
+  height = int(float(parts[5])) - top
+  hands_on_wheel = "-1"  # placeholder: not implemented
+  fields = [imagename, str(left), str(top), str(width), str(height),
+            score, hand, person, hands_on_wheel]
+  return "[" + " ".join(fields) + "];\n"
+
+
+
+def parse_rec(filename):
+  """Read a PASCAL VOC annotation XML and return its objects.
+
+  Each object becomes a dict with 'name', 'pose', integer 'truncated' and
+  'difficult' flags, and 'bbox' as [xmin, ymin, xmax, ymax] integers.
+  """
+  corner_tags = ('xmin', 'ymin', 'xmax', 'ymax')
+  records = []
+  for node in ET.parse(filename).findall('object'):
+    box = node.find('bndbox')
+    records.append({
+      'name': node.find('name').text,
+      'pose': node.find('pose').text,
+      'truncated': int(node.find('truncated').text),
+      'difficult': int(node.find('difficult').text),
+      'bbox': [int(box.find(tag).text) for tag in corner_tags],
+    })
+  return records
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+  """Compute VOC average precision from recall/precision values.
+
+  rec, prec: recall and precision values (expected to be numpy arrays).
+  use_07_metric: when true, average the best precision at the 11 recall
+      thresholds 0.0, 0.1, ..., 1.0 (VOC07); otherwise integrate the
+      monotone precision envelope over recall. Defaults to False.
+  Returns the average precision as a number.
+  """
+  if not use_07_metric:
+    # Pad both curves with sentinels so the envelope and the recall
+    # deltas are defined at the ends.
+    padded_rec = np.concatenate(([0.], rec, [1.]))
+    padded_prec = np.concatenate(([0.], prec, [0.]))
+
+    # Precision envelope: each entry becomes the maximum of itself and
+    # everything to its right.
+    for k in reversed(range(1, padded_prec.size)):
+      padded_prec[k - 1] = np.maximum(padded_prec[k - 1], padded_prec[k])
+
+    # Sum precision times recall increment at every recall change point.
+    steps = np.where(padded_rec[1:] != padded_rec[:-1])[0]
+    return np.sum((padded_rec[steps + 1] - padded_rec[steps]) * padded_prec[steps + 1])
+
+  # VOC07 11-point interpolation.
+  ap = 0.
+  for t in np.arange(0., 1.1, 0.1):
+    reached = rec >= t
+    # Thresholds that recall never reaches contribute zero precision.
+    best = np.max(prec[reached]) if np.sum(reached) != 0 else 0
+    ap = ap + best / 11.
+  return ap
+
+
+def voc_eval(detpath,
+             annopath,
+             imagesetfile,
+             classname,
+             cachedir,
+             ovthresh=0.5,
+             use_07_metric=False):
+  """Convert a class's detection file to the hand-detection result format.
+
+  [Hand detection] This is the PASCAL VOC evaluation entry point, but the
+  ground-truth matching and AP computation are currently disabled (they are
+  kept below as inert string blocks). The function only rewrites the
+  detection file into "./result/<classname>.txt" and returns zeros.
+
+  detpath: Path to detections
+      detpath.format(classname) should produce the detection results file.
+  annopath: Path to annotations (unused while evaluation is disabled)
+      annopath.format(imagename) should be the xml annotations file.
+  imagesetfile: Text file containing the list of images, one image per line.
+      It is still opened and read, so it must exist.
+  classname: Category name; a "left" prefix selects hand "left" (otherwise
+      "right") and a "driver" suffix selects person "driver" (otherwise
+      "passenger") in the written records.
+  cachedir: Directory for caching the annotations (currently unused)
+  [ovthresh]: Overlap threshold (default = 0.5; currently unused)
+  [use_07_metric]: Whether to use VOC07's 11 point AP computation
+      (default False; currently unused)
+
+  Returns (rec, prec, ap), all 0 while evaluation is disabled. Errors from
+  open() propagate if an input is missing or the output is unwritable.
+  """
+
+  # first load gt
+  '''
+  if not os.path.isdir(cachedir):
+    os.mkdir(cachedir)
+  cachefile = os.path.join(cachedir, 'annots.pkl')
+  '''
+  # read list of images (only consumed by the disabled blocks below)
+  with open(imagesetfile, 'r') as f:
+    lines = f.readlines()
+  imagenames = [x.strip() for x in lines]
+  '''
+  if not os.path.isfile(cachefile):
+    # load annots
+    recs = {}
+    for i, imagename in enumerate(imagenames):
+      recs[imagename] = parse_rec(annopath.format(imagename))
+      if i % 100 == 0:
+        print('Reading annotation for {:d}/{:d}'.format(
+          i + 1, len(imagenames)))
+    # save
+    print('Saving cached annotations to {:s}'.format(cachefile))
+    with open(cachefile, 'w') as f:
+      pickle.dump(recs, f)
+  else:
+    # load
+    with open(cachefile, 'rb') as f:
+      try:
+        recs = pickle.load(f)
+      except:
+        recs = pickle.load(f, encoding='bytes')
+
+  # extract gt objects for this class
+  class_recs = {}
+  npos = 0
+  for imagename in imagenames:
+    R = [obj for obj in recs[imagename] if obj['name'] == classname]
+    bbox = np.array([x['bbox'] for x in R])
+    difficult = np.array([x['difficult'] for x in R]).astype(np.bool)
+    det = [False] * len(R)
+    npos = npos + sum(~difficult)
+    class_recs[imagename] = {'bbox': bbox,
+                             'difficult': difficult,
+                             'det': det}
+  '''
+  # read dets
+  detfile = detpath.format(classname)
+  print("detfile",detfile,classname)
+
+  with open(detfile, 'r') as f:
+    lines = f.readlines()
+
+  #[Hand detection] [imagename x y w h score left/right driver/passenger number_hands_on_wheel];
+  # The class name encodes both labels: a "left..." prefix and a
+  # "...driver" suffix; anything else falls back to right / passenger.
+  hand = "left" if classname.startswith("left") else "right"
+  person = "driver" if classname.endswith("driver") else "passenger"
+
+  # Convert every line before touching the output so a malformed detection
+  # line cannot leave a freshly truncated, empty result file behind.
+  newlines = [transform(line,hand,person) for line in lines]
+
+  file_path = "./result/"+classname+".txt"
+  result_dir = os.path.dirname(file_path)
+  # Create the output directory on first use instead of failing in open().
+  if not os.path.isdir(result_dir):
+    os.makedirs(result_dir)
+
+  # The context manager closes the file even if the write fails.
+  with open(file_path,"w") as f:
+    f.write("".join(newlines))
+  #[/hand detection]
+
+  # ---- Disabled evaluation: the string block below is never executed. ----
+  # It relies on `npos` and `class_recs` from the disabled ground-truth
+  # block above, so both blocks have to be re-enabled together.
+
+  '''
+  splitlines = [x.strip().split(' ') for x in lines]
+  image_ids = [x[0] for x in splitlines]
+  confidence = np.array([float(x[1]) for x in splitlines])
+
+  print("confidence",confidence)
+  BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+
+  nd = len(image_ids)
+  print("voc_eval_nd",nd)
+
+  tp = np.zeros(nd)
+  fp = np.zeros(nd)
+
+  if BB.shape[0] > 0:
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    sorted_scores = np.sort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+    #print("BB",BB)
+
+    # go down dets and mark TPs and FPs
+
+    #print("nd",nd)
+    print("image_ids",image_ids)
+
+    for d in range(nd):
+      R = class_recs[image_ids[d]]
+      bb = BB[d, :].astype(float)
+
+      print("d",d,"bb",bb)
+
+      ovmax = -np.inf
+      BBGT = R['bbox'].astype(float)
+      print("BBGT",BBGT[:,])
+
+      if BBGT.size > 0:
+        # compute overlaps
+        # intersection
+        ixmin = np.maximum(BBGT[:, 0], bb[0])
+        iymin = np.maximum(BBGT[:, 1], bb[1])
+        ixmax = np.minimum(BBGT[:, 2], bb[2])
+        iymax = np.minimum(BBGT[:, 3], bb[3])
+        iw = np.maximum(ixmax - ixmin + 1., 0.)
+        ih = np.maximum(iymax - iymin + 1., 0.)
+        inters = iw * ih
+
+        # union
+        uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
+               (BBGT[:, 2] - BBGT[:, 0] + 1.) *
+               (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+
+        overlaps = inters / uni
+        print("overlaps",overlaps)
+
+        ovmax = np.max(overlaps)
+        jmax = np.argmax(overlaps)
+        print("ovmax",ovmax)
+
+      if ovmax > ovthresh:
+        if not R['difficult'][jmax]:
+          if not R['det'][jmax]:
+            tp[d] = 1.
+            R['det'][jmax] = 1
+          else:
+            fp[d] = 1.
+      else:
+        fp[d] = 1.
+      print("fp",fp)
+      print("tp",tp)
+  # compute precision recall
+  fp = np.cumsum(fp)
+  tp = np.cumsum(tp)
+  rec = tp / float(npos)
+  # avoid divide by zero in case the first detection matches a difficult
+  # ground truth
+  prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+  ap = voc_ap(rec, prec, use_07_metric)
+  '''
+  # Placeholder metrics: nothing is measured while the evaluation block
+  # above stays disabled.
+  rec, prec, ap = 0, 0, 0
+  return rec, prec, ap
+
+