andrewhou1
diff --git a/‎eval.py
+11-6 b/‎eval.py
+11-6
diff --git a/‎model.py
+22-32 b/‎model.py
+22-32
diff --git a/‎preprocessing.py
+98-9 b/‎preprocessing.py
+98-9
diff --git a/‎test/00001_test.png
2.18 KB b/‎test/00001_test.png
2.18 KB
diff --git a/‎test/00001_test_single_layer_2_epochs_1000ppi.png
79.8 KB b/‎test/00001_test_single_layer_2_epochs_1000ppi.png
79.8 KB
diff --git a/‎test/images/img_00001.png
1.76 MB b/‎test/images/img_00001.png
1.76 MB
diff --git a/‎test/labels/lab_00001.png
78.8 KB b/‎test/labels/lab_00001.png
78.8 KB
@@ -26,10 +26,14 @@ def test_model(sess, model, images, labels, patch_size, output_dir=None, categor
         image = image_to_np_array(image_f, float_cols=True)
         labels = labels_to_np_array(label_f)
         h, w, _ = image.shape
+        image = image[:h//2, :w//2, :]
+        h, w, _ = image.shape
+        labels = labels[:h, :w]
         predicted_labels = np.zeros([h, w], dtype=np.uint8)
         pixels_correct = 0
         error_for_image = 0
         i = 0
+
         for y in range(patch_size, h - patch_size):
             # # for debug, only do first 10K
             # if i > 1e4:
@@ -39,12 +43,12 @@ def test_model(sess, model, images, labels, patch_size, output_dir=None, categor
                 i += 1
                 input_image = get_patch(image, (y, x), patch_size)
                 input_image = np.append(input_image,
-                                        np.zeros(shape=[patch_size, patch_size, 1], dtype=np.float32),
+                                        np.zeros(shape=[patch_size, patch_size, model.num_classes], dtype=np.float32),
                                         axis=2)
                 input_label = labels[y, x]
-                feed_dict = {model.inpt: [input_image], model.output: [[input_label]]}
+                feed_dict = {model.inpt: [input_image], model.output: input_label}
 
-                error, logits = sess.run([model.error, model.logits], feed_dict=feed_dict)
+                error, logits = sess.run([model.errors[1], model.logits[1]], feed_dict=feed_dict)
                 error_for_image += error
                 output_label = np.argmax(logits)
                 if output_label == input_label:
@@ -54,8 +58,8 @@ def test_model(sess, model, images, labels, patch_size, output_dir=None, categor
                 if i % 1000 == 0:
                     print "%d/%d pixels done..." % (i, (h - 2 * patch_size) * (w - 2 * patch_size))
 
-        print "Tested on image %s: Accuracy is %.2f%%, error per pixel is %f." % (
-            image_f, (100.0 * pixels_correct) / i, error_for_image / i)
+        # print "Tested on image %s: Accuracy is %.2f%%, error per pixel is %f." % (
+        #     image_f, (100.0 * pixels_correct) / i, error_for_image / i)
         if output_dir is not None:
             if category_colors is None:
                 raise ValueError("Color index not provided, can't output images.")
@@ -74,6 +78,7 @@ def main():
     parser.add_argument('--labels', type=str, nargs='+', help='Filename of test labels')
     parser.add_argument('--output_dir', type=str, default=None,
                         help='Directory to store model output. By default no output is generated.')
+    parser.add_argument('--patch_size', type=int, default=67, help='Size of input patches')
     args = parser.parse_args()
 
     # load class labels
@@ -85,7 +90,7 @@ def main():
     sess = tf.Session()
     restore_model(sess, args.model)
 
-    test_model(sess, model, args.images, args.labels, patch_size=23, output_dir=args.output_dir,
+    test_model(sess, model, args.images, args.labels, patch_size=args.patch_size, output_dir=args.output_dir,
                category_colors=category_colors)
 
 
 
@@ -16,14 +16,12 @@ def __init__(self, hidden_size_1, hidden_size_2, batch_size, num_classes, learni
         self.num_layers = num_layers
 
         # Set up placeholders for input and output
-        print "params:", batch_size, hidden_size_1, hidden_size_2, self.num_classes
-        self.inpt = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, None, 3+self.num_classes])
-        print "**** input", self.inpt.get_shape()
-        self.output = tf.placeholder(tf.int32, [1, 1])
+        self.inpt = tf.placeholder(dtype=tf.float32, shape=[batch_size, None, None, 3 + self.num_classes])
+        self.output = tf.placeholder(tf.int32, [batch_size, None, None])
 
         # Set up variable weights for model. These are shared across recurrent layers
 
-        W_conv1 = tf.Variable(tf.truncated_normal([8, 8, 3+self.num_classes, self.hidden_size_1], stddev=0.1))
+        self.W_conv1 = tf.Variable(tf.truncated_normal([8, 8, 3 + self.num_classes, self.hidden_size_1], stddev=0.1))
         b_conv1 = tf.Variable(tf.constant(0.1, shape=[self.hidden_size_1]))
 
         W_conv2 = tf.Variable(tf.truncated_normal([8, 8, self.hidden_size_1, self.hidden_size_2], stddev=0.1))
@@ -35,45 +33,37 @@ def __init__(self, hidden_size_1, hidden_size_2, batch_size, num_classes, learni
         self.logits = []
         self.errors = []
         current_input = self.inpt
+        current_output = self.output
         for i in range(self.num_layers):
-            h_conv1 = tf.nn.conv2d(current_input, W_conv1, strides=[1, 1, 1, 1], padding='SAME') + b_conv1
-            h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
+            # scale output down by a stride of 2, to match convolution output
+            current_output = tf.strided_slice(current_output, [0, 0, 0], [0, 0, 0], strides=[1, 2, 2], end_mask=7)
 
+            # convolution steps
+            h_conv1 = tf.nn.conv2d(current_input, self.W_conv1, strides=[1, 1, 1, 1], padding='SAME') + b_conv1
+            h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
             tanh = tf.tanh(h_pool1)
-            print "**** tanh", tanh.get_shape()
-
             h_conv2 = tf.nn.conv2d(tanh, W_conv2, strides=[1, 1, 1, 1], padding='SAME') + b_conv2
-            print "&&&& h_conv2", h_conv2.get_shape()
-
             h_conv3 = tf.nn.conv2d(h_conv2, W_conv3, strides=[1, 1, 1, 1], padding='SAME') + b_conv3
-            print "&&&& h_conv3", h_conv3.get_shape()
-
-            # # figure out the frickin logits reshaping
-            # # h_conv3 shape is [batch_size x width x height x num_categories]
-            # conv3_shape = tf.shape(h_conv3)
-            # conv3_height = conv3_shape[1]
-            # conv3_width = conv3_shape[2]
-            #
-            # # TODO don't hardcode this slice
-            # center_pixel = tf.slice(h_conv3, begin=[0, conv3_height / 2, conv3_width / 2, 0],
-            #                         size=[1, 1, 1, self.num_classes])
-
             current_logits = h_conv3
-            logits_shape = tf.shape(current_logits)
-            center_logit = tf.slice(current_logits, begin=[0, logits_shape[1] / 2, logits_shape[2] / 2, 0],
-                                    size=[-1, 1, 1, -1])
-            center_logit = tf.reshape(center_logit, shape=[1, 1, num_classes])
-            current_error = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(center_logit, self.output))
+
+            # tensorflow 11 doesn't have multidimensional softmax, we need to get predictions manually :-(
+            # (predictions are what's passed to the next iteration/layer of the CNN)
+            exp_logits = tf.exp(current_logits)
+            predictions = exp_logits / tf.reduce_sum(exp_logits, reduction_indices=[3], keep_dims=True)
+
+            cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(current_logits, current_output)
+            error_for_all_pixel = tf.reduce_mean(cross_entropy, reduction_indices=[0])
+            error_for_image = tf.reduce_mean(error_for_all_pixel)
             self.logits.append(current_logits)
-            self.errors.append(current_error)
+            self.errors.append(error_for_image)
 
             # extracts RGB channels from input image. Only keeps every other pixel, since convolution scales down the
             #  output. The shape of this should have the same height and width and the logits.
             rgb = tf.strided_slice(current_input, [0, 0, 0, 0], [0, 0, 0, 3], strides=[1, 2, 2, 1], end_mask=7)
-            current_input = tf.concat(concat_dim=3, values=[rgb, current_logits])
-            print "Current Input Shape: ", current_input.get_shape()
+            current_input = tf.concat(concat_dim=3, values=[rgb, predictions])
 
-        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(tf.add_n(self.errors))
+        self.loss = tf.add_n(self.errors)
+        self.train_step = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
 
 
 def save_model(sess, path, saver=None):
 
@@ -6,6 +6,9 @@
 
 import numpy as np
 from PIL import Image
+import os
+
+from os.path import isfile
 
 
 def read_object_classes(classes_map_filename):
@@ -19,6 +22,7 @@ def read_object_classes(classes_map_filename):
         3. an array of ID -> category name
         2. a dictionary of category name -> ID
     """
+    # TODO handle different potential formats better
     format_description = "Each line should contain 5 elements: (float R, float G, float B, int ID, str Name)."
     ids = set()
     ids_to_cols = {}
@@ -30,9 +34,17 @@ def read_object_classes(classes_map_filename):
                 vals = line.split()
                 if len(vals) == 0:
                     continue
-                rgb = tuple([int(255 * float(s)) for s in vals[:3]])
-                category_num = int(vals[3])
-                category_name = vals[4]
+                elif len(vals) == 2:
+                    has_cols = False
+                    category_num = int(vals[0])
+                    category_name = vals[1]
+                elif len(vals) == 5:
+                    has_cols = True
+                    rgb = tuple([int(255 * float(s)) for s in vals[:3]])
+                    category_num = int(vals[3])
+                    category_name = vals[4]
+                else:
+                    raise ValueError("Category map must have either 2 or 5 columns")
 
                 # check for duplicate categories
                 if category_num in ids:
@@ -45,7 +57,8 @@ def read_object_classes(classes_map_filename):
                 ids.add(category_num)
                 ids_to_names[category_num] = category_name
                 names_to_ids[category_name] = category_num
-                ids_to_cols[category_num] = rgb
+                if has_cols:
+                    ids_to_cols[category_num] = rgb
 
             except (ValueError, IndexError) as e:
                 sys.stderr.write("%s %s\n" % (format_description, e))
@@ -56,7 +69,8 @@ def read_object_classes(classes_map_filename):
     category_names = [None] * (max_id + 1)
     for cat_id in ids:
         category_names[cat_id] = ids_to_names[cat_id]
-        category_colors[cat_id] = ids_to_cols[cat_id]
+        if has_cols:
+            category_colors[cat_id] = ids_to_cols[cat_id]
 
     return category_colors, category_names, names_to_ids
 
@@ -92,6 +106,13 @@ def labels_to_np_array(lab_filename):
     return data
 
 
+def text_labels_to_np_array(lab_filename):  # parses a whitespace-separated text label file; one array row per file line
+    label_file = open(lab_filename, 'r')  # NOTE(review): the handle is never closed explicitly in this function
+    # TODO right now we're just clamping negative ("unknown") labels to 0. Need a nicer way to do this in the long term
+    labels = [map(lambda n: max(0, int(n)), l.split()) for l in label_file.readlines()]
+    return np.array(labels, dtype=np.int8)  # NOTE(review): int8 cannot represent label IDs above 127 - confirm range
+
+
 def save_labels_array(labels, output_filename, colors):
     """
     Saves a numpy array of labels to an paletted image.
@@ -120,9 +141,77 @@ def get_patch(array, center, patch_size):
     """
     rounded_width = patch_size // 2
     return array[center[0] - rounded_width: center[0] + rounded_width + 1,
-                 center[1] - rounded_width: center[1] + rounded_width + 1]
-
-if __name__ == '__main__':
+           center[1] - rounded_width: center[1] + rounded_width + 1]
+
+
+def from_games_dataset(data_dir, train_fraction=None, num_train=None):  # generator yielding (image, labels) pairs
+    labels_dir = os.path.join(data_dir, 'labels')
+    images_dir = os.path.join(data_dir, 'images')
+
+    # TODO get only image files (currently every regular, non-hidden file in each directory is accepted)
+    labels = [os.path.join(labels_dir, f) for f in os.listdir(labels_dir) if
+              isfile(os.path.join(labels_dir, f)) and not f.startswith('.')]
+    labels = sorted(labels)
+    images = [os.path.join(images_dir, f) for f in os.listdir(images_dir) if
+              isfile(os.path.join(images_dir, f)) and not f.startswith('.')]
+    images = sorted(images)
+    train_files = zip(labels, images)  # pairs files by sorted position; zip stops at the shorter of the two lists
+
+    # if specified, only choose subset of training data; an explicit num_train takes precedence over train_fraction
+    if train_fraction is not None and num_train is None:
+        num_train = int(len(train_files) * train_fraction)  # len() here relies on Python 2's list-returning zip
+    if num_train is not None:
+        train_files = train_files[:num_train]
+
+    for label_f, image_f in train_files:
+        print "Current image:", os.path.basename(image_f)
+        if os.path.basename(label_f) != os.path.basename(image_f):
+            print "UNEQUAL IMAGE NAMES!"  # a mismatch is only reported; the pair is still loaded and yielded
+        image = image_to_np_array(image_f)
+        labels = labels_to_np_array(label_f)  # rebinds `labels`, which previously held the list of label paths
+        yield image, labels
+
+
+# TODO negative label nums could mess up paletted output
+def stanford_bgrounds_dataset(data_dir, train_fraction=None, num_train=None):  # generator yielding (image, labels) pairs
+    labels_dir = os.path.join(data_dir, 'labels')
+    images_dir = os.path.join(data_dir, 'images')
+
+    # TODO get only image files (labels are limited to '*.regions.txt'; images are any regular, non-hidden file)
+    labels = [os.path.join(labels_dir, f) for f in os.listdir(labels_dir) if
+              isfile(os.path.join(labels_dir, f)) and not f.startswith('.') and f.endswith('.regions.txt')]
+    labels = sorted(labels)
+    images = [os.path.join(images_dir, f) for f in os.listdir(images_dir) if
+              isfile(os.path.join(images_dir, f)) and not f.startswith('.')]
+    images = sorted(images)
+    train_files = zip(labels, images)  # pairs files by sorted position; zip stops at the shorter of the two lists
+
+    # if specified, only choose subset of training data; an explicit num_train takes precedence over train_fraction
+    if train_fraction is not None and num_train is None:
+        num_train = int(len(train_files) * train_fraction)  # len() here relies on Python 2's list-returning zip
+    if num_train is not None:
+        train_files = train_files[:num_train]
+
+    for label_f, image_f in train_files:
+        if os.path.basename(label_f).split('.')[0] != os.path.basename(image_f).split('.')[0]:  # compare name stems
+            print "UNEQUAL IMAGE NAMES!", label_f, image_f  # a mismatch is only reported; the pair is still yielded
+        image = image_to_np_array(image_f)
+        labels = text_labels_to_np_array(label_f)  # rebinds `labels`, which previously held the list of label paths
+        yield image, labels
+
+
+# list of datasets for which we have iterators (maps a dataset name to its generator function)
+FROM_GAMES = 'from-games'
+SIFT_FLOW = 'sift-flow'
+STANFORD_BGROUND = 'stanford-bground'
+DATASETS = {FROM_GAMES: from_games_dataset, SIFT_FLOW: None, STANFORD_BGROUND: stanford_bgrounds_dataset}  # SIFT_FLOW has no iterator yet (None)
+
+
+def main():  # script entry point; expects exactly three command-line arguments: colors_map, infile, outfile
     colors_map, infile, outfile = sys.argv[1:]
     labels = labels_to_np_array(infile)
-    save_labels_array()
+    save_labels_array(labels, output_filename=outfile, colors=colors_map)  # NOTE(review): colors_map is the raw argv string - confirm save_labels_array accepts that
+
+
+if __name__ == '__main__':
+    main()