Skip to content
This repository was archived by the owner on Sep 3, 2022. It is now read-only.

Commit 2137c84

Browse files
authored
Revert "sd package: all categorical columns are multi-label columns" (#357)
* Revert "Storage object deletion waits for consistency. (#354)" This reverts commit 3c7e44d. * Revert "sd package: all categorical columns are multi-label columns (#351)" This reverts commit f1522f7.
1 parent 57b0cc6 commit 2137c84

File tree

5 files changed

+31
-55
lines changed

5 files changed

+31
-55
lines changed

solutionbox/structured_data/mltoolbox/_structured_data/preprocess/cloud_preprocess.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,9 @@ def run_categorical_analysis(table, schema_list, args):
160160
"""Find vocab values for the categorical columns and writes a csv file.
161161
162162
The vocab files are in the form
163-
label1,count
164-
label2,count
165-
label3,count
163+
label1
164+
label2
165+
label3
166166
...
167167
168168
Args:
@@ -188,20 +188,18 @@ def run_categorical_analysis(table, schema_list, args):
188188
else:
189189
table_name = 'table_name'
190190

191-
# In non-legacy sql, the SPLIT function returns an array. To flatten the
192-
# array in BQ, you have to cross join UNNEST the table with itself
193191
sql = """
194-
WITH SPLIT_TABLE AS (
195-
SELECT SPLIT({name}, ' ') as split_col FROM {table}
196-
)
197-
SELECT word, COUNT(word) as word_count
198-
FROM SPLIT_TABLE
199-
CROSS JOIN UNNEST(SPLIT_TABLE.split_col) as word
200-
WHERE LENGTH(word) > 0
201-
GROUP BY word
202-
ORDER BY word_count DESC
192+
SELECT
193+
{name}
194+
FROM
195+
{table}
196+
WHERE
197+
{name} IS NOT NULL
198+
GROUP BY
199+
{name}
200+
ORDER BY
201+
{name}
203202
""".format(name=name, table=table_name)
204-
205203
out_file = os.path.join(args.output_dir,
206204
CATEGORICAL_ANALYSIS_FILE % name)
207205

solutionbox/structured_data/mltoolbox/_structured_data/preprocess/local_preprocess.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import argparse
2222
import collections
23-
import csv
2423
import json
2524
import os
2625
import six
@@ -93,27 +92,20 @@ def _init_numerical_results():
9392
'count': 0,
9493
'sum': 0.0}
9594
numerical_results = collections.defaultdict(_init_numerical_results)
96-
categorical_results = collections.defaultdict(
97-
lambda: collections.defaultdict(int))
95+
categorical_results = collections.defaultdict(set)
9896

9997
# for each file, update the numerical stats from that file, and update the set
10098
# of unique labels.
10199
for input_file in input_files:
102100
with file_io.FileIO(input_file, 'r') as f:
103-
for line in csv.reader(f):
104-
parsed_line = dict(zip(header, line))
101+
for line in f:
102+
parsed_line = dict(zip(header, line.strip().split(',')))
105103

106104
for col_schema in schema_list:
107105
col_name = col_schema['name']
108106
col_type = col_schema['type']
109107
if col_type.lower() == 'string':
110-
split_strings = parsed_line[col_name].split(' ')
111-
112-
for one_label in split_strings:
113-
# Filter out empty strings
114-
if one_label:
115-
# add the label to the dict and increase its count.
116-
categorical_results[col_name][one_label] += 1
108+
categorical_results[col_name].update([parsed_line[col_name]])
117109
else:
118110
# numerical column.
119111

@@ -145,16 +137,8 @@ def _init_numerical_results():
145137
json.dumps(numerical_results, indent=2, separators=(',', ': ')))
146138

147139
# Write the vocab files. Each label is on its own line.
148-
for name, label_count in six.iteritems(categorical_results):
149-
# Labels is now the string:
150-
# label1,count
151-
# label2,count
152-
# ...
153-
# where label1 is the most frequent label, and label2 is the 2nd most, etc.
154-
labels = '\n'.join(["%s,%d" % (label, count)
155-
for label, count in sorted(six.iteritems(label_count),
156-
key=lambda x: x[1],
157-
reverse=True)])
140+
for name, unique_labels in six.iteritems(categorical_results):
141+
labels = '\n'.join(list(unique_labels))
158142
file_io.write_string_to_file(
159143
os.path.join(args.output_dir, CATEGORICAL_ANALYSIS_FILE % name),
160144
labels)

solutionbox/structured_data/mltoolbox/_structured_data/trainer/util.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
# limitations under the License.
1414
# ==============================================================================
1515

16-
import csv
1716
import json
1817
import multiprocessing
1918
import os
@@ -538,11 +537,7 @@ def preprocess_input(features, target, train_config, preprocess_output_dir,
538537
if map_vocab:
539538
labels = train_config['vocab_stats'][name]['labels']
540539
table = tf.contrib.lookup.string_to_index_table_from_tensor(labels)
541-
542-
# TODO(brandondutra) yaqs/5113574563512320
543-
sparse = tf.squeeze(features[name])
544-
ss = tf.string_split(sparse, delimiter=' ')
545-
features[name] = table.lookup(ss)
540+
features[name] = table.lookup(features[name])
546541

547542
return features, target
548543

@@ -675,8 +670,9 @@ def get_vocabulary(preprocess_output_dir, name):
675670
raise ValueError('File %s not found in %s' %
676671
(CATEGORICAL_ANALYSIS % name, preprocess_output_dir))
677672

678-
with file_io.FileIO(vocab_file, 'r') as f:
679-
label_values = [x[0] for x in csv.reader(f) if x[0]]
673+
labels = python_portable_string(
674+
file_io.read_file_to_string(vocab_file)).split('\n')
675+
label_values = [x for x in labels if x] # remove empty lines
680676

681677
return label_values
682678

solutionbox/structured_data/test_mltoolbox/e2e_functions.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,15 @@ def make_csv_data(filename, num_rows, problem_type, keep_target=True):
4040

4141
str1 = random.choice(['red', 'blue', 'green', 'pink', 'yellow', 'brown', 'black'])
4242
str2 = random.choice(['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr'])
43-
str3_list = [
44-
random.choice(['car', 'truck', 'van', 'bike', 'train', 'drone'])
45-
for x in range(random.randint(1, 3))]
43+
str3 = random.choice(['car', 'truck', 'van', 'bike', 'train', 'drone'])
4644

4745
map1 = {'red': 2, 'blue': 6, 'green': 4, 'pink': -5, 'yellow': -6, 'brown': -1, 'black': 7}
4846
map2 = {'abc': 10, 'def': 1, 'ghi': 1, 'jkl': 1, 'mno': 1, 'pqr': 1}
4947
map3 = {'car': 5, 'truck': 10, 'van': 15, 'bike': 20, 'train': 25, 'drone': 30}
5048

5149
# Build some model.
5250
t = 0.5 + 0.5 * num1 - 2.5 * num2 + num3
53-
t += map1[str1] + map2[str2] + sum([map3[x] for x in str3_list])
51+
t += map1[str1] + map2[str2] + map3[str3]
5452

5553
if problem_type == 'classification':
5654
if t < 0:
@@ -69,7 +67,7 @@ def make_csv_data(filename, num_rows, problem_type, keep_target=True):
6967
num3=num3,
7068
str1=str1,
7169
str2=str2,
72-
str3=' '.join(str3_list))
70+
str3=str3)
7371
else:
7472
csv_line = "{id},{num1},{num2},{num3},{str1},{str2},{str3}\n".format(
7573
id=i,
@@ -78,7 +76,7 @@ def make_csv_data(filename, num_rows, problem_type, keep_target=True):
7876
num3=num3,
7977
str1=str1,
8078
str2=str2,
81-
str3=' '.join(str3_list))
79+
str3=str3)
8280
f1.write(csv_line)
8381

8482

solutionbox/structured_data/test_mltoolbox/test_sd_trainer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def testRegressionDnn(self):
200200
transforms=transforms,
201201
extra_args=extra_args)
202202

203-
self._check_training_screen_output(loss=2)
203+
self._check_training_screen_output(loss=20)
204204
self._check_train_files()
205205

206206
def testRegressionLinear(self):
@@ -219,7 +219,7 @@ def testRegressionLinear(self):
219219
model_type='linear',
220220
transforms=transforms)
221221

222-
self._check_training_screen_output(loss=100)
222+
self._check_training_screen_output(loss=20)
223223
self._check_train_files()
224224

225225
def testClassificationDnn(self):
@@ -240,7 +240,7 @@ def testClassificationDnn(self):
240240
transforms=transforms,
241241
extra_args=extra_args)
242242

243-
self._check_training_screen_output(accuracy=0.70, loss=0.1)
243+
self._check_training_screen_output(accuracy=0.70, loss=0.10)
244244
self._check_train_files()
245245

246246
def testClassificationLinear(self):
@@ -259,7 +259,7 @@ def testClassificationLinear(self):
259259
model_type='linear',
260260
transforms=transforms)
261261

262-
self._check_training_screen_output(accuracy=0.70, loss=0.1)
262+
self._check_training_screen_output(accuracy=0.70, loss=0.2)
263263
self._check_train_files()
264264

265265

0 commit comments

Comments (0)