Skip to content
This repository was archived by the owner on Sep 3, 2022. It is now read-only.

Commit 2137c84

Browse files
authored
Revert "sd package: all categorical columns are multi-label columns" (#357)
* Revert "Storage object deletion waits for consistency. (#354)" This reverts commit 3c7e44d. * Revert "sd package: all categorical columns are multi-label columns (#351)" This reverts commit f1522f7.
1 parent 57b0cc6 commit 2137c84

File tree

5 files changed

+31
-55
lines changed

5 files changed

+31
-55
lines changed

solutionbox/structured_data/mltoolbox/_structured_data/preprocess/cloud_preprocess.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -160,9 +160,9 @@ def run_categorical_analysis(table, schema_list, args):
160160
"""Find vocab values for the categorical columns and writes a csv file.
161161
162162
The vocab files are in the form
163-
label1,count
164-
label2,count
165-
label3,count
163+
label1
164+
label2
165+
label3
166166
...
167167
168168
Args:
@@ -188,20 +188,18 @@ def run_categorical_analysis(table, schema_list, args):
188188
else:
189189
table_name = 'table_name'
190190

191-
# In non-legacy sql, the SPLIT function returns an array. To flatten the
192-
# array in BQ, you have to cross join UNNEST the table with itself
193191
sql = """
194-
WITH SPLIT_TABLE AS (
195-
SELECT SPLIT({name}, ' ') as split_col FROM {table}
196-
)
197-
SELECT word, COUNT(word) as word_count
198-
FROM SPLIT_TABLE
199-
CROSS JOIN UNNEST(SPLIT_TABLE.split_col) as word
200-
WHERE LENGTH(word) > 0
201-
GROUP BY word
202-
ORDER BY word_count DESC
192+
SELECT
193+
{name}
194+
FROM
195+
{table}
196+
WHERE
197+
{name} IS NOT NULL
198+
GROUP BY
199+
{name}
200+
ORDER BY
201+
{name}
203202
""".format(name=name, table=table_name)
204-
205203
out_file = os.path.join(args.output_dir,
206204
CATEGORICAL_ANALYSIS_FILE % name)
207205

solutionbox/structured_data/mltoolbox/_structured_data/preprocess/local_preprocess.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import argparse
2222
import collections
23-
import csv
2423
import json
2524
import os
2625
import six
@@ -93,27 +92,20 @@ def _init_numerical_results():
9392
'count': 0,
9493
'sum': 0.0}
9594
numerical_results = collections.defaultdict(_init_numerical_results)
96-
categorical_results = collections.defaultdict(
97-
lambda: collections.defaultdict(int))
95+
categorical_results = collections.defaultdict(set)
9896

9997
# for each file, update the numerical stats from that file, and update the set
10098
# of unique labels.
10199
for input_file in input_files:
102100
with file_io.FileIO(input_file, 'r') as f:
103-
for line in csv.reader(f):
104-
parsed_line = dict(zip(header, line))
101+
for line in f:
102+
parsed_line = dict(zip(header, line.strip().split(',')))
105103

106104
for col_schema in schema_list:
107105
col_name = col_schema['name']
108106
col_type = col_schema['type']
109107
if col_type.lower() == 'string':
110-
split_strings = parsed_line[col_name].split(' ')
111-
112-
for one_label in split_strings:
113-
# Filter out empty strings
114-
if one_label:
115-
# add the label to the dict and increase its count.
116-
categorical_results[col_name][one_label] += 1
108+
categorical_results[col_name].update([parsed_line[col_name]])
117109
else:
118110
# numerical column.
119111

@@ -145,16 +137,8 @@ def _init_numerical_results():
145137
json.dumps(numerical_results, indent=2, separators=(',', ': ')))
146138

147139
# Write the vocab files. Each label is on its own line.
148-
for name, label_count in six.iteritems(categorical_results):
149-
# Labels is now the string:
150-
# label1,count
151-
# label2,count
152-
# ...
153-
# where label1 is the most frequent label, and label2 is the 2nd most, etc.
154-
labels = '\n'.join(["%s,%d" % (label, count)
155-
for label, count in sorted(six.iteritems(label_count),
156-
key=lambda x: x[1],
157-
reverse=True)])
140+
for name, unique_labels in six.iteritems(categorical_results):
141+
labels = '\n'.join(list(unique_labels))
158142
file_io.write_string_to_file(
159143
os.path.join(args.output_dir, CATEGORICAL_ANALYSIS_FILE % name),
160144
labels)

solutionbox/structured_data/mltoolbox/_structured_data/trainer/util.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
# limitations under the License.
1414
# ==============================================================================
1515

16-
import csv
1716
import json
1817
import multiprocessing
1918
import os
@@ -538,11 +537,7 @@ def preprocess_input(features, target, train_config, preprocess_output_dir,
538537
if map_vocab:
539538
labels = train_config['vocab_stats'][name]['labels']
540539
table = tf.contrib.lookup.string_to_index_table_from_tensor(labels)
541-
542-
# TODO(brandondutra) yaqs/5113574563512320
543-
sparse = tf.squeeze(features[name])
544-
ss = tf.string_split(sparse, delimiter=' ')
545-
features[name] = table.lookup(ss)
540+
features[name] = table.lookup(features[name])
546541

547542
return features, target
548543

@@ -675,8 +670,9 @@ def get_vocabulary(preprocess_output_dir, name):
675670
raise ValueError('File %s not found in %s' %
676671
(CATEGORICAL_ANALYSIS % name, preprocess_output_dir))
677672

678-
with file_io.FileIO(vocab_file, 'r') as f:
679-
label_values = [x[0] for x in csv.reader(f) if x[0]]
673+
labels = python_portable_string(
674+
file_io.read_file_to_string(vocab_file)).split('\n')
675+
label_values = [x for x in labels if x] # remove empty lines
680676

681677
return label_values
682678

solutionbox/structured_data/test_mltoolbox/e2e_functions.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,15 @@ def make_csv_data(filename, num_rows, problem_type, keep_target=True):
4040

4141
str1 = random.choice(['red', 'blue', 'green', 'pink', 'yellow', 'brown', 'black'])
4242
str2 = random.choice(['abc', 'def', 'ghi', 'jkl', 'mno', 'pqr'])
43-
str3_list = [
44-
random.choice(['car', 'truck', 'van', 'bike', 'train', 'drone'])
45-
for x in range(random.randint(1, 3))]
43+
str3 = random.choice(['car', 'truck', 'van', 'bike', 'train', 'drone'])
4644

4745
map1 = {'red': 2, 'blue': 6, 'green': 4, 'pink': -5, 'yellow': -6, 'brown': -1, 'black': 7}
4846
map2 = {'abc': 10, 'def': 1, 'ghi': 1, 'jkl': 1, 'mno': 1, 'pqr': 1}
4947
map3 = {'car': 5, 'truck': 10, 'van': 15, 'bike': 20, 'train': 25, 'drone': 30}
5048

5149
# Build some model.
5250
t = 0.5 + 0.5 * num1 - 2.5 * num2 + num3
53-
t += map1[str1] + map2[str2] + sum([map3[x] for x in str3_list])
51+
t += map1[str1] + map2[str2] + map3[str3]
5452

5553
if problem_type == 'classification':
5654
if t < 0:
@@ -69,7 +67,7 @@ def make_csv_data(filename, num_rows, problem_type, keep_target=True):
6967
num3=num3,
7068
str1=str1,
7169
str2=str2,
72-
str3=' '.join(str3_list))
70+
str3=str3)
7371
else:
7472
csv_line = "{id},{num1},{num2},{num3},{str1},{str2},{str3}\n".format(
7573
id=i,
@@ -78,7 +76,7 @@ def make_csv_data(filename, num_rows, problem_type, keep_target=True):
7876
num3=num3,
7977
str1=str1,
8078
str2=str2,
81-
str3=' '.join(str3_list))
79+
str3=str3)
8280
f1.write(csv_line)
8381

8482

solutionbox/structured_data/test_mltoolbox/test_sd_trainer.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ def testRegressionDnn(self):
200200
transforms=transforms,
201201
extra_args=extra_args)
202202

203-
self._check_training_screen_output(loss=2)
203+
self._check_training_screen_output(loss=20)
204204
self._check_train_files()
205205

206206
def testRegressionLinear(self):
@@ -219,7 +219,7 @@ def testRegressionLinear(self):
219219
model_type='linear',
220220
transforms=transforms)
221221

222-
self._check_training_screen_output(loss=100)
222+
self._check_training_screen_output(loss=20)
223223
self._check_train_files()
224224

225225
def testClassificationDnn(self):
@@ -240,7 +240,7 @@ def testClassificationDnn(self):
240240
transforms=transforms,
241241
extra_args=extra_args)
242242

243-
self._check_training_screen_output(accuracy=0.70, loss=0.1)
243+
self._check_training_screen_output(accuracy=0.70, loss=0.10)
244244
self._check_train_files()
245245

246246
def testClassificationLinear(self):
@@ -259,7 +259,7 @@ def testClassificationLinear(self):
259259
model_type='linear',
260260
transforms=transforms)
261261

262-
self._check_training_screen_output(accuracy=0.70, loss=0.1)
262+
self._check_training_screen_output(accuracy=0.70, loss=0.2)
263263
self._check_train_files()
264264

265265

0 commit comments

Comments (0)