Skip to content

Commit acd27f5

Browse files
antgonza authored and ElDeveloper committed
fix #2211 (#2240)
* fix #2211 * following @qiita-dev suggestions * fixing errors and improving message * removing Ó * 🐾 * æ
1 parent a369cad commit acd27f5

File tree

6 files changed

+1827
-1819
lines changed

6 files changed

+1827
-1819
lines changed

qiita_db/metadata_template/test/test_util.py

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -149,7 +149,7 @@ def test_load_template_to_dataframe_lowercase(self):
149149

150150
def test_load_template_to_dataframe_non_utf8(self):
151151
bad = EXP_SAMPLE_TEMPLATE.replace('Test Sample 2', 'Test Sample\x962')
152-
with self.assertRaises(qdb.exceptions.QiitaDBError):
152+
with self.assertRaises(ValueError):
153153
qdb.metadata_template.util.load_template_to_dataframe(
154154
StringIO(bad))
155155

@@ -387,20 +387,20 @@ def test_get_pgsql_reserved_words(self):
387387

388388
EXP_SAMPLE_TEMPLATE_SPACES_EMPTY_ROW = (
389389
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"
390-
"has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t"
391-
"physical_location\trequired_sample_info_status\tsample_type\t"
392-
"str_column\n"
393-
"2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
390+
"has_physical_specimen\thost_subject_id\tint_column\tlatitude\t"
391+
"longitude\t physical_location\trequired_sample_info_status"
392+
"\tsample_type\tstr_column\n"
393+
" 2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t"
394394
"NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t"
395395
"Value for sample 1\n"
396-
"2.Sample2 \t2014-05-29 12:24:51\t"
396+
" 2.Sample2 \t2014-05-29 12:24:51\t"
397397
"Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t"
398398
"received\ttype1\tValue for sample 2\n"
399399
"2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t"
400400
"True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t"
401401
"Value for sample 3\n"
402402
"\t\t\t\t\t\t\t\t\t\t\t\t\n"
403-
"\t\t\t\t\t\t\t\t\t\t\t\t\n")
403+
"\t\t\t\t\t\t\t\t\t\t \t\t\n")
404404

405405
EXP_ST_SPACES_EMPTY_COLUMN = (
406406
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t"

qiita_db/metadata_template/util.py

Lines changed: 35 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,9 @@
77
# -----------------------------------------------------------------------------
88

99
from __future__ import division
10-
from collections import defaultdict
1110
from future.utils import PY3, viewitems
1211
from six import StringIO
12+
from string import printable
1313

1414
import pandas as pd
1515
import numpy as np
@@ -103,7 +103,27 @@ def load_template_to_dataframe(fn, index='sample_name'):
103103
# Load in file lines
104104
holdfile = None
105105
with open_file(fn, mode='U') as f:
106+
errors = {}
106107
holdfile = f.readlines()
108+
# here we are checking for non printable chars AKA non UTF-8 chars
109+
for row, line in enumerate(holdfile):
110+
for col, block in enumerate(line.split('\t')):
111+
tblock = ''.join([c for c in block if c in printable])
112+
if len(block) != len(tblock):
113+
tblock = ''.join([c if c in printable else '🐾'
114+
for c in block])
115+
if tblock not in errors:
116+
errors[tblock] = []
117+
errors[tblock].append('(%d, %d)' % (row, col))
118+
if bool(errors):
119+
raise ValueError(
120+
"There are invalid (non UTF-8) characters in your information "
121+
"file. The offending fields and their location (row, column) "
122+
"are listed below, invalid characters are represented using "
123+
"🐾: %s" % '; '.join(
124+
['"%s" = %s' % (k, ', '.join(v))
125+
for k, v in viewitems(errors)]))
126+
107127
if not holdfile:
108128
raise ValueError('Empty file passed!')
109129

@@ -137,7 +157,7 @@ def load_template_to_dataframe(fn, index='sample_name'):
137157
# .strip will remove odd chars, newlines, tabs and multiple
138158
# spaces but we need to read a new line at the end of the
139159
# line(+'\n')
140-
newcols = [d.strip(" \r\x0b\x0c\n") for d in cols]
160+
newcols = [d.strip(" \r\n") for d in cols]
141161

142162
holdfile[pos] = '\t'.join(newcols) + '\n'
143163

@@ -149,34 +169,19 @@ def load_template_to_dataframe(fn, index='sample_name'):
149169
# comment:
150170
# using the tab character as "comment" we remove rows that are
151171
# constituted only by delimiters i. e. empty rows.
152-
try:
153-
template = pd.read_csv(
154-
StringIO(''.join(holdfile)),
155-
sep='\t',
156-
dtype=str,
157-
encoding='utf-8',
158-
infer_datetime_format=False,
159-
keep_default_na=False,
160-
index_col=False,
161-
comment='\t',
162-
converters={index: lambda x: str(x).strip()})
163-
# remove newlines and tabs from fields
164-
template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
165-
regex=True, inplace=True)
166-
except UnicodeDecodeError:
167-
# Find row number and col number for utf-8 encoding errors
168-
headers = holdfile[0].strip().split('\t')
169-
errors = defaultdict(list)
170-
for row, line in enumerate(holdfile, 1):
171-
for col, cell in enumerate(line.split('\t')):
172-
try:
173-
cell.encode('utf-8')
174-
except UnicodeError:
175-
errors[headers[col]].append(row)
176-
lines = ['%s: row(s) %s' % (header, ', '.join(map(str, rows)))
177-
for header, rows in viewitems(errors)]
178-
raise qdb.exceptions.QiitaDBError(
179-
'Non UTF-8 characters found in columns:\n' + '\n'.join(lines))
172+
template = pd.read_csv(
173+
StringIO(''.join(holdfile)),
174+
sep='\t',
175+
dtype=str,
176+
encoding='utf-8',
177+
infer_datetime_format=False,
178+
keep_default_na=False,
179+
index_col=False,
180+
comment='\t',
181+
converters={index: lambda x: str(x).strip()})
182+
# remove newlines and tabs from fields
183+
template.replace(to_replace='[\t\n\r\x0b\x0c]+', value='',
184+
regex=True, inplace=True)
180185

181186
initial_columns = set(template.columns)
182187

qiita_ware/test/test_dispatchable.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -84,8 +84,11 @@ def test_create_sample_template_nonutf8(self):
8484
'sample_info_utf8_error.txt')
8585
obs = create_sample_template(fp, Study(1), False)
8686
exp = {'status': 'danger',
87-
'message': u"Non UTF-8 characters found in columns:"
88-
u"\n\ufffdcollection_timestamp: row(s) 1"}
87+
'message': 'There are invalid (non UTF-8) characters in your '
88+
'information file. The offending fields and their '
89+
'location (row, column) are listed below, invalid '
90+
'characters are represented using 🐾: '
91+
'"🐾collection_timestamp" = (0, 13)'}
8992
self.assertEqual(obs, exp)
9093

9194
def test_update_sample_template(self):

0 commit comments

Comments
 (0)