7
7
# -----------------------------------------------------------------------------
8
8
9
9
from __future__ import division
10
- from collections import defaultdict
11
10
from future .utils import PY3 , viewitems
12
11
from six import StringIO
12
+ from string import printable
13
13
14
14
import pandas as pd
15
15
import numpy as np
@@ -103,7 +103,27 @@ def load_template_to_dataframe(fn, index='sample_name'):
103
103
# Load in file lines
104
104
holdfile = None
105
105
with open_file (fn , mode = 'U' ) as f :
106
+ errors = {}
106
107
holdfile = f .readlines ()
108
+ # here we are checking for characters outside string.printable (an ASCII-only set); these are reported to the user as non UTF-8 chars
109
+ for row , line in enumerate (holdfile ):
110
+ for col , block in enumerate (line .split ('\t ' )):
111
+ tblock = '' .join ([c for c in block if c in printable ])
112
+ if len (block ) != len (tblock ):
113
+ tblock = '' .join ([c if c in printable else '🐾'
114
+ for c in block ])
115
+ if tblock not in errors :
116
+ errors [tblock ] = []
117
+ errors [tblock ].append ('(%d, %d)' % (row , col ))
118
+ if bool (errors ):
119
+ raise ValueError (
120
+ "There are invalid (non UTF-8) characters in your information "
121
+ "file. The offending fields and their location (row, column) "
122
+ "are listed below, invalid characters are represented using "
123
+ "🐾: %s" % '; ' .join (
124
+ ['"%s" = %s' % (k , ', ' .join (v ))
125
+ for k , v in viewitems (errors )]))
126
+
107
127
if not holdfile :
108
128
raise ValueError ('Empty file passed!' )
109
129
@@ -137,7 +157,7 @@ def load_template_to_dataframe(fn, index='sample_name'):
137
157
# .strip will remove leading/trailing whitespace chars from each
138
158
# field, but we need to add a new line back at the end of the
139
159
# line(+'\n')
140
- newcols = [d .strip (" \r \x0b \x0c \ n " ) for d in cols ]
160
+ newcols = [d .strip (" \r \n " ) for d in cols ]
141
161
142
162
holdfile [pos ] = '\t ' .join (newcols ) + '\n '
143
163
@@ -149,34 +169,19 @@ def load_template_to_dataframe(fn, index='sample_name'):
149
169
# comment:
150
170
# using the tab character as "comment" we remove rows that are
151
171
# constituted only by delimiters i. e. empty rows.
152
- try :
153
- template = pd .read_csv (
154
- StringIO ('' .join (holdfile )),
155
- sep = '\t ' ,
156
- dtype = str ,
157
- encoding = 'utf-8' ,
158
- infer_datetime_format = False ,
159
- keep_default_na = False ,
160
- index_col = False ,
161
- comment = '\t ' ,
162
- converters = {index : lambda x : str (x ).strip ()})
163
- # remove newlines and tabs from fields
164
- template .replace (to_replace = '[\t \n \r \x0b \x0c ]+' , value = '' ,
165
- regex = True , inplace = True )
166
- except UnicodeDecodeError :
167
- # Find row number and col number for utf-8 encoding errors
168
- headers = holdfile [0 ].strip ().split ('\t ' )
169
- errors = defaultdict (list )
170
- for row , line in enumerate (holdfile , 1 ):
171
- for col , cell in enumerate (line .split ('\t ' )):
172
- try :
173
- cell .encode ('utf-8' )
174
- except UnicodeError :
175
- errors [headers [col ]].append (row )
176
- lines = ['%s: row(s) %s' % (header , ', ' .join (map (str , rows )))
177
- for header , rows in viewitems (errors )]
178
- raise qdb .exceptions .QiitaDBError (
179
- 'Non UTF-8 characters found in columns:\n ' + '\n ' .join (lines ))
172
+ template = pd .read_csv (
173
+ StringIO ('' .join (holdfile )),
174
+ sep = '\t ' ,
175
+ dtype = str ,
176
+ encoding = 'utf-8' ,
177
+ infer_datetime_format = False ,
178
+ keep_default_na = False ,
179
+ index_col = False ,
180
+ comment = '\t ' ,
181
+ converters = {index : lambda x : str (x ).strip ()})
182
+ # remove newlines and tabs from fields
183
+ template .replace (to_replace = '[\t \n \r \x0b \x0c ]+' , value = '' ,
184
+ regex = True , inplace = True )
180
185
181
186
initial_columns = set (template .columns )
182
187
0 commit comments