@@ -78,6 +78,49 @@ def detect_encoding(file_path):
     return detector.result  # e.g. {'encoding': 'EUC-JP', 'confidence': 0.99}
 
 
+def _fields_match(fields, existing_fields, logger):
+    ''' Check whether all columns have the same names and types as previously,
+    independent of ordering.
+    '''
+    # drop the generated '_id' field
+    for index in range(len(existing_fields)):
+        if existing_fields[index]['id'] == '_id':
+            existing_fields.pop(index)
+            break
+
+    # fail fast if number of fields doesn't match
+    field_count = len(fields)
+    if field_count != len(existing_fields):
+        logger.info("Fields do not match; there are now %s fields but previously %s", field_count, len(existing_fields))
+        return False
+
+    # ensure each field is present in both collections with the same type
+    for index in range(field_count):
+        field_id = fields[index]['id']
+        for existing_index in range(field_count):
+            existing_field_id = existing_fields[existing_index]['id']
+            if field_id == existing_field_id:
+                if fields[index]['type'] == existing_fields[existing_index]['type']:
+                    break
+                else:
+                    logger.info("Fields do not match; new type for %s field is %s but existing type is %s",
+                                field_id, fields[index]["type"], existing_fields[existing_index]['type'])
+                    return False
+        else:
+            logger.info("Fields do not match; no existing entry found for %s", field_id)
+            return False
+    return True
+
+
+def _clear_datastore_resource(resource_id):
+    ''' Delete all records from the datastore table, without dropping the table itself.
+    '''
+    engine = get_write_engine()
+    with engine.begin() as conn:
+        conn.execute("SET LOCAL lock_timeout = '5s'")
+        conn.execute('TRUNCATE TABLE "{}"'.format(resource_id))
+
+
 def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads a CSV into DataStore. Does not create the indexes.'''
 
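Note: _fields_match compares the two field lists by name and type only,
ignoring order and the datastore's generated '_id' column, so a mere
reordering of columns still qualifies for the cheap TRUNCATE path.
_clear_datastore_resource keeps the table definition (and its indexes and
permissions) intact, and the local 5-second lock_timeout makes the job fail
fast rather than queue behind another transaction's lock. A minimal usage
sketch, with hypothetical field lists:

    import logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    existing = [{'id': '_id', 'type': 'int4'},   # generated column, ignored
                {'id': 'name', 'type': 'text'},
                {'id': 'age', 'type': 'numeric'}]

    # same names and types, different order -> True (truncate is enough)
    _fields_match([{'id': 'age', 'type': 'numeric'},
                   {'id': 'name', 'type': 'text'}], list(existing), logger)

    # changed type -> False (the table must be dropped and recreated)
    _fields_match([{'id': 'name', 'type': 'text'},
                   {'id': 'age', 'type': 'text'}], list(existing), logger)

(list(existing) passes a copy, since the helper pops '_id' from the list it
is given.)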
@@ -140,34 +183,43 @@ def load_csv(csv_filepath, resource_id, mimetype='text/csv', logger=None):
     existing = datastore_resource_exists(resource_id)
     existing_info = {}
     if existing:
+        existing_fields = existing.get('fields', [])
         existing_info = dict((f['id'], f['info'])
-                             for f in existing.get('fields', [])
+                             for f in existing_fields
                              if 'info' in f)
 
-    '''
-    Delete existing datastore table before proceeding. Otherwise
-    the COPY will append to the existing table. And if
-    the fields have significantly changed, it may also fail.
-    '''
-    logger.info('Deleting "{res_id}" from DataStore.'.format(
-        res_id=resource_id))
-    delete_datastore_resource(resource_id)
-
-    # Columns types are either set (overridden) in the Data Dictionary page
-    # or default to text type (which is robust)
-    fields = [
-        {'id': header_name,
-         'type': existing_info.get(header_name, {})
-         .get('type_override') or 'text',
-         }
-        for header_name in headers]
+        # Column types are either set (overridden) in the Data Dictionary page
+        # or default to text type (which is robust)
+        fields = [
+            {'id': header_name,
+             'type': existing_info.get(header_name, {})
+             .get('type_override') or 'text',
+             }
+            for header_name in headers]
 
-    # Maintain data dictionaries from matching column names
-    if existing_info:
+        # Maintain data dictionaries from matching column names
         for f in fields:
             if f['id'] in existing_info:
                 f['info'] = existing_info[f['id']]
 
+        '''
+        Delete or truncate existing datastore table before proceeding,
+        depending on whether any fields have changed.
+        Otherwise the COPY will append to the existing table.
+        And if the fields have significantly changed, it may also fail.
+        '''
+        if _fields_match(fields, existing_fields, logger):
+            logger.info('Clearing records for "%s" from DataStore.', resource_id)
+            _clear_datastore_resource(resource_id)
+        else:
+            logger.info('Deleting "%s" from DataStore.', resource_id)
+            delete_datastore_resource(resource_id)
+    else:
+        fields = [
+            {'id': header_name,
+             'type': 'text'}
+            for header_name in headers]
+
     logger.info('Fields: %s', fields)
 
     # Create table
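Note: the column list for an existing table is built from Data Dictionary
type overrides, defaulting to text; for a brand-new table every column is
text. A worked sketch of the load_csv decision, with hypothetical values:

    headers = ['name', 'age']
    existing_info = {'age': {'type_override': 'numeric', 'label': 'Age'}}

    fields = [{'id': header_name,
               'type': existing_info.get(header_name, {})
               .get('type_override') or 'text',
               }
              for header_name in headers]
    # [{'id': 'name', 'type': 'text'}, {'id': 'age', 'type': 'numeric'}]

    # _fields_match(fields, existing_fields, logger) then picks the path:
    #   True  -> _clear_datastore_resource(resource_id)   # TRUNCATE, keep table
    #   False -> delete_datastore_resource(resource_id)   # drop and recreate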
@@ -281,6 +333,18 @@ def create_column_indexes(fields, resource_id, logger):
     logger.info('...column indexes created.')
 
 
+def _save_type_overrides(headers_dicts):
+    # copy 'type' to 'type_override' if it's not the default type (text)
+    # and there isn't already an override in place
+    for h in headers_dicts:
+        if h['type'] != 'text':
+            if 'info' in h:
+                if 'type_override' not in h['info']:
+                    h['info']['type_override'] = h['type']
+            else:
+                h['info'] = {'type_override': h['type']}
+
+
 def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     '''Loads an Excel file (or other tabular data recognized by tabulator)
     into Datastore and creates indexes.
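Note: _save_type_overrides writes a sniffed non-text type back into the
field's Data Dictionary info as a 'type_override', so a later load recreates
the same column types instead of falling back to text, while never clobbering
an override the user has already set. A before/after sketch with hypothetical
headers:

    headers_dicts = [{'id': 'name', 'type': 'text'},
                     {'id': 'age', 'type': 'numeric'},
                     {'id': 'dob', 'type': 'timestamp',
                      'info': {'label': 'Date of birth'}}]
    _save_type_overrides(headers_dicts)
    # 'name' is untouched (text is the default type),
    # 'age' gains info={'type_override': 'numeric'},
    # 'dob' keeps its label and gains 'type_override': 'timestamp'.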
@@ -311,9 +375,10 @@ def load_table(table_filepath, resource_id, mimetype='text/csv', logger=None):
     existing = datastore_resource_exists(resource_id)
     existing_info = None
     if existing:
+        existing_fields = existing.get('fields', [])
         existing_info = dict(
             (f['id'], f['info'])
-            for f in existing.get('fields', []) if 'info' in f)
+            for f in existing_fields if 'info' in f)
 
     # Some headers might have been converted from strings to floats and such.
     headers = encode_headers(headers)
@@ -349,16 +414,6 @@ def row_iterator():
             yield data_row
     result = row_iterator()
 
-    '''
-    Delete existing datstore resource before proceeding. Otherwise
-    'datastore_create' will append to the existing datastore. And if
-    the fields have significantly changed, it may also fail.
-    '''
-    if existing:
-        logger.info('Deleting "{res_id}" from datastore.'.format(
-            res_id=resource_id))
-        delete_datastore_resource(resource_id)
-
     headers_dicts = [dict(id=field[0], type=TYPE_MAPPING[str(field[1])])
                      for field in zip(headers, types)]
 
@@ -372,8 +427,24 @@ def row_iterator():
             if type_override in list(_TYPE_MAPPING.values()):
                 h['type'] = type_override
 
-    logger.info('Determined headers and types: {headers}'.format(
-        headers=headers_dicts))
+    # preserve any types that we have sniffed unless told otherwise
+    _save_type_overrides(headers_dicts)
+
+    logger.info('Determined headers and types: %s', headers_dicts)
+
+    '''
+    Delete or truncate existing datastore table before proceeding,
+    depending on whether any fields have changed.
+    Otherwise 'datastore_create' will append to the existing datastore.
+    And if the fields have significantly changed, it may also fail.
+    '''
+    if existing:
+        if _fields_match(headers_dicts, existing_fields, logger):
+            logger.info('Clearing records for "%s" from DataStore.', resource_id)
+            _clear_datastore_resource(resource_id)
+        else:
+            logger.info('Deleting "%s" from datastore.', resource_id)
+            delete_datastore_resource(resource_id)
 
     logger.info('Copying to database...')
     count = 0
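Note: _fields_match reads only each field's 'id' and 'type', so the 'info'
dicts that _save_type_overrides has just added do not disturb the comparison:

    import logging
    logger = logging.getLogger(__name__)

    # extra keys such as 'info' are ignored by the comparison
    _fields_match([{'id': 'age', 'type': 'numeric',
                    'info': {'type_override': 'numeric'}}],
                  [{'id': '_id', 'type': 'int4'},
                   {'id': 'age', 'type': 'numeric'}], logger)   # True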
@@ -382,7 +453,7 @@ def row_iterator():
     non_empty_types = ['timestamp', 'numeric']
     for i, records in enumerate(chunky(result, 250)):
         count += len(records)
-        logger.info('Saving chunk {number}'.format(number=i))
+        logger.info('Saving chunk %s', i)
         for row in records:
             for column_index, column_name in enumerate(row):
                 if headers_dicts[column_index]['type'] in non_empty_types and row[column_name] == '':
@@ -391,8 +462,7 @@ def row_iterator():
     logger.info('...copying done')
 
     if count:
-        logger.info('Successfully pushed {n} entries to "{res_id}".'.format(
-            n=count, res_id=resource_id))
+        logger.info('Successfully pushed %s entries to "%s".', count, resource_id)
     else:
         # no datastore table is created
         raise LoaderError('No entries found - nothing to load')
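Note: the logging calls also switch from str.format() to %-style arguments,
so the logging module interpolates the message only if the record is
actually emitted:

    logger.info('Saving chunk %s', i)           # interpolated lazily by logging
    logger.info('Saving chunk {}'.format(i))    # old style: formatted eagerly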