@@ -48,14 +48,16 @@
 import warnings
 
 import webapp.home.metapype_client
-from webapp.home.home_utils import log_error, log_info
+from webapp.home.home_utils import log_error, log_info, log_available_memory
 import webapp.home.utils.load_and_save
 from webapp.utils import path_exists, path_isdir, path_join
 
 import webapp.auth.user_data as user_data
 from webapp.config import Config
 from webapp.home.fetch_data import convert_file_size
 
+import webapp.views.data_tables.load_data as load_data
+
 from metapype.eml import names
 from webapp.exceptions import ezEMLXMLError
 from metapype.model import metapype_io
@@ -93,9 +95,11 @@ def load_eml_file(eml_file_url:str):
     return eml_node, nsmap_changed
 
 
-def load_df(eml_node, csv_url, data_table_name):
+def load_df(eml_node, csv_url, data_table_name, max_rows=None):
     """
-    Retrieve a data table CSV file from a URL and return a Pandas data frame for it.
+    Retrieve a data table CSV file from a URL and return:
+        a Pandas data frame for it, and
+        a flag indicating whether the data frame was truncated.
     """
 
     data_table_node = find_data_table_node(eml_node, data_table_name)
@@ -128,11 +132,17 @@ def load_df(eml_node, csv_url, data_table_name):
     try:
         if delimiter == '\\t':
             delimiter = '\t'
-        return pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
-                           keep_default_na=False, skiprows=range(1, num_header_lines),
+
+        num_rows = load_data.get_num_rows(unquote_plus(csv_url), delimiter=delimiter, quote_char=quote_char)
+        if max_rows is None:
+            max_rows = num_rows
+        truncated = num_rows > max_rows
+        df = pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
+                         keep_default_na=False, skiprows=range(1, num_header_lines), nrows=max_rows,
                          skipfooter=num_footer_lines, low_memory=False, infer_datetime_format=True,
                          dtype=str)  # Set dtype to str to prevent pandas from converting empty strings to NaN,
                                      # whole numbers to floats, etc.
+        return df, truncated
 
     except Exception as err:
         log_info(f'Error loading CSV file: {err}')
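
The count-then-cap pattern added here is easy to test in isolation. Below is a minimal standalone sketch of the same idea; `count_data_rows` is a hypothetical stand-in for `load_data.get_num_rows`, whose implementation is not shown in this diff.

import csv

import pandas as pd

def count_data_rows(path, delimiter=',', quote_char='"'):
    # Hypothetical stand-in for load_data.get_num_rows: number of rows after the header.
    with open(path, newline='', encoding='utf-8-sig') as f:
        return sum(1 for _ in csv.reader(f, delimiter=delimiter, quotechar=quote_char)) - 1

def load_capped(path, max_rows=None):
    num_rows = count_data_rows(path)
    if max_rows is None:
        max_rows = num_rows            # no cap requested: read the whole file
    truncated = num_rows > max_rows    # tells the caller whether to warn the user
    df = pd.read_csv(path, dtype=str, keep_default_na=False, nrows=max_rows)
    return df, truncated

Counting rows first costs one extra pass over the file, but it lets the `truncated` flag be computed without ever holding more than `max_rows` rows in memory.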
@@ -340,13 +350,22 @@ def get_number_type(attribute_node):
     return number_type
 
 
-def match_with_regex(col_values, regex, empty_is_ok=True):
+def match_with_regex(col_values, regex, mvc, empty_is_ok=True):
     """
     Return a boolean Series indicating whether each value in a column matches a given regex.
     """
-    if empty_is_ok:
-        regex = f'^({regex})?$'
     warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
+    # If regex starts with a ^, remove it temporarily
+    if regex.startswith('^'):
+        regex = regex[1:]
+    # If regex ends with a $, remove it temporarily
+    if regex.endswith('$'):
+        regex = regex[:-1]
+    if mvc:
+        regex = f"({regex})" + '|' + f"{'|'.join(mvc)}"
+    if empty_is_ok:
+        regex = '$|' + regex
+    regex = f"^{regex}$"
     matches = col_values.str.contains(regex)
     return matches
 
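
To see what the rewritten `match_with_regex` actually builds, here is its assembly logic extracted into a standalone sketch (the same steps as in the hunk above, outside of ezEML):

def build_pattern(regex, mvc, empty_is_ok=True):
    # Strip any existing anchors so the base regex can be embedded in an alternation.
    if regex.startswith('^'):
        regex = regex[1:]
    if regex.endswith('$'):
        regex = regex[:-1]
    # OR in the missing-value codes, if any.
    if mvc:
        regex = f"({regex})" + '|' + '|'.join(mvc)
    # Allow the empty string as an additional alternative.
    if empty_is_ok:
        regex = '$|' + regex
    return f"^{regex}$"

print(build_pattern('^[0-9]+$', ['NA', '-999']))
# ^$|([0-9]+)|NA|-999$

Note that `|` has the lowest precedence in a regex, so in the assembled pattern the `^` binds only to the first alternative and the `$` only to the last; since `Series.str.contains` searches anywhere in each string, the middle alternatives are effectively unanchored.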
@@ -416,23 +435,23 @@ def check_numerical_column(df, data_table_node, column_name, max_errs_per_column
         regex = '^[0-9]+$'
     else:
         regex = '^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$'
-    # Allow empty string
-    regex = '^$|' + regex
+    mvc = get_missing_value_codes(data_table_node, column_name)
     try:
-        matches = match_with_regex(col_values, regex)
+        matches = match_with_regex(col_values, regex, mvc)
     except KeyError:
         # This indicates the column name was not found in the data table.
         return [create_error_json(get_data_table_name(data_table_node), column_name, None,
                                   'Column not found in data table', column_name, 'Not found')]
-    mvc = get_missing_value_codes(data_table_node, column_name)
-    if len(mvc) > 0:
-        mvc_regex = '^' + '|'.join(mvc) + '$'
-        warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
-        mvc_matches = col_values.str.contains(mvc_regex)
-        # Errors are rows with both matches == False and mvc_matches == False
-        result = ~(matches | mvc_matches)
-    else:
-        result = ~matches
+    # mvc = get_missing_value_codes(data_table_node, column_name)
+    # if len(mvc) > 0:
+    #     mvc_regex = '^' + '|'.join(mvc) + '$'
+    #     warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
+    #     mvc_matches = col_values.str.contains(mvc_regex)
+    #     # Errors are rows with both matches == False and mvc_matches == False
+    #     result = ~(matches | mvc_matches)
+    # else:
+    #     result = ~matches
+    result = ~matches
     error_indices = result[result].index.values
 
     data_table_name = get_data_table_name(data_table_node)
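
The practical effect of passing `mvc` into `match_with_regex` is that missing-value codes are validated in the same pass as the numeric pattern, instead of in a second `str.contains` call. A small demonstration of the resulting behavior, with the pattern written out as `match_with_regex` would assemble it for the float regex and `mvc=['NA']`:

import warnings

import pandas as pd

col = pd.Series(['1.5', '2e3', 'NA', '', 'abc'])
pattern = r'^$|([-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)|NA$'
# pandas warns when a contains() pattern has match groups; the diff filters the same message.
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
matches = col.str.contains(pattern)
print(col[~matches].tolist())  # ['abc'] -- the only value flagged as an error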
@@ -477,22 +496,22 @@ def check_categorical_column(df, data_table_node, column_name, max_errs_per_colu
 
     codes = list(map(re.escape, get_categorical_codes(attribute_node)))
     codes_regex = '^' + '|'.join(codes) + '$'
-    # Allow empty string
-    codes_regex = '^$|' + codes_regex
+    mvc = get_missing_value_codes(data_table_node, column_name)
     try:
-        matches = match_with_regex(col_values, codes_regex)
+        matches = match_with_regex(col_values, codes_regex, mvc)
     except KeyError:
         return []  # This indicates the column is missing, but that type of error is reported via
                    # check_columns_existence_against_metadata()
-    mvc = get_missing_value_codes(data_table_node, column_name)
-    if len(mvc) > 0:
-        mvc_regex = '^' + '|'.join(mvc) + '$'
-        warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
-        mvc_matches = col_values.str.contains(mvc_regex)
-        # Errors are rows with both matches == False and mvc_matches == False
-        result = ~(matches | mvc_matches)
-    else:
-        result = ~matches
+    # mvc = get_missing_value_codes(data_table_node, column_name)
+    # if len(mvc) > 0:
+    #     mvc_regex = '^' + '|'.join(mvc) + '$'
+    #     warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
+    #     mvc_matches = col_values.str.contains(mvc_regex)
+    #     # Errors are rows with both matches == False and mvc_matches == False
+    #     result = ~(matches | mvc_matches)
+    # else:
+    #     result = ~matches
+    result = ~matches
     error_indices = result[result].index.values
     data_table_name = get_data_table_name(data_table_node)
     expected = 'A defined code'
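
The `re.escape` in the unchanged context line above matters here: categorical codes are data, not regex syntax, so any metacharacters in a code must be escaped before the codes are joined into an alternation. For example:

import re

codes = ['low', 'high', 'n/a (other)']
codes_regex = '^' + '|'.join(map(re.escape, codes)) + '$'
print(codes_regex)
# ^low|high|n/a\ \(other\)$  -- the parentheses no longer create a regex group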
@@ -532,20 +551,24 @@ def get_regex_for_format(format):
                                   'The specified DateTime Format String is not supported.',
                                   'A <a href="../datetime_formats">supported</a> format',
                                   date_time_format)]
+    mvc = get_missing_value_codes(data_table_node, column_name)
     try:
-        matches = match_with_regex(col_values, regex)
+        matches = match_with_regex(col_values, regex, mvc)
+    # try:
+    #     matches = match_with_regex(col_values, regex)
     except KeyError:
         return [create_error_json(get_data_table_name(data_table_node), column_name, None,
                                   'Column not found in table', (column_name), 'Not found')]
-    mvc = get_missing_value_codes(data_table_node, column_name)
-    if len(mvc) > 0:
-        mvc_regex = '^' + '|'.join(mvc) + '$'
-        warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
-        mvc_matches = col_values.str.contains(mvc_regex)
-        # Errors are rows with both matches == False and mvc_matches == False
-        result = ~(matches | mvc_matches)
-    else:
-        result = ~matches
+    # mvc = get_missing_value_codes(data_table_node, column_name)
+    # if len(mvc) > 0:
+    #     mvc_regex = '^' + '|'.join(mvc) + '$'
+    #     warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
+    #     mvc_matches = col_values.str.contains(mvc_regex)
+    #     # Errors are rows with both matches == False and mvc_matches == False
+    #     result = ~(matches | mvc_matches)
+    # else:
+    #     result = ~matches
+    result = ~matches
     error_indices = result[result].index.values
     data_table_name = get_data_table_name(data_table_node)
     expected = get_date_time_format_specification(data_table_node, column_name)
@@ -578,7 +601,18 @@ def check_data_table(eml_file_url:str=None,
     its contents based on the metadata specification for the column.
     """
     eml_node, _ = load_eml_file(eml_file_url)
-    df = load_df(eml_node, csv_file_url, data_table_name)
+    df, truncated = load_df(eml_node, csv_file_url, data_table_name, max_rows=None)
+
+    import sys
+    foo = sys.getsizeof(df)
+
+    if truncated:
+        flash(f'The number of rows in {os.path.basename(unquote_plus(csv_file_url))} is greater than 5 million. ezEML checks '
+              f'only the first 5 million rows. Often this suffices to indicate the kinds of errors that are present. The full '
+              f'file will be checked when you submit the data package to the EDI repository.', 'warning')
+
+    log_info('After loading the data table')
+    log_available_memory()
 
     data_table_node = find_data_table_node(eml_node, data_table_name)
     errors, data_table_column_names, metadata_column_names = check_columns_existence_against_metadata(data_table_node, df)
@@ -603,6 +637,8 @@ def check_data_table(eml_file_url:str=None,
             # reported above by check_columns_existence_against_metadata().
             continue
         variable_type = get_variable_type(attribute_node)
+        from datetime import date, datetime
+        start = datetime.now()
         if variable_type == 'CATEGORICAL':
             columns_checked.append(column_name)
             errors.extend(check_categorical_column(df, data_table_node, column_name, max_errs_per_column))
@@ -612,8 +648,17 @@ def check_data_table(eml_file_url:str=None,
         elif variable_type == 'NUMERICAL':
             columns_checked.append(column_name)
             errors.extend(check_numerical_column(df, data_table_node, column_name, max_errs_per_column))
+        end = datetime.now()
+        elapsed = (end - start).total_seconds()
+        log_info(f'After checking column: {column_name}... elapsed time: {elapsed:.1f} seconds')
+        log_available_memory()
+
+    results = create_result_json(eml_file_url, csv_file_url, columns_checked, errors, max_errs_per_column)
 
-    return create_result_json(eml_file_url, csv_file_url, columns_checked, errors, max_errs_per_column)
+    log_info(f'After creating result JSON')
+    log_available_memory()
+
+    return results
 
 
 def load_date_time_format_files(strings_filename=DATE_TIME_FORMAT_STRINGS_FILENAME,
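
`log_available_memory`, newly imported at the top of this diff, is defined in `webapp.home.home_utils` and not shown here. A plausible sketch of such a helper, assuming it reports free system memory via `psutil` (an assumption about the implementation, not the actual code):

import logging

import psutil

def log_available_memory():
    # Hypothetical sketch; the real helper is webapp.home.home_utils.log_available_memory.
    available_mb = psutil.virtual_memory().available / (1024 ** 2)
    logging.info('Available memory: %.1f MB', available_mb)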
@@ -694,11 +739,9 @@ def check_for_empty_rows(df, data_table_name, num_header_lines):
     Check for empty rows in the data table.
     """
     errors = []
-    is_empty = lambda x: x == ''
-    empty_rows = df.applymap(is_empty).all(axis=1)
-    empty_row_indices = []
-    if empty_rows.any():
-        empty_row_indices = empty_rows[empty_rows].index.values
+    # Check for empty rows
+    empty_rows = df.eq('').all(axis=1)
+    empty_row_indices = empty_rows[empty_rows].index
     for index in empty_row_indices:
        # Make the index 1-based and take into account the number of header rows. I.e., make it match what they'd see in Excel.
        errors.append(create_error_json(data_table_name, None,
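
This rewrite replaces an element-wise `applymap` with a vectorized comparison, which is both shorter and faster on large frames. Its behavior on a small example:

import pandas as pd

df = pd.DataFrame({'a': ['1', '', ''], 'b': ['x', '', 'y']})
empty_rows = df.eq('').all(axis=1)            # vectorized: no per-cell Python lambda
print(empty_rows[empty_rows].index.tolist())  # [1] -- only row 1 is entirely empty

The `empty_rows.any()` guard is also dropped, since iterating over an empty index is simply a no-op.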