Skip to content

Commit fe0b375

Browse files
committed
Merge branch 'development'
2 parents 2ecaf35 + 29a1be0 commit fe0b375

22 files changed

+712
-360
lines changed

webapp/config.py.template

+4 -1
Original file line number | Diff line number | Diff line change
@@ -98,4 +98,7 @@ class Config(object):
9898
COLLABORATION_BETA_TESTERS_ONLY = False
9999
COLLABORATION_BETA_TESTERS = []
100100

101-
LOG_FILE_HANDLING_DETAILS = False
101+
LOG_FILE_HANDLING_DETAILS = False
102+
LOG_MEMORY_USAGE = False
103+
LOG_REQUESTS = False
104+
LOG_RESPONSES = False

webapp/home/check_data_table_contents.py

+92 -49
Original file line number | Diff line number | Diff line change
@@ -48,14 +48,16 @@
4848
import warnings
4949

5050
import webapp.home.metapype_client
51-
from webapp.home.home_utils import log_error, log_info
51+
from webapp.home.home_utils import log_error, log_info, log_available_memory
5252
import webapp.home.utils.load_and_save
5353
from webapp.utils import path_exists, path_isdir, path_join
5454

5555
import webapp.auth.user_data as user_data
5656
from webapp.config import Config
5757
from webapp.home.fetch_data import convert_file_size
5858

59+
import webapp.views.data_tables.load_data as load_data
60+
5961
from metapype.eml import names
6062
from webapp.exceptions import ezEMLXMLError
6163
from metapype.model import metapype_io
@@ -93,9 +95,11 @@ def load_eml_file(eml_file_url:str):
9395
return eml_node, nsmap_changed
9496

9597

96-
def load_df(eml_node, csv_url, data_table_name):
98+
def load_df(eml_node, csv_url, data_table_name, max_rows=None):
9799
"""
98-
Retrieve a data table CSV file from a URL and return a Pandas data frame for it.
100+
Retrieve a data table CSV file from a URL and return:
101+
a Pandas data frame for it, and
102+
a flag indicating whether the data frame was truncated.
99103
"""
100104

101105
data_table_node = find_data_table_node(eml_node, data_table_name)
@@ -128,11 +132,17 @@ def load_df(eml_node, csv_url, data_table_name):
128132
try:
129133
if delimiter == '\\t':
130134
delimiter = '\t'
131-
return pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
132-
keep_default_na=False, skiprows=range(1, num_header_lines),
135+
136+
num_rows = load_data.get_num_rows(unquote_plus(csv_url), delimiter=delimiter, quote_char=quote_char)
137+
if max_rows is None:
138+
max_rows = num_rows
139+
truncated = num_rows > max_rows
140+
df = pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
141+
keep_default_na=False, skiprows=range(1, num_header_lines), nrows=max_rows,
133142
skipfooter=num_footer_lines, low_memory=False, infer_datetime_format=True,
134143
dtype=str) # Set dtype to str to prevent pandas from converting empty strings to NaN,
135144
# whole numbers to floats, etc.
145+
return df, truncated
136146

137147
except Exception as err:
138148
log_info(f'Error loading CSV file: {err}')
@@ -340,13 +350,22 @@ def get_number_type(attribute_node):
340350
return number_type
341351

342352

343-
def match_with_regex(col_values, regex, empty_is_ok=True):
353+
def match_with_regex(col_values, regex, mvc, empty_is_ok=True):
344354
"""
345355
Return a boolean Series indicating whether each value in a column matches a given regex.
346356
"""
347-
if empty_is_ok:
348-
regex = f'^({regex})?$'
349357
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
358+
# If regex starts with a ^, remove it temporarily
359+
if regex.startswith('^'):
360+
regex = regex[1:]
361+
# If regex ends with a $, remove it temporarily
362+
if regex.endswith('$'):
363+
regex = regex[:-1]
364+
if mvc:
365+
regex = f"({regex})" + '|' + f"{'|'.join(mvc)}"
366+
if empty_is_ok:
367+
regex = '$|' + regex
368+
regex = f"^{regex}$"
350369
matches = col_values.str.contains(regex)
351370
return matches
352371

@@ -416,23 +435,23 @@ def check_numerical_column(df, data_table_node, column_name, max_errs_per_column
416435
regex = '^[0-9]+$'
417436
else:
418437
regex = '^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$'
419-
# Allow empty string
420-
regex = '^$|' + regex
438+
mvc = get_missing_value_codes(data_table_node, column_name)
421439
try:
422-
matches = match_with_regex(col_values, regex)
440+
matches = match_with_regex(col_values, regex, mvc)
423441
except KeyError:
424442
# This indicates the column name was not found in the data table.
425443
return [create_error_json(get_data_table_name(data_table_node), column_name, None,
426444
'Column not found in data table', column_name, 'Not found')]
427-
mvc = get_missing_value_codes(data_table_node, column_name)
428-
if len(mvc) > 0:
429-
mvc_regex = '^' + '|'.join(mvc) + '$'
430-
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
431-
mvc_matches = col_values.str.contains(mvc_regex)
432-
# Errors are rows with both matches == False and mvc_matches == False
433-
result = ~(matches | mvc_matches)
434-
else:
435-
result = ~matches
445+
# mvc = get_missing_value_codes(data_table_node, column_name)
446+
# if len(mvc) > 0:
447+
# mvc_regex = '^' + '|'.join(mvc) + '$'
448+
# warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
449+
# mvc_matches = col_values.str.contains(mvc_regex)
450+
# # Errors are rows with both matches == False and mvc_matches == False
451+
# result = ~(matches | mvc_matches)
452+
# else:
453+
# result = ~matches
454+
result = ~matches
436455
error_indices = result[result].index.values
437456

438457
data_table_name = get_data_table_name(data_table_node)
@@ -477,22 +496,22 @@ def check_categorical_column(df, data_table_node, column_name, max_errs_per_colu
477496

478497
codes = list(map(re.escape, get_categorical_codes(attribute_node)))
479498
codes_regex = '^' + '|'.join(codes) + '$'
480-
# Allow empty string
481-
codes_regex = '^$|' + codes_regex
499+
mvc = get_missing_value_codes(data_table_node, column_name)
482500
try:
483-
matches = match_with_regex(col_values, codes_regex)
501+
matches = match_with_regex(col_values, codes_regex, mvc)
484502
except KeyError:
485503
return [] # This indicates the column is missing, but that type of error is reported via
486504
# check_columns_existence_against_metadata()
487-
mvc = get_missing_value_codes(data_table_node, column_name)
488-
if len(mvc) > 0:
489-
mvc_regex = '^' + '|'.join(mvc) + '$'
490-
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
491-
mvc_matches = col_values.str.contains(mvc_regex)
492-
# Errors are rows with both matches == False and mvc_matches == False
493-
result = ~(matches | mvc_matches)
494-
else:
495-
result = ~matches
505+
# mvc = get_missing_value_codes(data_table_node, column_name)
506+
# if len(mvc) > 0:
507+
# mvc_regex = '^' + '|'.join(mvc) + '$'
508+
# warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
509+
# mvc_matches = col_values.str.contains(mvc_regex)
510+
# # Errors are rows with both matches == False and mvc_matches == False
511+
# result = ~(matches | mvc_matches)
512+
# else:
513+
# result = ~matches
514+
result = ~matches
496515
error_indices = result[result].index.values
497516
data_table_name = get_data_table_name(data_table_node)
498517
expected = 'A defined code'
@@ -532,20 +551,24 @@ def get_regex_for_format(format):
532551
'The specified DateTime Format String is not supported.',
533552
'A <a href="../datetime_formats">supported</a> format',
534553
date_time_format)]
554+
mvc = get_missing_value_codes(data_table_node, column_name)
535555
try:
536-
matches = match_with_regex(col_values, regex)
556+
matches = match_with_regex(col_values, regex, mvc)
557+
# try:
558+
# matches = match_with_regex(col_values, regex)
537559
except KeyError:
538560
return [create_error_json(get_data_table_name(data_table_node), column_name, None,
539561
'Column not found in table', (column_name), 'Not found')]
540-
mvc = get_missing_value_codes(data_table_node, column_name)
541-
if len(mvc) > 0:
542-
mvc_regex = '^' + '|'.join(mvc) + '$'
543-
warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
544-
mvc_matches = col_values.str.contains(mvc_regex)
545-
# Errors are rows with both matches == False and mvc_matches == False
546-
result = ~(matches | mvc_matches)
547-
else:
548-
result = ~matches
562+
# mvc = get_missing_value_codes(data_table_node, column_name)
563+
# if len(mvc) > 0:
564+
# mvc_regex = '^' + '|'.join(mvc) + '$'
565+
# warnings.filterwarnings("ignore", 'This pattern is interpreted as a regular expression, and has match groups.')
566+
# mvc_matches = col_values.str.contains(mvc_regex)
567+
# # Errors are rows with both matches == False and mvc_matches == False
568+
# result = ~(matches | mvc_matches)
569+
# else:
570+
# result = ~matches
571+
result = ~matches
549572
error_indices = result[result].index.values
550573
data_table_name = get_data_table_name(data_table_node)
551574
expected = get_date_time_format_specification(data_table_node, column_name)
@@ -578,7 +601,18 @@ def check_data_table(eml_file_url:str=None,
578601
its contents based on the metadata specification for the column.
579602
"""
580603
eml_node, _ = load_eml_file(eml_file_url)
581-
df = load_df(eml_node, csv_file_url, data_table_name)
604+
df, truncated = load_df(eml_node, csv_file_url, data_table_name, max_rows=None)
605+
606+
import sys
607+
foo = sys.getsizeof(df)
608+
609+
if truncated:
610+
flash(f'The number of rows in {os.path.basename(unquote_plus(csv_file_url))} is greater than 5 million. ezEML checks '
611+
f'only the first 5 million rows. Often this suffices to indicate the kinds of errors that are present. The full '
612+
f'file will be checked when you submit the data package to the EDI repository.', 'warning')
613+
614+
log_info('After loading the data table')
615+
log_available_memory()
582616

583617
data_table_node = find_data_table_node(eml_node, data_table_name)
584618
errors, data_table_column_names, metadata_column_names = check_columns_existence_against_metadata(data_table_node, df)
@@ -603,6 +637,8 @@ def check_data_table(eml_file_url:str=None,
603637
# reported above by check_columns_existence_against_metadata().
604638
continue
605639
variable_type = get_variable_type(attribute_node)
640+
from datetime import date, datetime
641+
start = datetime.now()
606642
if variable_type == 'CATEGORICAL':
607643
columns_checked.append(column_name)
608644
errors.extend(check_categorical_column(df, data_table_node, column_name, max_errs_per_column))
@@ -612,8 +648,17 @@ def check_data_table(eml_file_url:str=None,
612648
elif variable_type == 'NUMERICAL':
613649
columns_checked.append(column_name)
614650
errors.extend(check_numerical_column(df, data_table_node, column_name, max_errs_per_column))
651+
end = datetime.now()
652+
elapsed = (end - start).total_seconds()
653+
log_info(f'After checking column: {column_name}... elapsed time: {elapsed:.1f} seconds')
654+
log_available_memory()
655+
656+
results = create_result_json(eml_file_url, csv_file_url, columns_checked, errors, max_errs_per_column)
615657

616-
return create_result_json(eml_file_url, csv_file_url, columns_checked, errors, max_errs_per_column)
658+
log_info(f'After creating result JSON')
659+
log_available_memory()
660+
661+
return results
617662

618663

619664
def load_date_time_format_files(strings_filename=DATE_TIME_FORMAT_STRINGS_FILENAME,
@@ -694,11 +739,9 @@ def check_for_empty_rows(df, data_table_name, num_header_lines):
694739
Check for empty rows in the data table.
695740
"""
696741
errors = []
697-
is_empty = lambda x: x == ''
698-
empty_rows = df.applymap(is_empty).all(axis=1)
699-
empty_row_indices = []
700-
if empty_rows.any():
701-
empty_row_indices = empty_rows[empty_rows].index.values
742+
# Check for empty rows
743+
empty_rows = df.eq('').all(axis=1)
744+
empty_row_indices = empty_rows[empty_rows].index
702745
for index in empty_row_indices:
703746
# Make the index 1-based and take into account the number of header rows. I.e., make it match what they'd see in Excel.
704747
errors.append(create_error_json(data_table_name, None,

webapp/home/check_metadata.py

+3
Original file line number | Diff line number | Diff line change
@@ -896,6 +896,9 @@ def check_method_step(method_step_node, doc_name, node_id):
896896
if find_err_code(evaluation_warnings, EvaluationWarning.METHOD_STEP_DESCRIPTION_MISSING, names.DESCRIPTION):
897897
add_to_evaluation('methods_02', link)
898898

899+
if find_min_unmet(validation_errs, names.METHODSTEP, names.DESCRIPTION):
900+
add_to_evaluation('methods_02', link)
901+
899902
link = url_for(PAGE_METHOD_STEP_SELECT, filename=doc_name)
900903
dataset_node = eml_node.find_child(names.DATASET)
901904
if evaluation_warnings is None:

webapp/home/forms.py

+6
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,12 @@ class ImportEMLItemsForm(FlaskForm):
161161
target = RadioField('Target', choices=[], validators=[])
162162

163163

164+
class ImportPartiesFromTemplateForm(FlaskForm):
165+
to_import = MultiCheckboxField('Import', choices=[], validators=[])
166+
to_import_sorted = MultiCheckboxField('Import', choices=[], validators=[])
167+
target = RadioField('Target', choices=[], validators=[])
168+
169+
164170
class SelectUserForm(FlaskForm):
165171
user = RadioField('User', choices=[], validators=[])
166172

0 commit comments

Comments
 (0)