Skip to content

Commit

Permalink
Limit Check Data Tables to 5 million rows. Fix Manage Data Usage with…
Browse files Browse the repository at this point in the history
… no option selected. Add handling of hidden buttons. (#149, #150, #151)
  • Loading branch information
jon-ide committed Dec 12, 2023
1 parent a6a6d75 commit 0335a32
Show file tree
Hide file tree
Showing 5 changed files with 237 additions and 125 deletions.
31 changes: 21 additions & 10 deletions webapp/home/check_data_table_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
from webapp.config import Config
from webapp.home.fetch_data import convert_file_size

import webapp.views.data_tables.load_data as load_data

from metapype.eml import names
from webapp.exceptions import ezEMLXMLError
from metapype.model import metapype_io
Expand Down Expand Up @@ -93,9 +95,11 @@ def load_eml_file(eml_file_url:str):
return eml_node, nsmap_changed


def load_df(eml_node, csv_url, data_table_name):
def load_df(eml_node, csv_url, data_table_name, max_rows=10**6):
"""
Retrieve a data table CSV file from a URL and return a Pandas data frame for it.
Retrieve a data table CSV file from a URL and return:
a Pandas data frame for it, and
a flag indicating whether the data frame was truncated.
"""

data_table_node = find_data_table_node(eml_node, data_table_name)
Expand Down Expand Up @@ -128,11 +132,15 @@ def load_df(eml_node, csv_url, data_table_name):
try:
if delimiter == '\\t':
delimiter = '\t'
return pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
keep_default_na=False, skiprows=range(1, num_header_lines),

num_rows = load_data.get_num_rows(unquote_plus(csv_url), delimiter=delimiter, quote_char=quote_char)
truncated = num_rows > max_rows
df = pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
keep_default_na=False, skiprows=range(1, num_header_lines), nrows=max_rows,
skipfooter=num_footer_lines, low_memory=False, infer_datetime_format=True,
dtype=str) # Set dtype to str to prevent pandas from converting empty strings to NaN,
# whole numbers to floats, etc.
return df, truncated

except Exception as err:
log_info(f'Error loading CSV file: {err}')
Expand Down Expand Up @@ -578,7 +586,12 @@ def check_data_table(eml_file_url:str=None,
its contents based on the metadata specification for the column.
"""
eml_node, _ = load_eml_file(eml_file_url)
df = load_df(eml_node, csv_file_url, data_table_name)
df, truncated = load_df(eml_node, csv_file_url, data_table_name, max_rows=5*10**6)

if truncated:
flash(f'The number of rows in {os.path.basename(unquote_plus(csv_file_url))} is greater than 5 million. ezEML checks '
f'only the first 5 million rows. Often this suffices to indicate the kinds of errors that are present. The full '
f'file will be checked when you submit the data package to the EDI repository.', 'warning')

data_table_node = find_data_table_node(eml_node, data_table_name)
errors, data_table_column_names, metadata_column_names = check_columns_existence_against_metadata(data_table_node, df)
Expand Down Expand Up @@ -694,11 +707,9 @@ def check_for_empty_rows(df, data_table_name, num_header_lines):
Check for empty rows in the data table.
"""
errors = []
is_empty = lambda x: x == ''
empty_rows = df.applymap(is_empty).all(axis=1)
empty_row_indices = []
if empty_rows.any():
empty_row_indices = empty_rows[empty_rows].index.values
# Check for empty rows
empty_rows = df.eq('').all(axis=1)
empty_row_indices = empty_rows[empty_rows].index
for index in empty_row_indices:
# Make the index 1-based and take into account the number of header rows. I.e., make it match what they'd see in Excel.
errors.append(create_error_json(data_table_name, None,
Expand Down
5 changes: 4 additions & 1 deletion webapp/home/utils/hidden_buttons.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,17 @@ def is_hidden_button():
return any(button in request.form for button in HIDDEN_TARGETS)


def handle_hidden_buttons(target_page):
def handle_hidden_buttons(target_page=None):
"""
See if a "hidden" button has been clicked. If so, return the page that the button indicates.
If none of the hidden buttons was clicked, leave target_page as we found it. I.e., target_page is the page
that the user was trying to get to when they clicked on the menu option, so we'll go there in the absence of
a hidden button. If a hidden button was clicked, we'll go to the page that the hidden button indicates, after
saving the data that the user entered in the page that they were in when they clicked on the menu option.
If handle_hidden_buttons() is called after checking is_hidden_button(), then we know that a hidden button has
been clicked, so it's fine to pass in None for target_page.
"""
for button in HIDDEN_TARGETS:
if button in request.form:
Expand Down
Loading

0 comments on commit 0335a32

Please sign in to comment.