Skip to content

Commit

Permalink
Troubleshoot out of memory errors.
Browse files Browse the repository at this point in the history
  • Loading branch information
jon-ide committed Dec 12, 2023
1 parent 0335a32 commit f4814cb
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 5 deletions.
27 changes: 23 additions & 4 deletions webapp/home/check_data_table_contents.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
import warnings

import webapp.home.metapype_client
from webapp.home.home_utils import log_error, log_info
from webapp.home.home_utils import log_error, log_info, log_available_memory
import webapp.home.utils.load_and_save
from webapp.utils import path_exists, path_isdir, path_join

Expand Down Expand Up @@ -95,7 +95,7 @@ def load_eml_file(eml_file_url:str):
return eml_node, nsmap_changed


def load_df(eml_node, csv_url, data_table_name, max_rows=10**6):
def load_df(eml_node, csv_url, data_table_name, max_rows=None):
"""
Retrieve a data table CSV file from a URL and return:
a Pandas data frame for it, and
Expand Down Expand Up @@ -134,6 +134,8 @@ def load_df(eml_node, csv_url, data_table_name, max_rows=10**6):
delimiter = '\t'

num_rows = load_data.get_num_rows(unquote_plus(csv_url), delimiter=delimiter, quote_char=quote_char)
if max_rows is None:
max_rows = num_rows
truncated = num_rows > max_rows
df = pd.read_csv(unquote_plus(csv_url), encoding='utf-8-sig', sep=delimiter, quotechar=quote_char,
keep_default_na=False, skiprows=range(1, num_header_lines), nrows=max_rows,
Expand Down Expand Up @@ -586,13 +588,19 @@ def check_data_table(eml_file_url:str=None,
its contents based on the metadata specification for the column.
"""
eml_node, _ = load_eml_file(eml_file_url)
df, truncated = load_df(eml_node, csv_file_url, data_table_name, max_rows=5*10**6)
df, truncated = load_df(eml_node, csv_file_url, data_table_name, max_rows=None)

import sys
foo = sys.getsizeof(df)

if truncated:
flash(f'The number of rows in {os.path.basename(unquote_plus(csv_file_url))} is greater than 5 million. ezEML checks '
f'only the first 5 million rows. Often this suffices to indicate the kinds of errors that are present. The full '
f'file will be checked when you submit the data package to the EDI repository.', 'warning')

log_info('After loading the data table')
log_available_memory()

data_table_node = find_data_table_node(eml_node, data_table_name)
errors, data_table_column_names, metadata_column_names = check_columns_existence_against_metadata(data_table_node, df)

Expand All @@ -616,6 +624,8 @@ def check_data_table(eml_file_url:str=None,
# reported above by check_columns_existence_against_metadata().
continue
variable_type = get_variable_type(attribute_node)
from datetime import date, datetime
start = datetime.now()
if variable_type == 'CATEGORICAL':
columns_checked.append(column_name)
errors.extend(check_categorical_column(df, data_table_node, column_name, max_errs_per_column))
Expand All @@ -625,8 +635,17 @@ def check_data_table(eml_file_url:str=None,
elif variable_type == 'NUMERICAL':
columns_checked.append(column_name)
errors.extend(check_numerical_column(df, data_table_node, column_name, max_errs_per_column))
end = datetime.now()
elapsed = (end - start).total_seconds()
log_info(f'After checking column: {column_name}... elapsed time: {elapsed:.1f} seconds')
log_available_memory()

results = create_result_json(eml_file_url, csv_file_url, columns_checked, errors, max_errs_per_column)

log_info(f'After creating result JSON')
log_available_memory()

return create_result_json(eml_file_url, csv_file_url, columns_checked, errors, max_errs_per_column)
return results


def load_date_time_format_files(strings_filename=DATE_TIME_FORMAT_STRINGS_FILENAME,
Expand Down
9 changes: 9 additions & 0 deletions webapp/home/home_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,12 @@ def get_check_metadata_status(eml_node:Node=None, filename:str=None):
return status


def log_available_memory():
    """
    Log the available system memory and this process's resident memory, in MB.

    Intended as a lightweight probe to sprinkle around memory-heavy code paths
    when troubleshooting out-of-memory errors.
    """
    # Imported lazily so modules that never call this probe don't pay for it.
    import psutil

    bytes_per_mb = 1024 * 1024
    free_mb = psutil.virtual_memory().available / bytes_per_mb
    rss_mb = psutil.Process().memory_info().rss / bytes_per_mb
    log_info(f"Memory usage: available system memory:{free_mb:.1f} MB process usage:{rss_mb:.1f} MB")

4 changes: 3 additions & 1 deletion webapp/views/data_tables/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

from webapp.home.utils.node_utils import new_child_node, add_child, remove_child
import webapp.home.views as views
from webapp.home.home_utils import log_error, log_info
from webapp.home.home_utils import log_error, log_info, log_available_memory

from webapp.pages import PAGE_REUPLOAD_WITH_COL_NAMES_CHANGED, PAGE_DATA_TABLE_SELECT, PAGE_DATA_TABLE

Expand Down Expand Up @@ -361,6 +361,8 @@ def check_column_name_uniqueness(csv_file_path, delimiter):
def get_num_rows(csv_filepath, delimiter: str = ',', quote_char: str = '"'):
    """Return the number of rows in a CSV file. For efficiency, we use only the first column.

    Args:
        csv_filepath: Path (or URL) of the CSV file whose rows are counted.
        delimiter: Field separator used in the file.
        quote_char: Quoting character used in the file.

    Returns:
        The number of data rows (excluding the header row).
    """
    # Count rows in fixed-size chunks instead of materializing the whole first
    # column at once: on multi-million-row tables the single read_csv call can
    # itself trigger the out-of-memory errors this code is meant to diagnose.
    # pandas (rather than a raw line count) is used so that newlines embedded
    # in quoted fields are not miscounted as row breaks.
    num_rows = 0
    for chunk in pd.read_csv(csv_filepath, encoding='utf8', usecols=[0], sep=delimiter,
                             quotechar=quote_char, chunksize=10 ** 5):
        num_rows += chunk.shape[0]
    log_info(f"Number of rows in {csv_filepath}: {num_rows}")
    log_available_memory()
    return num_rows


Expand Down

0 comments on commit f4814cb

Please sign in to comment.