Skip to content

Commit

Permalink
Rework CSV sampling. Fixes #404.
Browse files Browse the repository at this point in the history
  • Loading branch information
stijn-uva committed Jan 10, 2024
1 parent df2462f commit 9963cd8
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 17 deletions.
38 changes: 28 additions & 10 deletions datasources/upload/import_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ def process(self):
if filtering:
for field, value in item.items():
if field is None:
raise CsvDialectException("Field is None") # This would normally be caught when writerow is called
# This would normally be caught when writerow is called
raise CsvDialectException("Field is None")
if field.startswith("author"):
if filtering == "anonymise":
item[field] = "REDACTED"
Expand Down Expand Up @@ -173,7 +174,7 @@ def process(self):

if skipped:
self.dataset.update_status(
"CSV file imported, but %i items were skipped because their date could not be parsed." % skipped,
f"CSV file imported, but {skipped:,} items were skipped because their date could not be parsed.",
is_final=True)

temp_file.unlink()
Expand Down Expand Up @@ -205,29 +206,46 @@ def validate_query(query, request, user):
raise QueryParametersException("No file was offered for upload.")

if query.get("format") not in import_formats.tools:
raise QueryParametersException("Cannot import CSV from tool %s" % str(query.get("format")))
raise QueryParametersException(f"Cannot import CSV from tool {query.get('format')}")

# content_length seems unreliable, so figure out the length by reading
# the file...
upload_size = 0
while True:
bit = file.read(1024)
if len(bit) == 0:
break
upload_size += len(bit)

file.seek(0)
encoding = sniff_encoding(file)
tool_format = import_formats.tools.get(query.get("format"))

try:
# try reading the file as csv here
# never read more than 128 kB (to keep it quick)
sample_size = min(upload_size, 128 * 1024) # 128 kB is sent from the frontend at most
wrapped_file = io.TextIOWrapper(file, encoding=encoding)
sample = wrapped_file.read(1024 * 1024)
wrapped_file.seek(0)
sample = wrapped_file.read(sample_size)

if not csv.Sniffer().has_header(sample) and not query.get("frontend-confirm"):
# this may be intended, or the check may be bad, so allow user to continue
raise QueryNeedsExplicitConfirmationException(
"The uploaded file does not seem to have a header row. Continue anyway?")
dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t"))

# override the guesses for specific formats if defiend so in
wrapped_file.seek(0)
dialect = csv.Sniffer().sniff(sample, delimiters=",;\t")

# override the guesses for specific formats if defined so in
# import_formats.py
for prop in tool_format.get("csv_dialect", {}):
setattr(dialect, prop, tool_format["csv_dialect"][prop])
except UnicodeDecodeError:

except UnicodeDecodeError as e:
raise QueryParametersException("The uploaded file does not seem to be a CSV file encoded with UTF-8. "
"Save the file in the proper format and try again.")
except csv.Error:
raise QueryParametersException("Uploaded file is not a well-formed CSV or TAB file.")
raise QueryParametersException("Uploaded file is not a well-formed, UTF 8-encoded CSV or TAB file.")

# With validated csvs, save as is but make sure the raw file is sorted
reader = csv.DictReader(wrapped_file, dialect=dialect)
Expand All @@ -237,7 +255,7 @@ def validate_query(query, request, user):
try:
fields = reader.fieldnames
except UnicodeDecodeError:
raise QueryParametersException("Uploaded file is not a well-formed CSV or TAB file.")
raise QueryParametersException("The uploaded file is not a well-formed, UTF 8-encoded CSV or TAB file.")

incomplete_mapping = list(tool_format["columns"])
for field in tool_format["columns"]:
Expand Down
15 changes: 8 additions & 7 deletions webtool/static/js/fourcat.js
Original file line number Diff line number Diff line change
Expand Up @@ -353,12 +353,13 @@ const query = {
let snippet_size = 128 * 1024; // 128K ought to be enough for everybody
for (let pair of formdata.entries()) {
if (pair[1] instanceof File) {
let content = await FileReaderPromise(pair[1]);
if (content.byteLength > snippet_size) {
content = content.slice(0, snippet_size);
let snippet = new File([content], pair[1].name);
formdata.set(pair[0], snippet)
}
const sample_size = Math.min(pair[1].size, snippet_size);
const blob = pair[1].slice(0, sample_size); // do not load whole file into memory

// make sure we're submitting utf-8 - read and then re-encode to be sure
const blobAsText = await FileReaderPromise(blob);
const snippet = new File([new TextEncoder().encode(blobAsText)], pair[1].name);
formdata.set(pair[0], snippet);
}
}
}
Expand Down Expand Up @@ -1820,7 +1821,7 @@ function FileReaderPromise(file) {
fr.onload = () => {
resolve(fr.result);
}
fr.readAsArrayBuffer(file);
fr.readAsText(file);
});
}

Expand Down

0 comments on commit 9963cd8

Please sign in to comment.