Rework CSV sampling. Fixes #404.

digitalmethodsinitiative · Jan 10, 2024 · 9963cd8
1 parent df2462f
commit 9963cd8
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 17 deletions.
diff --git a/datasources/upload/import_csv.py b/datasources/upload/import_csv.py
@@ -130,7 +130,8 @@ def process(self):
                             if filtering:
                                 for field, value in item.items():
                                     if field is None:
-                                        raise CsvDialectException("Field is None") # This would normally be caught when writerow is called
+                                        # This would normally be caught when writerow is called
+                                        raise CsvDialectException("Field is None")
                                     if field.startswith("author"):
                                         if filtering == "anonymise":
                                             item[field] = "REDACTED"
@@ -173,7 +174,7 @@ def process(self):
 
         if skipped:
             self.dataset.update_status(
-                "CSV file imported, but %i items were skipped because their date could not be parsed." % skipped,
+                f"CSV file imported, but {skipped:,} items were skipped because their date could not be parsed.",
                 is_final=True)
 
         temp_file.unlink()
@@ -205,29 +206,46 @@ def validate_query(query, request, user):
             raise QueryParametersException("No file was offered for upload.")
 
         if query.get("format") not in import_formats.tools:
-            raise QueryParametersException("Cannot import CSV from tool %s" % str(query.get("format")))
+            raise QueryParametersException(f"Cannot import CSV from tool {query.get('format')}")
 
+        # content_length seems unreliable, so figure out the length by reading
+        # the file...
+        upload_size = 0
+        while True:
+            bit = file.read(1024)
+            if len(bit) == 0:
+                break
+            upload_size += len(bit)
+
+        file.seek(0)
         encoding = sniff_encoding(file)
         tool_format = import_formats.tools.get(query.get("format"))
 
         try:
+            # try reading the file as csv here
+            # never read more than 128 kB (to keep it quick)
+            sample_size = min(upload_size, 128 * 1024)  # 128 kB is sent from the frontend at most
             wrapped_file = io.TextIOWrapper(file, encoding=encoding)
-            sample = wrapped_file.read(1024 * 1024)
-            wrapped_file.seek(0)
+            sample = wrapped_file.read(sample_size)
+
             if not csv.Sniffer().has_header(sample) and not query.get("frontend-confirm"):
+                # this may be intended, or the check may be bad, so allow user to continue
                 raise QueryNeedsExplicitConfirmationException(
                     "The uploaded file does not seem to have a header row. Continue anyway?")
-            dialect = csv.Sniffer().sniff(sample, delimiters=(",", ";", "\t"))
 
-            # override the guesses for specific formats if defiend so in
+            wrapped_file.seek(0)
+            dialect = csv.Sniffer().sniff(sample, delimiters=",;\t")
+
+            # override the guesses for specific formats if defined so in
             # import_formats.py
             for prop in tool_format.get("csv_dialect", {}):
                 setattr(dialect, prop, tool_format["csv_dialect"][prop])
-        except UnicodeDecodeError:
+
+        except UnicodeDecodeError as e:
             raise QueryParametersException("The uploaded file does not seem to be a CSV file encoded with UTF-8. "
                                            "Save the file in the proper format and try again.")
         except csv.Error:
-            raise QueryParametersException("Uploaded file is not a well-formed CSV or TAB file.")
+            raise QueryParametersException("Uploaded file is not a well-formed, UTF 8-encoded CSV or TAB file.")
 
         # With validated csvs, save as is but make sure the raw file is sorted
         reader = csv.DictReader(wrapped_file, dialect=dialect)
@@ -237,7 +255,7 @@ def validate_query(query, request, user):
         try:
             fields = reader.fieldnames
         except UnicodeDecodeError:
-            raise QueryParametersException("Uploaded file is not a well-formed CSV or TAB file.")
+            raise QueryParametersException("The uploaded file is not a well-formed, UTF 8-encoded CSV or TAB file.")
 
         incomplete_mapping = list(tool_format["columns"])
         for field in tool_format["columns"]:

diff --git a/webtool/static/js/fourcat.js b/webtool/static/js/fourcat.js
@@ -353,12 +353,13 @@ const query = {
             let snippet_size = 128 * 1024; // 128K ought to be enough for everybody
             for (let pair of formdata.entries()) {
                 if (pair[1] instanceof File) {
-                    let content = await FileReaderPromise(pair[1]);
-                    if (content.byteLength > snippet_size) {
-                        content = content.slice(0, snippet_size);
-                        let snippet = new File([content], pair[1].name);
-                        formdata.set(pair[0], snippet)
-                    }
+                    const sample_size = Math.min(pair[1].size, snippet_size);
+                    const blob = pair[1].slice(0, sample_size); // do not load whole file into memory
+
+                    // make sure we're submitting utf-8 - read and then re-encode to be sure
+                    const blobAsText = await FileReaderPromise(blob);
+                    const snippet = new File([new TextEncoder().encode(blobAsText)], pair[1].name);
+                    formdata.set(pair[0], snippet);
                 }
             }
         }
@@ -1820,7 +1821,7 @@ function FileReaderPromise(file) {
         fr.onload = () => {
             resolve(fr.result);
         }
-        fr.readAsArrayBuffer(file);
+        fr.readAsText(file);
     });
 }