
Commit fc7f91f

feat(converters): CSVToDocument row-mode hardening + tests
Signed-off-by: Arya Tayshete <[email protected]>
1 parent: 35c44a6

2 files changed: +143 −21


haystack/components/converters/csv.py

Lines changed: 85 additions & 21 deletions
@@ -14,6 +14,8 @@
 
 logger = logging.getLogger(__name__)
 
+_ROW_MODE_SIZE_WARN_BYTES = 5 * 1024 * 1024  # ~5MB; warn when row-mode parsing might be memory-heavy
+
 
 @component
 class CSVToDocument:
@@ -75,6 +77,12 @@ def __init__(
         self.delimiter = delimiter
         self.quotechar = quotechar
 
+        # Basic validation (reviewer suggestion)
+        if len(self.delimiter) != 1:
+            raise ValueError("CSVToDocument: delimiter must be a single character.")
+        if len(self.quotechar) != 1:
+            raise ValueError("CSVToDocument: quotechar must be a single character.")
+
     @component.output_types(documents=list[Document])
     def run(
         self,
@@ -109,7 +117,9 @@ def run(
                 continue
             try:
                 encoding = bytestream.meta.get("encoding", self.encoding)
-                data = io.BytesIO(bytestream.data).getvalue().decode(encoding=encoding)
+                raw = io.BytesIO(bytestream.data).getvalue()
+                data = raw.decode(encoding=encoding)
+
             except Exception as e:
                 logger.warning(
                     "Could not convert file {source}. Skipping it. Error message: {error}", source=source, error=e
@@ -128,6 +138,18 @@ def run(
                 documents.append(Document(content=data, meta=merged_metadata))
                 continue
 
+            # Reviewer note: warn for very large CSVs in row mode (memory consideration)
+            try:
+                size_bytes = len(raw)
+                if size_bytes > _ROW_MODE_SIZE_WARN_BYTES:
+                    logger.warning(
+                        "CSVToDocument(row): parsing a large CSV (~{mb:.1f} MB). "
+                        "Consider chunking/streaming if you hit memory issues.",
+                        mb=size_bytes / (1024 * 1024),
+                    )
+            except Exception:
+                pass
+
             # Mode: row -> one Document per CSV row
             try:
                 reader = csv.DictReader(io.StringIO(data), delimiter=self.delimiter, quotechar=self.quotechar)
@@ -140,26 +162,68 @@ def run(
                 documents.append(Document(content=data, meta=merged_metadata))
                 continue
 
+            # Validate content_column presence; fall back to listing if missing
+            effective_content_col = self.content_column
+            header = reader.fieldnames or []
+            if effective_content_col and effective_content_col not in header:
+                logger.warning(
+                    "CSVToDocument(row): content_column='{col}' not found in header for {source}; "
+                    "falling back to key: value listing.",
+                    col=effective_content_col,
+                    source=source,
+                )
+                effective_content_col = None
+
             for i, row in enumerate(reader):
-                row_meta = dict(merged_metadata)  # start with file-level / meta-param / bytestream meta
-
-                # Determine content from the selected column or fall back to a friendly listing
-                if self.content_column:
-                    content = row.get(self.content_column, "")
-                    if content is None:
-                        content = ""
-                else:
-                    # "key: value" per line for readability
-                    content = "\n".join(f"{k}: {v if v is not None else ''}" for k, v in row.items())
-
-                # Add remaining columns into meta (don't override existing keys like file_path, encoding, etc.)
-                for k, v in row.items():
-                    if self.content_column and k == self.content_column:
-                        continue
-                    if k not in row_meta:
-                        row_meta[k] = "" if v is None else v
-
-                row_meta["row_number"] = i
-                documents.append(Document(content=content, meta=row_meta))
+                # Protect against malformed rows (reviewer suggestion)
+                try:
+                    doc = self._build_document_from_row(
+                        row=row, base_meta=merged_metadata, row_index=i, content_column=effective_content_col
+                    )
+                    documents.append(doc)
+                except Exception as e:
+                    logger.warning(
+                        "CSVToDocument(row): skipping malformed row {row_index} in {source}. Error: {error}",
+                        row_index=i,
+                        source=source,
+                        error=e,
+                    )
 
         return {"documents": documents}
+
+    # ----- helpers -----
+    def _safe_value(self, value: Any) -> str:
+        """Normalize CSV cell values: None -> '', everything else -> str."""
+        return "" if value is None else str(value)
+
+    def _build_document_from_row(
+        self, row: dict[str, Any], base_meta: dict[str, Any], row_index: int, content_column: Optional[str]
+    ) -> Document:
+        """
+        Create a Document from a single CSV row. Does not catch exceptions; the caller wraps this call.
+        """
+        row_meta = dict(base_meta)
+
+        # content
+        if content_column:
+            content = self._safe_value(row.get(content_column))
+        else:
+            content = "\n".join(f"{k}: {self._safe_value(v)}" for k, v in row.items())
+
+        # merge remaining columns into meta with collision handling
+        for k, v in row.items():
+            if content_column and k == content_column:
+                continue
+            key_to_use = k
+            if key_to_use in row_meta:
+                # Avoid clobbering existing meta like file_path/encoding; prefix and de-dupe
+                base_key = f"csv_{key_to_use}"
+                key_to_use = base_key
+                suffix = 1
+                while key_to_use in row_meta:
+                    key_to_use = f"{base_key}_{suffix}"
+                    suffix += 1
+            row_meta[key_to_use] = self._safe_value(v)
+
+        row_meta["row_number"] = row_index
+        return Document(content=content, meta=row_meta)
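
The subtlest piece above is the key-collision handling in `_build_document_from_row`: a CSV column whose name already exists in meta (`file_path`, `encoding`, ...) is stored under a `csv_` prefix, and a numeric suffix de-dupes when even the prefixed key is taken. A minimal standalone sketch of that logic, for illustration only (`resolve_meta_key` is a hypothetical name; the component inlines this loop):

    # Hypothetical standalone version of the csv_-prefix de-dupe used above.
    def resolve_meta_key(column: str, meta: dict) -> str:
        if column not in meta:
            return column  # no collision: keep the column name as-is
        base = f"csv_{column}"
        candidate = base
        suffix = 1
        while candidate in meta:  # prefixed key taken too: try _1, _2, ...
            candidate = f"{base}_{suffix}"
            suffix += 1
        return candidate

    meta = {"file_path": "collide.csv", "csv_file_path": "already-taken"}
    print(resolve_meta_key("comment", meta))    # -> comment
    print(resolve_meta_key("file_path", meta))  # -> csv_file_path_1

The suffix must advance on every pass of the while loop; otherwise a second collision on the prefixed key would never terminate.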

test/components/converters/test_csv_to_document.py

Lines changed: 58 additions & 0 deletions
@@ -103,6 +103,64 @@ def test_run_with_meta(self):
         # check that the metadata from the bytestream is merged with that from the meta parameter
         assert document.meta == {"name": "test_name", "language": "it"}
 
+    # --- New tests for the row-mode reviewer feedback ---
+
+    def test_row_mode_with_missing_content_column_warns_and_falls_back(self, tmp_path, caplog):
+        csv_text = "a,b\r\n1,2\r\n3,4\r\n"
+        f = tmp_path / "miss.csv"
+        f.write_text(csv_text, encoding="utf-8")
+        bs = ByteStream.from_file_path(f)
+        bs.meta["file_path"] = str(f)
+
+        conv = CSVToDocument(conversion_mode="row", content_column="missing")
+        with caplog.at_level(logging.WARNING):
+            out = conv.run(sources=[bs])
+        assert "content_column='missing' not found" in caplog.text
+        docs = out["documents"]
+        assert len(docs) == 2
+        # Fallback content is a readable listing
+        assert "a: 1" in docs[0].content and "b: 2" in docs[0].content
+
+    def test_row_mode_meta_collision_prefixed(self, tmp_path):
+        # ByteStream meta has file_path and encoding; the CSV also has columns with those names.
+        csv_text = "file_path,encoding,comment\r\nrowpath.csv,latin1,ok\r\n"
+        f = tmp_path / "collide.csv"
+        f.write_text(csv_text, encoding="utf-8")
+        bs = ByteStream.from_file_path(f)
+        bs.meta["file_path"] = str(f)
+        bs.meta["encoding"] = "utf-8"
+
+        conv = CSVToDocument(conversion_mode="row")
+        out = conv.run(sources=[bs])
+        d = out["documents"][0]
+        # Original meta preserved
+        assert d.meta["file_path"] == os.path.basename(str(f))
+        assert d.meta["encoding"] == "utf-8"
+        # CSV columns stored with the csv_ prefix (no clobbering)
+        assert d.meta["csv_file_path"] == "rowpath.csv"
+        assert d.meta["csv_encoding"] == "latin1"
+        assert d.meta["comment"] == "ok"
+
+    def test_init_validates_delimiter_and_quotechar(self):
+        with pytest.raises(ValueError):
+            CSVToDocument(delimiter=";;")
+        with pytest.raises(ValueError):
+            CSVToDocument(quotechar='""')
+
+    def test_row_mode_large_file_warns(self, tmp_path, caplog):
+        # Build a ~6 MB CSV (600k short rows) to exceed the ~5 MB row-mode
+        # warning threshold while keeping generation fast.
+        rows = 600_000
+        header = "text,author\n"
+        body = "".join("hello,Ada\n" for _ in range(rows))
+        data = (header + body).encode("utf-8")
+        bs = ByteStream(data=data, meta={"file_path": "big.csv"})
+        conv = CSVToDocument(conversion_mode="row")
+        with caplog.at_level(logging.WARNING):
+            _ = conv.run(sources=[bs])
+        # Not asserting the exact MB value to avoid brittleness; look for the key phrase
+        assert "parsing a large CSV" in caplog.text
+
     def test_row_mode_with_content_column(self, tmp_path):
         """
         Each row becomes a Document, with `content` from a chosen column and other columns in meta.
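
For orientation, a minimal end-to-end sketch of the row mode these tests exercise, assuming this branch's `conversion_mode` and `content_column` parameters (the CSV payload and file name are made up for illustration):

    from haystack.components.converters.csv import CSVToDocument
    from haystack.dataclasses import ByteStream

    # Two-row CSV; "body" becomes Document.content, "title" lands in meta.
    csv_bytes = b"title,body\r\nGreeting,Hello there\r\nFarewell,Goodbye now\r\n"
    bs = ByteStream(data=csv_bytes, meta={"file_path": "demo.csv"})

    converter = CSVToDocument(conversion_mode="row", content_column="body")
    docs = converter.run(sources=[bs])["documents"]

    print(len(docs))                    # -> 2
    print(docs[0].content)              # -> Hello there
    print(docs[0].meta["title"])        # remaining columns land in meta
    print(docs[0].meta["row_number"])   # -> 0

    # Without content_column, each row falls back to a "key: value" listing:
    fallback = CSVToDocument(conversion_mode="row").run(sources=[bs])["documents"]
    print(fallback[0].content)          # -> "title: Greeting" and "body: Hello there" on separate lines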
