 
 import logging
 import os
+from pathlib import Path
 
 import pytest
 
@@ -17,10 +18,10 @@ def csv_converter():
 
 
 class TestCSVToDocument:
-    def test_init(self, csv_converter):
+    def test_init(self, csv_converter: CSVToDocument):
         assert isinstance(csv_converter, CSVToDocument)
 
-    def test_run(self, test_files_path):
+    def test_run(self, test_files_path: Path):
         """
         Test if the component runs correctly.
         """
@@ -38,7 +39,7 @@ def test_run(self, test_files_path):
         assert docs[1].meta["file_path"] == os.path.basename(files[1])
         assert docs[2].meta["file_path"] == os.path.basename(files[2])
 
-    def test_run_with_store_full_path_false(self, test_files_path):
+    def test_run_with_store_full_path_false(self, test_files_path: Path):
         """
         Test if the component runs correctly with store_full_path=False
         """
@@ -57,7 +58,7 @@ def test_run_with_store_full_path_false(self, test_files_path):
         assert docs[1].meta["file_path"] == "sample_2.csv"
         assert docs[2].meta["file_path"] == "sample_3.csv"
 
-    def test_run_error_handling(self, test_files_path, caplog):
+    def test_run_error_handling(self, test_files_path: Path, caplog: pytest.LogCaptureFixture):
         """
         Test if the component correctly handles errors.
         """
@@ -74,7 +75,7 @@ def test_run_error_handling(self, test_files_path, caplog):
         assert len(docs) == 2
         assert docs[0].meta["file_path"] == os.path.basename(paths[0])
 
-    def test_encoding_override(self, test_files_path, caplog):
+    def test_encoding_override(self, test_files_path: Path, caplog: pytest.LogCaptureFixture):
         """
         Test if the encoding metadata field is used properly
         """
@@ -103,7 +104,7 @@ def test_run_with_meta(self):
         # check that the metadata from the bytestream is merged with that from the meta parameter
         assert document.meta == {"name": "test_name", "language": "it"}
 
-    # --- NEW TESTS for row mode reviewer asks ---
+    # --- NEW TESTS for row mode ---
 
     def test_row_mode_with_missing_content_column_warns_and_fallbacks(self, tmp_path, caplog):
         csv_text = "a,b\r\n1,2\r\n3,4\r\n"
@@ -121,7 +122,7 @@ def test_row_mode_with_missing_content_column_warns_and_fallbacks(self, tmp_path
         # Fallback content is a readable listing
         assert "a: 1" in docs[0].content and "b: 2" in docs[0].content
 
-    def test_row_mode_meta_collision_prefixed(self, tmp_path):
+    def test_row_mode_meta_collision_prefixed(self, tmp_path: Path):
         # ByteStream meta has file_path and encoding; CSV also has those columns.
         csv_text = "file_path,encoding,comment\r\nrowpath.csv,latin1,ok\r\n"
         f = tmp_path / "collide.csv"
@@ -147,21 +148,20 @@ def test_init_validates_delimiter_and_quotechar(self):
         with pytest.raises(ValueError):
             CSVToDocument(quotechar='""')
 
-    def test_row_mode_large_file_warns(self, tmp_path, caplog):
-        # Build a ~1.2MB CSV to trigger the warning (threshold ~5MB in component;
-        # If you want to keep this super fast, you can comment this test out.)
-        rows = 60_000
-        header = "text,author\n"
-        body = "".join("hello,Ada\n" for _ in range(rows))
-        data = (header + body).encode("utf-8")
-        bs = ByteStream(data=data, meta={"file_path": "big.csv"})
+    def test_row_mode_large_file_warns(self, caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch):
+        # Make the threshold tiny so the warning always triggers, regardless of platform.
+        import haystack.components.converters.csv as csv_mod
+
+        monkeypatch.setattr(csv_mod, "_ROW_MODE_SIZE_WARN_BYTES", 1, raising=False)
+
+        bs = ByteStream(data=b"text,author\nhi,Ada\n", meta={"file_path": "big.csv"})
         conv = CSVToDocument(conversion_mode="row")
-        with caplog.at_level(logging.WARNING):
+        # Capture the converter module's logger explicitly for reliability across CI runners.
+        with caplog.at_level(logging.WARNING, logger="haystack.components.converters.csv"):
             _ = conv.run(sources=[bs])
-        # Not asserting exact MB value to avoid brittleness; look for the key phrase
         assert "parsing a large CSV" in caplog.text
 
-    def test_row_mode_with_content_column(self, tmp_path):
+    def test_row_mode_with_content_column(self, tmp_path: Path):
         """
         Each row becomes a Document, with `content` from a chosen column and other columns in meta.
         """
@@ -185,7 +185,7 @@ def test_row_mode_with_content_column(self, tmp_path):
         # still respects store_full_path default=False trimming when present
         assert os.path.basename(f) == docs[0].meta["file_path"]
 
-    def test_row_mode_without_content_column(self, tmp_path):
+    def test_row_mode_without_content_column(self, tmp_path: Path):
         """
         Without `content_column`, the content is a human-readable 'key: value' listing of the row.
         """
@@ -202,7 +202,7 @@ def test_row_mode_without_content_column(self, tmp_path):
         assert docs[0].meta["a"] == "1" and docs[0].meta["b"] == "2"
         assert docs[0].meta["row_number"] == 0
 
-    def test_row_mode_meta_merging(self, tmp_path):
+    def test_row_mode_meta_merging(self, tmp_path: Path):
         """
         File-level meta and explicit `meta` arg are merged into each row's meta.
         """