Skip to content

Commit 2818870

Browse files
committed
BUG: read_csv(on_bad_lines=callable)+index_col should warn; add test
- Always emit ParserWarning and drop extra fields when an on_bad_lines callable returns more elements than expected, regardless of index_col, in PythonParser._rows_to_cols (GH#61837).
- Ensure non-bad rows are appended in the outer else branch so good lines are preserved.
- Add regression test pandas/tests/io/parser/test_python_parser_only.py::test_on_bad_lines_callable_warns_and_truncates_with_index_col covering index_col in [None, 0].

Closes #61837.
1 parent cc40732 commit 2818870

File tree

2 files changed

+52
-2
lines changed

2 files changed

+52
-2
lines changed

pandas/io/parsers/python_parser.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,21 +1189,30 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11891189

11901190
for i, _content in iter_content:
11911191
actual_len = len(_content)
1192-
11931192
if actual_len > col_len:
11941193
if callable(self.on_bad_lines):
11951194
new_l = self.on_bad_lines(_content)
11961195
if new_l is not None:
1196+
# Truncate extra elements and warn.
1197+
if len(new_l) > col_len:
1198+
warnings.warn(
1199+
"Header/names length != data length. "
1200+
"Extra fields dropped.",
1201+
ParserWarning,
1202+
stacklevel=find_stack_level(),
1203+
)
1204+
new_l = new_l[:col_len]
11971205
content.append(new_l) # pyright: ignore[reportArgumentType]
11981206
elif self.on_bad_lines in (
11991207
self.BadLineHandleMethod.ERROR,
12001208
self.BadLineHandleMethod.WARN,
12011209
):
12021210
row_num = self.pos - (content_len - i + footers)
12031211
bad_lines.append((row_num, actual_len))
1204-
12051212
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
12061213
break
1214+
else:
1215+
content.append(_content)
12071216
else:
12081217
content.append(_content)
12091218

pandas/tests/io/parser/test_python_parser_only.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,44 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
562562
expected = DataFrame(expected)
563563
expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
564564
tm.assert_frame_equal(result, expected)
565+
566+
567+
@pytest.mark.parametrize("index_col", [None, 0])
def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
    python_parser_only, index_col
):
    """
    GH#61837 regression: callable on_bad_lines returning extra fields must emit a
    ParserWarning and drop extras regardless of index_col.

    The input has a three-column header and one over-long row (``102,C,D,E``).
    The callable makes that row even longer, so the parser is expected to warn
    and keep only the first three fields of whatever the callable returns.
    """
    parser = python_parser_only
    data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"

    def fixer(bad_line):
        # Over-return to trigger truncation + warning
        return list(bad_line) + ["EXTRA1", "EXTRA2"]

    # The message pattern has to match the text emitted on truncation in
    # PythonParser._rows_to_cols ("Header/names length != data length. Extra
    # fields dropped."); asserting on a different message would make this
    # test fail even when the warning is raised correctly.
    df = parser.read_csv_check_warnings(
        ParserWarning,
        "Header/names length != data length",
        StringIO(data),
        on_bad_lines=fixer,
        index_col=index_col,
    )

    if index_col is None:
        # No index column: "id" stays a regular data column.
        expected = DataFrame(
            {
                "id": [101, 102, 103],
                "field_1": ["A", "C", "F"],
                "field_2": ["B", "D", "G"],
            }
        )
    else:
        # index_col=0: "id" becomes the index; extras are still dropped.
        expected = DataFrame(
            {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
            index=Index([101, 102, 103], name="id"),
        )

    tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)