Skip to content

Commit f6887a2

Browse files
committed
DOC: whatsnew entry for on_bad_lines regression fix (GH#61837)
1 parent 6e3127b commit f6887a2

File tree

3 files changed

+20
-16
lines changed

3 files changed

+20
-16
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ Bug fixes
2525
- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
2626
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
2727
with a compiled regex and custom flags (:issue:`62240`)
28+
- Fix regression in :func:`read_csv` when a callable ``on_bad_lines`` returns too many fields: it now emits a
29+
``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
2830

2931
.. ---------------------------------------------------------------------------
3032
.. _whatsnew_233.contributors:

pandas/io/parsers/python_parser.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import numpy as np
2222

2323
from pandas._libs import lib
24+
from pandas._typing import Scalar
2425
from pandas.errors import (
2526
EmptyDataError,
2627
ParserError,
@@ -77,7 +78,6 @@
7778
ArrayLike,
7879
DtypeObj,
7980
ReadCsvBuffer,
80-
Scalar,
8181
T,
8282
)
8383

@@ -1194,18 +1194,24 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11941194
new_l = self.on_bad_lines(_content)
11951195
if new_l is not None:
11961196
# Truncate extra elements and warn.
1197+
new_l = cast(list[Scalar], new_l)
11971198
if len(new_l) > col_len:
1198-
warnings.warn(
1199-
"Header/names length != data length. "
1200-
"Extra fields dropped.",
1201-
ParserWarning,
1202-
stacklevel=find_stack_level(),
1199+
msg = (
1200+
"Length returned by on_bad_lines callable "
1201+
"does not match expected "
1202+
f"number of columns ({col_len}). "
1203+
"Extra fields will be dropped."
1204+
)
1205+
self._alert_malformed(
1206+
msg, row_num=self.pos - (content_len - i + footers)
12031207
)
12041208
new_l = new_l[:col_len]
1205-
content.append(new_l) # pyright: ignore[reportArgumentType]
1209+
content.append(new_l)
1210+
12061211
elif self.on_bad_lines in (
12071212
self.BadLineHandleMethod.ERROR,
12081213
self.BadLineHandleMethod.WARN,
1214+
self.BadLineHandleMethod.SKIP,
12091215
):
12101216
row_num = self.pos - (content_len - i + footers)
12111217
bad_lines.append((row_num, actual_len))

pandas/tests/io/parser/test_python_parser_only.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -568,21 +568,17 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
568568
def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
569569
python_parser_only, index_col
570570
):
571-
"""
572-
GH#61837 regression: callable on_bad_lines returning extra fields must emit a
573-
ParserWarning and drop extras regardless of index_col. [2][3]
574-
"""
571+
# GH#61837
575572
parser = python_parser_only
576573
data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
577574

578575
def fixer(bad_line):
579-
# Over-return to trigger truncation + warning
580576
return list(bad_line) + ["EXTRA1", "EXTRA2"]
581577

582-
# Assert ParserWarning is emitted using module helper
583-
df = parser.read_csv_check_warnings(
578+
result = parser.read_csv_check_warnings(
584579
ParserWarning,
585-
"Length of header or names",
580+
"Length returned by on_bad_lines callable does "
581+
"not match expected number of columns",
586582
StringIO(data),
587583
on_bad_lines=fixer,
588584
index_col=index_col,
@@ -602,4 +598,4 @@ def fixer(bad_line):
602598
index=Index([101, 102, 103], name="id"),
603599
)
604600

605-
tm.assert_frame_equal(df, expected)
601+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)