Skip to content

Commit d0f9a6e

Browse files
committed
DOC: whatsnew entry for on_bad_lines regression fix (GH#61837)
1 parent 2818870 commit d0f9a6e

File tree

3 files changed

+23
-13
lines changed

3 files changed

+23
-13
lines changed

doc/source/whatsnew/v2.3.3.rst

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,8 @@ Bug fixes
2525
- Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
2626
- Fix regression in :meth:`~Series.str.contains`, :meth:`~Series.str.match` and :meth:`~Series.str.fullmatch`
2727
with a compiled regex and custom flags (:issue:`62240`)
28+
- Fix regression in :func:`read_csv` when an ``on_bad_lines`` callable returns too many fields: now emits
29+
``ParserWarning`` and truncates extra fields regardless of ``index_col`` (:issue:`61837`)
2830

2931
.. ---------------------------------------------------------------------------
3032
.. _whatsnew_233.contributors:

pandas/io/parsers/base_parser.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -621,12 +621,15 @@ def _check_data_length(
621621
empty_str_or_na = empty_str | isna(data[-1]) # type: ignore[operator]
622622
if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
623623
return
624-
warnings.warn(
625-
"Length of header or names does not match length of data. This leads "
626-
"to a loss of data with index_col=False.",
627-
ParserWarning,
628-
stacklevel=find_stack_level(),
629-
)
624+
# Only warn in ERROR mode; don't warn if on_bad_lines is set to handle bad lines
625+
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
626+
warnings.warn(
627+
"Length of header or names does not match length of data. "
628+
"This leads "
629+
"to a loss of data with index_col=False.",
630+
ParserWarning,
631+
stacklevel=find_stack_level(),
632+
)
630633

631634
@final
632635
def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:

pandas/io/parsers/python_parser.py

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
import numpy as np
2222

2323
from pandas._libs import lib
24+
from pandas._typing import Scalar
2425
from pandas.errors import (
2526
EmptyDataError,
2627
ParserError,
@@ -77,7 +78,6 @@
7778
ArrayLike,
7879
DtypeObj,
7980
ReadCsvBuffer,
80-
Scalar,
8181
T,
8282
)
8383

@@ -1194,23 +1194,28 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
11941194
new_l = self.on_bad_lines(_content)
11951195
if new_l is not None:
11961196
# Truncate extra elements and warn.
1197+
new_l = cast(list[Scalar], new_l)
11971198
if len(new_l) > col_len:
11981199
warnings.warn(
1199-
"Header/names length != data length. "
1200-
"Extra fields dropped.",
1200+
"Length of header or names does not match length "
1201+
"of data. This leads "
1202+
"to a loss of data with index_col=False.",
12011203
ParserWarning,
12021204
stacklevel=find_stack_level(),
12031205
)
12041206
new_l = new_l[:col_len]
1205-
content.append(new_l) # pyright: ignore[reportArgumentType]
1207+
content.append(new_l)
1208+
elif self.on_bad_lines == self.BadLineHandleMethod.ERROR:
1209+
row_num = self.pos - (content_len - i + footers)
1210+
bad_lines.append((row_num, actual_len))
1211+
break
12061212
elif self.on_bad_lines in (
1207-
self.BadLineHandleMethod.ERROR,
12081213
self.BadLineHandleMethod.WARN,
1214+
self.BadLineHandleMethod.SKIP,
12091215
):
12101216
row_num = self.pos - (content_len - i + footers)
12111217
bad_lines.append((row_num, actual_len))
1212-
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
1213-
break
1218+
# For WARN and SKIP, don't append the bad content
12141219
else:
12151220
content.append(_content)
12161221
else:

0 commit comments

Comments
 (0)