DOC: whatsnew entry for on_bad_lines regression fix (GH#61837)

skalwaghe-56 · skalwaghe-56 · commit 86d45aa6768a · 2025-09-24T15:32:47.000+05:30
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -1003,6 +1003,8 @@ MultiIndex
 I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
+- Bug in :func:`read_csv` with ``engine="python"`` when an ``on_bad_lines`` callable returns too many fields: it now emits a
+  ``ParserWarning`` and truncates the extra fields regardless of ``index_col`` (:issue:`61837`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
 - Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._typing import Scalar
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -77,7 +78,6 @@
         ArrayLike,
         DtypeObj,
         ReadCsvBuffer,
-        Scalar,
         T,
     )
 
@@ -954,7 +954,9 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
         """
         if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
             raise ParserError(msg)
-        if self.on_bad_lines == self.BadLineHandleMethod.WARN:
+        if self.on_bad_lines == self.BadLineHandleMethod.WARN or callable(
+            self.on_bad_lines
+        ):
             warnings.warn(
                 f"Skipping line {row_num}: {msg}\n",
                 ParserWarning,
@@ -1193,34 +1195,31 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
                     if callable(self.on_bad_lines):
                         new_l = self.on_bad_lines(_content)
                         if new_l is not None:
-                            # Truncate extra elements and warn.
+                            new_l = cast(list[Scalar], new_l)
                             if len(new_l) > col_len:
-                                warnings.warn(
-                                    "Header/names length != data length. "
-                                    "Extra fields dropped.",
-                                    ParserWarning,
-                                    stacklevel=find_stack_level(),
-                                )
+                                row_num = self.pos - (content_len - i + footers)
+                                bad_lines.append((row_num, len(new_l), "callable"))
                                 new_l = new_l[:col_len]
-                            content.append(new_l)  # pyright: ignore[reportArgumentType]
+                            content.append(new_l)
+
                     elif self.on_bad_lines in (
                         self.BadLineHandleMethod.ERROR,
                         self.BadLineHandleMethod.WARN,
                     ):
                         row_num = self.pos - (content_len - i + footers)
-                        bad_lines.append((row_num, actual_len))
+                        bad_lines.append((row_num, actual_len, "normal"))
                         if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                             break
-                    else:
-                        content.append(_content)
                 else:
                     content.append(_content)
 
-            for row_num, actual_len in bad_lines:
+            for row_num, actual_len, source in bad_lines:
                 msg = (
                     f"Expected {col_len} fields in line {row_num + 1}, saw {actual_len}"
                 )
-                if (
+                if source == "callable":
+                    msg += " from bad_lines callable"
+                elif (
                     self.delimiter
                     and len(self.delimiter) > 1
                     and self.quoting != csv.QUOTE_NONE
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -432,7 +432,7 @@ def test_on_bad_lines_callable_not_expected_length(python_parser_only):
     bad_sio = StringIO(data)
 
     result = parser.read_csv_check_warnings(
-        ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
+        ParserWarning, "from bad_lines callable", bad_sio, on_bad_lines=lambda x: x
     )
     expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
     tm.assert_frame_equal(result, expected)
@@ -568,21 +568,16 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
 def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
     python_parser_only, index_col
 ):
-    """
-    GH#61837 regression: callable on_bad_lines returning extra fields must emit a
-    ParserWarning and drop extras regardless of index_col. [2][3]
-    """
+    # GH#61837
     parser = python_parser_only
     data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
 
     def fixer(bad_line):
-        # Over-return to trigger truncation + warning
         return list(bad_line) + ["EXTRA1", "EXTRA2"]
 
-    # Assert ParserWarning is emitted using module helper
-    df = parser.read_csv_check_warnings(
+    result = parser.read_csv_check_warnings(
         ParserWarning,
-        "Length of header or names",
+        "from bad_lines callable",
         StringIO(data),
         on_bad_lines=fixer,
         index_col=index_col,
@@ -602,4 +597,4 @@ def fixer(bad_line):
             index=Index([101, 102, 103], name="id"),
         )
 
-    tm.assert_frame_equal(df, expected)
+    tm.assert_frame_equal(result, expected)