DOC: whatsnew entry for on_bad_lines regression fix (GH#61837)

skalwaghe-56 · skalwaghe-56 · commit f6887a225fb9 · 2025-09-17T22:30:08.000+05:30
diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
@@ -25,6 +25,8 @@ Bug fixes
 - Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
 - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch``
   with a compiled regex and custom flags (:issue:`62240`)
+- Fix regression in the Python parser engine: when an ``on_bad_lines`` callable returns more fields than expected,
+  a ``ParserWarning`` is now emitted and the extra fields are dropped, regardless of ``index_col`` (:issue:`61837`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_233.contributors:
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._typing import Scalar
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -77,7 +78,6 @@
         ArrayLike,
         DtypeObj,
         ReadCsvBuffer,
-        Scalar,
         T,
     )
 
@@ -1194,18 +1194,24 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
                         new_l = self.on_bad_lines(_content)
                         if new_l is not None:
                             # Truncate extra elements and warn.
+                            new_l = cast(list[Scalar], new_l)
                             if len(new_l) > col_len:
-                                warnings.warn(
-                                    "Header/names length != data length. "
-                                    "Extra fields dropped.",
-                                    ParserWarning,
-                                    stacklevel=find_stack_level(),
+                                msg = (
+                                    "Length returned by on_bad_lines callable "
+                                    "does not match expected "
+                                    f"number of columns ({col_len}). "
+                                    "Extra fields will be dropped."
+                                )
+                                self._alert_malformed(
+                                    msg, row_num=self.pos - (content_len - i + footers)
                                 )
                                 new_l = new_l[:col_len]
-                            content.append(new_l)  # pyright: ignore[reportArgumentType]
+                            content.append(new_l)
+
                     elif self.on_bad_lines in (
                         self.BadLineHandleMethod.ERROR,
                         self.BadLineHandleMethod.WARN,
+                        self.BadLineHandleMethod.SKIP,
                     ):
                         row_num = self.pos - (content_len - i + footers)
                         bad_lines.append((row_num, actual_len))
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -568,21 +568,17 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
 def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
     python_parser_only, index_col
 ):
-    """
-    GH#61837 regression: callable on_bad_lines returning extra fields must emit a
-    ParserWarning and drop extras regardless of index_col. [2][3]
-    """
+    # GH#61837
     parser = python_parser_only
     data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
 
     def fixer(bad_line):
-        # Over-return to trigger truncation + warning
         return list(bad_line) + ["EXTRA1", "EXTRA2"]
 
-    # Assert ParserWarning is emitted using module helper
-    df = parser.read_csv_check_warnings(
+    result = parser.read_csv_check_warnings(
         ParserWarning,
-        "Length of header or names",
+        "Length returned by on_bad_lines callable does "
+        "not match expected number of columns",
         StringIO(data),
         on_bad_lines=fixer,
         index_col=index_col,
@@ -602,4 +598,4 @@ def fixer(bad_line):
             index=Index([101, 102, 103], name="id"),
         )
 
-    tm.assert_frame_equal(df, expected)
+    tm.assert_frame_equal(result, expected)