DOC: whatsnew entry for on_bad_lines regression fix (GH#61837)

skalwaghe-56 · skalwaghe-56 · commit d0f9a6e486ed · 2025-09-13T15:28:13.000+05:30
diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
@@ -25,6 +25,8 @@ Bug fixes
 - Fix bug in :meth:`Series.str.replace` using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`)
 - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch``
   with a compiled regex and custom flags (:issue:`62240`)
+- Fix regression when an ``on_bad_lines`` callable returns too many fields: a ``ParserWarning``
+  is now emitted and the extra fields are truncated regardless of ``index_col`` (:issue:`61837`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_233.contributors:
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -621,12 +621,15 @@ def _check_data_length(
             empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
             if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                 return
-            warnings.warn(
-                "Length of header or names does not match length of data. This leads "
-                "to a loss of data with index_col=False.",
-                ParserWarning,
-                stacklevel=find_stack_level(),
-            )
+            # Only warn when on_bad_lines is ERROR; other modes handle bad lines.
+            if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
+                warnings.warn(
+                    "Length of header or names does not match length of data. "
+                    "This leads "
+                    "to a loss of data with index_col=False.",
+                    ParserWarning,
+                    stacklevel=find_stack_level(),
+                )
 
     @final
     def _validate_usecols_names(self, usecols: SequenceT, names: Sequence) -> SequenceT:
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -21,6 +21,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._typing import Scalar
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -77,7 +78,6 @@
         ArrayLike,
         DtypeObj,
         ReadCsvBuffer,
-        Scalar,
         T,
     )
 
@@ -1194,23 +1194,28 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
                         new_l = self.on_bad_lines(_content)
                         if new_l is not None:
                             # Truncate extra elements and warn.
+                            new_l = cast(list[Scalar], new_l)
                             if len(new_l) > col_len:
                                 warnings.warn(
-                                    "Header/names length != data length. "
-                                    "Extra fields dropped.",
+                                    "Length of header or names does not match length "
+                                    "of data. This leads "
+                                    "to a loss of data with index_col=False.",
                                     ParserWarning,
                                     stacklevel=find_stack_level(),
                                 )
                                 new_l = new_l[:col_len]
-                            content.append(new_l)  # pyright: ignore[reportArgumentType]
+                            content.append(new_l)
+                    elif self.on_bad_lines == self.BadLineHandleMethod.ERROR:
+                        row_num = self.pos - (content_len - i + footers)
+                        bad_lines.append((row_num, actual_len))
+                        break
                     elif self.on_bad_lines in (
-                        self.BadLineHandleMethod.ERROR,
                         self.BadLineHandleMethod.WARN,
+                        self.BadLineHandleMethod.SKIP,
                     ):
                         row_num = self.pos - (content_len - i + footers)
                         bad_lines.append((row_num, actual_len))
-                        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
-                            break
+                        # For WARN and SKIP, don't append the bad content
                     else:
                         content.append(_content)
                 else: