BUG: read_csv(on_bad_lines=callable)+index_col should warn; add test

skalwaghe-56 · skalwaghe-56 · commit 2818870bcee0 · 2025-09-13T15:28:13.000+05:30
- Always emit ParserWarning and drop extra fields when an on_bad_lines callable returns more elements than expected, regardless of index_col, in PythonParser._rows_to_cols. [GH#61837]
- Ensure non-bad rows are appended in the outer else branch so good lines are preserved.
- Add regression test pandas/tests/io/parser/test_python_parser_only.py::test_on_bad_lines_callable_warns_and_truncates_with_index_col covering index_col in [None, 0].

Closes #61837.
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -1189,21 +1189,30 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
 
             for i, _content in iter_content:
                 actual_len = len(_content)
-
                 if actual_len > col_len:
                     if callable(self.on_bad_lines):
                         new_l = self.on_bad_lines(_content)
                         if new_l is not None:
+                            # Truncate extra elements and warn.
+                            if len(new_l) > col_len:
+                                warnings.warn(
+                                    "Header/names length != data length. "
+                                    "Extra fields dropped.",
+                                    ParserWarning,
+                                    stacklevel=find_stack_level(),
+                                )
+                                new_l = new_l[:col_len]
                             content.append(new_l)  # pyright: ignore[reportArgumentType]
                     elif self.on_bad_lines in (
                         self.BadLineHandleMethod.ERROR,
                         self.BadLineHandleMethod.WARN,
                     ):
                         row_num = self.pos - (content_len - i + footers)
                         bad_lines.append((row_num, actual_len))
-
                         if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                             break
+                    else:
+                        content.append(_content)
                 else:
                     content.append(_content)
 
diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py
@@ -562,3 +562,44 @@ def test_no_thousand_convert_for_non_numeric_cols(python_parser_only, dtype, exp
     expected = DataFrame(expected)
     expected.insert(0, "a", ["0000,7995", "3,03,001,00514", "4923,600,041"])
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_col", [None, 0])
+def test_on_bad_lines_callable_warns_and_truncates_with_index_col(
+    python_parser_only, index_col
+):
+    """
+    GH#61837 regression: callable on_bad_lines returning extra fields must emit a
+    ParserWarning and drop the extras regardless of index_col.
+    """
+    parser = python_parser_only
+    data = "id,field_1,field_2\n101,A,B\n102,C,D,E\n103,F,G\n"
+
+    def fixer(bad_line):
+        # Over-return to trigger truncation + warning
+        return list(bad_line) + ["EXTRA1", "EXTRA2"]
+
+    # The match must be the text _rows_to_cols emits when it truncates the row.
+    df = parser.read_csv_check_warnings(
+        ParserWarning,
+        "Header/names length != data length",
+        StringIO(data),
+        on_bad_lines=fixer,
+        index_col=index_col,
+    )
+
+    if index_col is None:
+        expected = DataFrame(
+            {
+                "id": [101, 102, 103],
+                "field_1": ["A", "C", "F"],
+                "field_2": ["B", "D", "G"],
+            }
+        )
+    else:
+        expected = DataFrame(
+            {"field_1": ["A", "C", "F"], "field_2": ["B", "D", "G"]},
+            index=Index([101, 102, 103], name="id"),
+        )
+
+    tm.assert_frame_equal(df, expected)