From b39ff487c94ca2c5256adc8998cde1a567fe6479 Mon Sep 17 00:00:00 2001
From: Daniel Caspi
Date: Tue, 2 Sep 2025 14:54:30 -0500
Subject: [PATCH] BUG: Preserve leading zeros with dtype=str in pyarrow engine (#57666)

When ``dtype`` is passed to ``read_csv`` as a dict, translate each entry
with ``to_pyarrow_type`` and hand the result to pyarrow as
``convert_options["column_types"]`` so those columns are not type-inferred
(which is what dropped the leading zeros).

Limitations kept as TODOs in the code:
- dict entries whose conversion raises ``TypeError`` are skipped;
- a scalar ``dtype`` (e.g. ``dtype=str``) is still not forwarded, so the
  dedicated pyarrow test for that case is marked ``xfail``.
---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 pandas/io/parsers/arrow_parser_wrapper.py     | 26 ++++++
 .../io/parser/dtypes/test_dtypes_basic.py     | 86 +++++++++++++++++++
 3 files changed, 113 insertions(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index ffa65032e6aae..ead60ad82f8cf 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -987,6 +987,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
+- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
index ad39d0ebf4326..294cccea189ed 100644
--- a/pandas/io/parsers/arrow_parser_wrapper.py
+++ b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -19,6 +19,8 @@
 )
 from pandas.core.dtypes.inference import is_integer
 
+from pandas.core.arrays.arrow.array import to_pyarrow_type
+
 from pandas.io._util import arrow_table_to_pandas
 from pandas.io.parsers.base_parser import ParserBase
 
@@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
                 f"f{n}" for n in self.convert_options["include_columns"]
             ]
 
+        if self.dtype is not None:
+            if isinstance(self.dtype, dict):
+                column_types = {}
+                for col, col_dtype in self.dtype.items():
+                    source_dtype = pandas_dtype(col_dtype)
+
+                    try:
+                        target_dtype = to_pyarrow_type(source_dtype.type)
+                        if target_dtype:
+                            column_types[col] = target_dtype
+
+                    except TypeError:
+                        # TODO: Unsupported dtypes silently ignored - may cause
+                        # unexpected behavior when pyarrow applies default inference
+                        # instead of user's dtype
+                        pass
+
+                if column_types:
+                    self.convert_options["column_types"] = column_types
+            else:
+                # TODO: Global dtypes not supported - may cause inconsistent behavior
+                # between engines, especially for leading zero preservation
+                pass
+
         self.read_options = {
             "autogenerate_column_names": self.header is None,
             "skip_rows": self.header
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
index e4563afc631c5..b7ed04a1fbfc7 100644
--- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
+++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
     ).index
     expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
     tm.assert_index_equal(result, expected)
+
+
+def test_leading_zeros_preserved_with_dtype_str(all_parsers):
+    # GH#61618: ensure string dtype preservation across engines
+    parser = all_parsers
+    engine_name = getattr(parser, "engine", "unknown")
+
+    # Skip pyarrow engine as it has its own xfail test
+    if engine_name == "pyarrow":
+        pytest.skip("pyarrow engine tested separately with xfail")
+
+    data = """col1,col2,col3,col4
+AB,000388907,abc,0150
+CD,101044572,def,0150
+EF,000023607,ghi,0205
+GH,100102040,jkl,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype=str,
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+
+@pytest.mark.xfail(
+    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)", strict=False
+)
+def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
+    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
+    # This is a known issue that needs to be fixed in the pyarrow engine
+    parser = pyarrow_parser_only
+
+    data = """col1,col2,col3,col4
+AB,000388907,abc,0150
+CD,101044572,def,0150
+EF,000023607,ghi,0205
+GH,100102040,jkl,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype=str,
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+
+def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
+    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
+    # GH#61618: further discussion on ensuring string dtype preservation across engines
+
+    parser = all_parsers
+
+    data = """col1,col2,col3,col4
+AB,000388907,199,0150
+CD,101044572,200,0150
+EF,000023607,201,0205
+GH,100102040,202,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"col2": str, "col3": int, "col4": str},
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+    assert result.loc[0, "col3"] == 199
+    assert result.loc[1, "col3"] == 200
+    assert result.loc[2, "col3"] == 201
+    assert result.loc[3, "col3"] == 202