BUG: Preserve leading zeros with dtype=str in pyarrow engine (#57666)

dxdc · dxdc · commit b39ff487c94c · 2025-09-25T09:45:37.000-05:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -987,6 +987,7 @@ I/O
 - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
 - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`)
 - Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
+- Bug in :meth:`read_csv` with ``engine="pyarrow"`` not preserving leading zeros when string dtypes are specified via a ``dtype`` dictionary (:issue:`57666`)
 - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
 - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
 - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -19,6 +19,8 @@
 )
 from pandas.core.dtypes.inference import is_integer
 
+from pandas.core.arrays.arrow.array import to_pyarrow_type
+
 from pandas.io._util import arrow_table_to_pandas
 from pandas.io.parsers.base_parser import ParserBase
 
@@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
                 f"f{n}" for n in self.convert_options["include_columns"]
             ]
 
+        if self.dtype is not None:
+            if isinstance(self.dtype, dict):
+                column_types = {}
+                for col, col_dtype in self.dtype.items():
+                    source_dtype = pandas_dtype(col_dtype)
+
+                    try:
+                        target_dtype = to_pyarrow_type(source_dtype.type)
+                        if target_dtype:
+                            column_types[col] = target_dtype
+
+                    except TypeError:
+                        # TODO: Unsupported dtypes silently ignored - may cause
+                        # unexpected behavior when pyarrow applies default inference
+                        # instead of user's dtype
+                        pass
+
+                if column_types:
+                    self.convert_options["column_types"] = column_types
+            else:
+                # TODO: Global dtypes not supported - may cause inconsistent behavior
+                # between engines, especially for leading zero preservation
+                pass
+
         self.read_options = {
             "autogenerate_column_names": self.header is None,
             "skip_rows": self.header
diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py
@@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
     ).index
     expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
     tm.assert_index_equal(result, expected)
+
+
+def test_leading_zeros_preserved_with_dtype_str(all_parsers):
+    # GH#61618: dtype=str must keep leading zeros on every non-pyarrow engine
+    parser = all_parsers
+    engine_name = getattr(parser, "engine", "unknown")
+
+    # pyarrow may still strip zeros for a global dtype; see the xfail test below
+    if engine_name == "pyarrow":
+        pytest.skip("pyarrow engine tested separately with xfail")
+
+    data = """col1,col2,col3,col4
+AB,000388907,abc,0150
+CD,101044572,def,0150
+EF,000023607,ghi,0205
+GH,100102040,jkl,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype=str,
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+
+@pytest.mark.xfail(
+    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)", strict=False
+)
+def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
+    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
+    # Known gap: only dict dtypes are forwarded to pyarrow as column_types so far
+    parser = pyarrow_parser_only
+
+    data = """col1,col2,col3,col4
+AB,000388907,abc,0150
+CD,101044572,def,0150
+EF,000023607,ghi,0205
+GH,100102040,jkl,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype=str,
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+
+def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
+    # GH#57666: per-column str dtypes must keep leading zeros on all engines
+    # GH#61618: further discussion on ensuring string dtype preservation across engines
+
+    parser = all_parsers
+
+    data = """col1,col2,col3,col4
+AB,000388907,199,0150
+CD,101044572,200,0150
+EF,000023607,201,0205
+GH,100102040,202,0205"""
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"col2": str, "col3": int, "col4": str},  # col1 is left to inference
+    )
+
+    assert result.shape == (4, 4)
+    assert list(result.columns) == ["col1", "col2", "col3", "col4"]
+
+    assert result.loc[0, "col2"] == "000388907", "lost zeros in col2 row 0"
+    assert result.loc[2, "col2"] == "000023607", "lost zeros in col2 row 2"
+    assert result.loc[0, "col4"] == "0150", "lost zeros in col4 row 0"
+    assert result.loc[2, "col4"] == "0205", "lost zeros in col4 row 2"
+
+    assert result.loc[0, "col3"] == 199
+    assert result.loc[1, "col3"] == 200
+    assert result.loc[2, "col3"] == 201
+    assert result.loc[3, "col3"] == 202