Skip to content

Commit b39ff48

Browse files
committed
BUG: Preserve leading zeros with dtype=str in pyarrow engine (#57666)
1 parent aabbbc5 commit b39ff48

File tree

3 files changed

+113
-0
lines changed

3 files changed

+113
-0
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,7 @@ I/O
987987
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
988988
- Bug in :meth:`read_csv` where the order of ``na_values`` causes inconsistent results when ``na_values`` is a list of non-string values. (:issue:`59303`)
989989
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
990+
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` stripping leading zeros from columns declared as ``str`` in a dictionary ``dtype``, unlike the other parser engines (:issue:`57666`)
990991
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
991992
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
992993
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
)
2020
from pandas.core.dtypes.inference import is_integer
2121

22+
from pandas.core.arrays.arrow.array import to_pyarrow_type
23+
2224
from pandas.io._util import arrow_table_to_pandas
2325
from pandas.io.parsers.base_parser import ParserBase
2426

@@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
139141
f"f{n}" for n in self.convert_options["include_columns"]
140142
]
141143

144+
if self.dtype is not None:
145+
if isinstance(self.dtype, dict):
146+
column_types = {}
147+
for col, col_dtype in self.dtype.items():
148+
source_dtype = pandas_dtype(col_dtype)
149+
150+
try:
151+
target_dtype = to_pyarrow_type(source_dtype.type)
152+
if target_dtype:
153+
column_types[col] = target_dtype
154+
155+
except TypeError:
156+
# TODO: Unsupported dtypes silently ignored - may cause
157+
# unexpected behavior when pyarrow applies default inference
158+
# instead of user's dtype
159+
pass
160+
161+
if column_types:
162+
self.convert_options["column_types"] = column_types
163+
else:
164+
# TODO: Global dtypes not supported - may cause inconsistent behavior
165+
# between engines, especially for leading zero preservation
166+
pass
167+
142168
self.read_options = {
143169
"autogenerate_column_names": self.header is None,
144170
"skip_rows": self.header

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
636636
).index
637637
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
638638
tm.assert_index_equal(result, expected)
639+
640+
641+
def test_leading_zeros_preserved_with_dtype_str(all_parsers):
    # GH#61618: with dtype=str, zero-padded fields must be returned verbatim.
    parser = all_parsers

    # The pyarrow engine is exercised by its own xfail test, so bail out here.
    if getattr(parser, "engine", "unknown") == "pyarrow":
        pytest.skip("pyarrow engine tested separately with xfail")

    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

    result = parser.read_csv(StringIO(data), dtype=str)

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    # (row, column, expected text, failure message) for each zero-padded cell
    zero_padded_cells = [
        (0, "col2", "000388907", "lost zeros in col2 row 0"),
        (2, "col2", "000023607", "lost zeros in col2 row 2"),
        (0, "col4", "0150", "lost zeros in col4 row 0"),
        (2, "col4", "0205", "lost zeros in col4 row 2"),
    ]
    for row, column, expected, message in zero_padded_cells:
        assert result.loc[row, column] == expected, message
667+
668+
669+
@pytest.mark.xfail(
    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)",
    strict=False,
)
def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
    # GH#57666: passing dtype=str to the pyarrow engine drops leading zeros.
    # Marked xfail (non-strict) until the pyarrow engine handles this case.
    parser = pyarrow_parser_only

    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

    result = parser.read_csv(StringIO(data), dtype=str)

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    # (row, column, expected text, failure message) for each zero-padded cell
    zero_padded_cells = [
        (0, "col2", "000388907", "lost zeros in col2 row 0"),
        (2, "col2", "000023607", "lost zeros in col2 row 2"),
        (0, "col4", "0150", "lost zeros in col4 row 0"),
        (2, "col4", "0205", "lost zeros in col4 row 2"),
    ]
    for row, column, expected, message in zero_padded_cells:
        assert result.loc[row, column] == expected, message
694+
695+
696+
def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
    # GH#57666: leading zeros were stripped by the pyarrow engine for str columns
    # GH#61618: follow-up on keeping string dtype handling aligned across engines
    parser = all_parsers

    data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""

    # col2/col4 are requested as strings, col3 as integers; col1 is left
    # to the engine's default inference.
    per_column_dtype = {"col2": str, "col3": int, "col4": str}
    result = parser.read_csv(StringIO(data), dtype=per_column_dtype)

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    # (row, column, expected text, failure message) for each zero-padded cell
    zero_padded_cells = [
        (0, "col2", "000388907", "lost zeros in col2 row 0"),
        (2, "col2", "000023607", "lost zeros in col2 row 2"),
        (0, "col4", "0150", "lost zeros in col4 row 0"),
        (2, "col4", "0205", "lost zeros in col4 row 2"),
    ]
    for row, column, expected, message in zero_padded_cells:
        assert result.loc[row, column] == expected, message

    # The integer column must still hold the values from the CSV, row by row.
    for row, expected in enumerate([199, 200, 201, 202]):
        assert result.loc[row, "col3"] == expected

0 commit comments

Comments
 (0)