Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1075,6 +1075,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_csv` with dictionary-based dtype specifications not preserving leading zeros consistently across parser engines (:issue:`57666`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down
26 changes: 26 additions & 0 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
)
from pandas.core.dtypes.inference import is_integer

from pandas.core.arrays.arrow.array import to_pyarrow_type

from pandas.io._util import arrow_table_to_pandas
from pandas.io.parsers.base_parser import ParserBase

Expand Down Expand Up @@ -139,6 +141,30 @@ def handle_warning(invalid_row) -> str:
f"f{n}" for n in self.convert_options["include_columns"]
]

        # Map user-supplied per-column dtypes onto pyarrow's
        # ``convert_options["column_types"]`` so the pyarrow engine parses each
        # column with the requested type (e.g. ``str`` keeps leading zeros)
        # instead of re-inferring types itself (GH#57666).
        if self.dtype is not None:
            if isinstance(self.dtype, dict):
                column_types = {}
                for col, col_dtype in self.dtype.items():
                    # Normalize whatever the user passed (string alias, numpy
                    # dtype, ExtensionDtype) into a pandas dtype object.
                    source_dtype = pandas_dtype(col_dtype)

                    try:
                        # NOTE(review): ``source_dtype.type`` is the scalar
                        # type of the dtype; to_pyarrow_type may return a
                        # falsy result for types it cannot map, hence the
                        # truthiness guard before recording the mapping.
                        target_dtype = to_pyarrow_type(source_dtype.type)
                        if target_dtype:
                            column_types[col] = target_dtype

                    except TypeError:
                        # TODO: Unsupported dtypes silently ignored - may cause
                        # unexpected behavior when pyarrow applies default inference
                        # instead of user's dtype
                        pass

                # Only touch convert_options when at least one column mapped;
                # an empty dict would needlessly override pyarrow defaults.
                if column_types:
                    self.convert_options["column_types"] = column_types
            else:
                # TODO: Global dtypes not supported - may cause inconsistent behavior
                # between engines, especially for leading zero preservation
                pass

self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
Expand Down
86 changes: 86 additions & 0 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
).index
expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id")
tm.assert_index_equal(result, expected)


def test_leading_zeros_preserved_with_dtype_str(all_parsers):
    # GH#61618: with dtype=str the c and python engines must preserve
    # leading zeros in numeric-looking columns.
    parser = all_parsers

    # The pyarrow engine currently strips leading zeros with dtype=str and is
    # covered by a dedicated xfail test (GH#57666).
    if parser.engine == "pyarrow":
        pytest.skip("pyarrow engine tested separately with xfail")

    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

    result = parser.read_csv(StringIO(data), dtype=str)

    # Compare the full frame (values and dtype) instead of spot-checking
    # cells, matching the tm.assert_* convention used in this module.
    expected = pd.DataFrame(
        {
            "col1": ["AB", "CD", "EF", "GH"],
            "col2": ["000388907", "101044572", "000023607", "100102040"],
            "col3": ["abc", "def", "ghi", "jkl"],
            "col4": ["0150", "0150", "0205", "0205"],
        },
        dtype=str,
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(
    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)", strict=False
)
def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
    # GH#57666: the pyarrow engine drops leading zeros when dtype=str is
    # passed; kept as xfail until the engine honors string dtypes here.
    parser = pyarrow_parser_only

    data = (
        "col1,col2,col3,col4\n"
        "AB,000388907,abc,0150\n"
        "CD,101044572,def,0150\n"
        "EF,000023607,ghi,0205\n"
        "GH,100102040,jkl,0205"
    )

    result = parser.read_csv(StringIO(data), dtype=str)

    assert result.shape == (4, 4)
    assert result.columns.tolist() == ["col1", "col2", "col3", "col4"]

    # (row, column, expected string) triples covering both zero-padded columns
    for row, col, expected in [
        (0, "col2", "000388907"),
        (2, "col2", "000023607"),
        (0, "col4", "0150"),
        (2, "col4", "0205"),
    ]:
        assert result.loc[row, col] == expected, f"lost zeros in {col} row {row}"


def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
    # GH#57666: pyarrow engine strips leading zeros when dtype=str is passed
    # GH#61618: further discussion on ensuring string dtype preservation
    # across engines. A per-column dtype dict must keep leading zeros in the
    # str columns while still parsing the int column numerically.
    parser = all_parsers

    data = (
        "col1,col2,col3,col4\n"
        "AB,000388907,199,0150\n"
        "CD,101044572,200,0150\n"
        "EF,000023607,201,0205\n"
        "GH,100102040,202,0205"
    )

    result = parser.read_csv(
        StringIO(data), dtype={"col2": str, "col3": int, "col4": str}
    )

    assert result.shape == (4, 4)
    assert result.columns.tolist() == ["col1", "col2", "col3", "col4"]

    # String columns keep their zero padding.
    for row, col, expected in [
        (0, "col2", "000388907"),
        (2, "col2", "000023607"),
        (0, "col4", "0150"),
        (2, "col4", "0205"),
    ]:
        assert result.loc[row, col] == expected, f"lost zeros in {col} row {row}"

    # Integer column is parsed numerically.
    for row, expected in enumerate([199, 200, 201, 202]):
        assert result.loc[row, "col3"] == expected
Loading