@@ -636,3 +636,89 @@ def test_index_col_with_dtype_no_rangeindex(all_parsers):
636
636
).index
637
637
expected = pd .Index ([0 , 1 ], dtype = np .uint32 , name = "bin_id" )
638
638
tm .assert_index_equal (result , expected )
639
+
640
+
641
def test_leading_zeros_preserved_with_dtype_str(all_parsers):
    # GH#61618: with dtype=str every field must come back as the exact text
    # found in the file, so zero-padded values keep their padding.
    parser = all_parsers

    # The pyarrow engine is exercised by a dedicated xfail test instead.
    if getattr(parser, "engine", "unknown") == "pyarrow":
        pytest.skip("pyarrow engine tested separately with xfail")

    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

    result = parser.read_csv(StringIO(data), dtype=str)

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    # (row label, column) -> text that must survive parsing untouched.
    zero_padded_cells = {
        (0, "col2"): "000388907",
        (2, "col2"): "000023607",
        (0, "col4"): "0150",
        (2, "col4"): "0205",
    }
    for (row, column), text in zero_padded_cells.items():
        assert result.loc[row, column] == text, f"lost zeros in {column} row {row}"
667
+
668
+
669
@pytest.mark.xfail(
    reason="pyarrow engine strips leading zeros with dtype=str (GH#57666)",
    strict=False,
)
def test_leading_zeros_preserved_with_dtype_str_pyarrow(pyarrow_parser_only):
    # GH#57666: known issue -- the pyarrow engine drops leading zeros when
    # dtype=str is requested. Marked xfail (non-strict) until that is fixed.
    parser = pyarrow_parser_only

    data = """col1,col2,col3,col4
AB,000388907,abc,0150
CD,101044572,def,0150
EF,000023607,ghi,0205
GH,100102040,jkl,0205"""

    result = parser.read_csv(StringIO(data), dtype=str)

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    # (row label, column) -> text that must survive parsing untouched.
    zero_padded_cells = {
        (0, "col2"): "000388907",
        (2, "col2"): "000023607",
        (0, "col4"): "0150",
        (2, "col4"): "0205",
    }
    for (row, column), text in zero_padded_cells.items():
        assert result.loc[row, column] == text, f"lost zeros in {column} row {row}"
694
+
695
+
696
def test_leading_zeros_preserved_with_dtype_dict(all_parsers):
    # GH#57666 / GH#61618: a per-column dtype mapping must keep the str
    # columns verbatim (leading zeros included) while the int column is
    # still parsed as numbers.
    parser = all_parsers

    data = """col1,col2,col3,col4
AB,000388907,199,0150
CD,101044572,200,0150
EF,000023607,201,0205
GH,100102040,202,0205"""

    column_dtypes = {"col2": str, "col3": int, "col4": str}
    result = parser.read_csv(StringIO(data), dtype=column_dtypes)

    assert result.shape == (4, 4)
    assert list(result.columns) == ["col1", "col2", "col3", "col4"]

    # (row label, column) -> text that must survive parsing untouched.
    zero_padded_cells = {
        (0, "col2"): "000388907",
        (2, "col2"): "000023607",
        (0, "col4"): "0150",
        (2, "col4"): "0205",
    }
    for (row, column), text in zero_padded_cells.items():
        assert result.loc[row, column] == text, f"lost zeros in {column} row {row}"

    # col3 holds 199..202 in row order and must compare equal to ints.
    for row, number in enumerate(range(199, 203)):
        assert result.loc[row, "col3"] == number
0 commit comments