 # improve pandas compatibility, based on geopandas _compat.py
 PANDAS_GE_12 = Version(pd.__version__) >= Version("1.2.0")
 
+# When we drop support for pyarrow < 5 all code related to this can be removed.
+LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")
+
 
 def _load_parquet_pandas_metadata(
     path,
@@ -42,21 +45,34 @@ def _load_parquet_pandas_metadata(
         raise ValueError("Path not found: " + path)
 
     if filesystem.isdir(path):
+        if LEGACY_PYARROW:
+            basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+        else:
+            basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
         pqds = pq.ParquetDataset(
             path,
-            filesystem=filesystem,
-            #validate_schema=False,
-            use_legacy_dataset=False,
+            **basic_kwargs,
             **engine_kwargs,
         )
-        filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
-        try:
-            common_metadata = pq.read_metadata(filename)
-        except FileNotFoundError:
-            # Common metadata doesn't exist, so get metadata for first piece instead
-            filename = pathlib.Path(pqds.files[0])
-            common_metadata = pq.read_metadata(filename)
-        metadata = common_metadata.metadata
+
+        if LEGACY_PYARROW:
+            common_metadata = pqds.common_metadata
+            if common_metadata is None:
+                # Get metadata for first piece
+                piece = pqds.pieces[0]
+                metadata = piece.get_metadata().metadata
+            else:
+                metadata = pqds.common_metadata.metadata
+        else:
+            filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+            try:
+                common_metadata = pq.read_metadata(filename)
+            except FileNotFoundError:
+                # Common metadata doesn't exist, so get metadata for first piece instead
+                filename = pathlib.Path(pqds.files[0])
+                common_metadata = pq.read_metadata(filename)
+            metadata = common_metadata.metadata
     else:
         with filesystem.open(path) as f:
             pf = pq.ParquetFile(f)
@@ -111,17 +127,28 @@ def read_parquet(
     engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
 
+    if LEGACY_PYARROW:
+        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+    else:
+        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
     # Load using pyarrow to handle parquet files and directories across filesystems
     dataset = pq.ParquetDataset(
         path,
-        filesystem=filesystem,
-        #validate_schema=False,
-        use_legacy_dataset=False,
+        **basic_kwargs,
         **engine_kwargs,
         **kwargs,
     )
 
-    metadata = dataset.schema.pandas_metadata
+    if LEGACY_PYARROW:
+        metadata = _load_parquet_pandas_metadata(
+            path,
+            filesystem=filesystem,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
+    else:
+        metadata = dataset.schema.pandas_metadata
 
     # If columns specified, prepend index columns to it
     if columns is not None:
@@ -290,12 +317,15 @@ def _perform_read_parquet_dask(
         filesystem,
         storage_options,
     )
+    if LEGACY_PYARROW:
+        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+    else:
+        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
     datasets = [
         pa.parquet.ParquetDataset(
             path,
-            filesystem=filesystem,
-            #validate_schema=False,
-            use_legacy_dataset=False,
+            **basic_kwargs,
             **engine_kwargs,
         ) for path in paths
     ]
@@ -304,7 +334,10 @@ def _perform_read_parquet_dask(
     pieces = []
     for dataset in datasets:
         # Perform natural sort on pieces so that "part.10" comes after "part.2"
-        dataset_pieces = sorted(dataset.fragments, key=lambda piece: natural_sort_key(piece.path))
+        fragments = getattr(dataset, "fragments", None)
+        if fragments is None:
+            fragments = dataset.pieces
+        dataset_pieces = sorted(fragments, key=lambda piece: natural_sort_key(piece.path))
         pieces.extend(dataset_pieces)
 
     delayed_partitions = [
@@ -356,12 +389,18 @@ def _perform_read_parquet_dask(
     else:
         cols_no_index = None
 
+    if LEGACY_PYARROW:
+        files = paths
+    else:
+        files = getattr(datasets[0], "files", paths)
+
     meta = dd_read_parquet(
-        datasets[0].files[0],
+        files[0],
         columns=cols_no_index,
         filesystem=filesystem,
         engine='pyarrow',
         categories=categories,
+        ignore_metadata_file=True,
         storage_options=storage_options,
         **engine_kwargs,
     )._meta
@@ -441,11 +480,15 @@ def _perform_read_parquet_dask(
441480
442481def _load_partition_bounds (pqds ):
443482 partition_bounds = None
444- filename = pathlib .Path (pqds .files [0 ]).parent .joinpath ("_common_metadata" )
445- try :
446- common_metadata = pq .read_metadata (filename )
447- except FileNotFoundError :
448- common_metadata = None
483+
484+ if LEGACY_PYARROW :
485+ common_metadata = pqds .common_metadata
486+ else :
487+ filename = pathlib .Path (pqds .files [0 ]).parent .joinpath ("_common_metadata" )
488+ try :
489+ common_metadata = pq .read_metadata (filename )
490+ except FileNotFoundError :
491+ common_metadata = None
449492
450493 if common_metadata is not None and b'spatialpandas' in common_metadata .metadata :
451494 spatial_metadata = json .loads (
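
For context on the recurring pattern in this diff: a module-level flag derived from `pa.__version__` selects the `ParquetDataset` keyword arguments that each pyarrow line understands. Below is a minimal standalone sketch of that idea, not the module's actual code; it assumes `packaging` provides the `Version` used for the comparison, uses a placeholder path, and targets the pyarrow versions this change supports (note that `use_legacy_dataset` itself was removed again in much later pyarrow releases).

```python
# Sketch only: version-gated ParquetDataset kwargs, mirroring the LEGACY_PYARROW flag above.
import pyarrow as pa
import pyarrow.parquet as pq
from packaging.version import Version  # assumed import; the real module may source Version differently

LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")

if LEGACY_PYARROW:
    # pyarrow < 5: the legacy ParquetDataset accepts validate_schema.
    basic_kwargs = dict(validate_schema=False)
else:
    # pyarrow >= 5: opt in to the new (non-legacy) dataset implementation.
    basic_kwargs = dict(use_legacy_dataset=False)

# "example_data/" is a placeholder for a parquet file or directory path.
dataset = pq.ParquetDataset("example_data/", **basic_kwargs)
```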