# improve pandas compatibility, based on geopandas _compat.py
PANDAS_GE_12 = Version(pd.__version__) >= Version("1.2.0")

+# When we drop support for pyarrow < 5 all code related to this can be removed.
+LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")
+

def _load_parquet_pandas_metadata(
    path,
@@ -42,21 +45,34 @@ def _load_parquet_pandas_metadata(
        raise ValueError("Path not found: " + path)

    if filesystem.isdir(path):
+        if LEGACY_PYARROW:
+            basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+        else:
+            basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
        pqds = pq.ParquetDataset(
            path,
-            filesystem=filesystem,
-            #validate_schema=False,
-            use_legacy_dataset=False,
+            **basic_kwargs,
            **engine_kwargs,
        )
-        filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
-        try:
-            common_metadata = pq.read_metadata(filename)
-        except FileNotFoundError:
-            # Common metadata doesn't exist, so get metadata for first piece instead
-            filename = pathlib.Path(pqds.files[0])
-            common_metadata = pq.read_metadata(filename)
-        metadata = common_metadata.metadata
+
+        if LEGACY_PYARROW:
+            common_metadata = pqds.common_metadata
+            if common_metadata is None:
+                # Get metadata for first piece
+                piece = pqds.pieces[0]
+                metadata = piece.get_metadata().metadata
+            else:
+                metadata = pqds.common_metadata.metadata
+        else:
+            filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+            try:
+                common_metadata = pq.read_metadata(filename)
+            except FileNotFoundError:
+                # Common metadata doesn't exist, so get metadata for first piece instead
+                filename = pathlib.Path(pqds.files[0])
+                common_metadata = pq.read_metadata(filename)
+            metadata = common_metadata.metadata
    else:
        with filesystem.open(path) as f:
            pf = pq.ParquetFile(f)
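A minimal sketch of the version gate introduced above, assuming `Version` comes from `packaging.version` as in geopandas' `_compat`; `open_dataset` is a hypothetical helper for illustration only, not part of this module, and the real code additionally threads fsspec filesystems through `validate_coerce_filesystem`:

```python
import pyarrow as pa
import pyarrow.parquet as pq
from packaging.version import Version

# pyarrow < 5 still uses the legacy ParquetDataset implementation, which
# accepts validate_schema; newer versions accept use_legacy_dataset instead.
LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")


def open_dataset(path, filesystem=None, **engine_kwargs):
    """Hypothetical helper: open a ParquetDataset with version-appropriate kwargs."""
    if LEGACY_PYARROW:
        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
    else:
        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
    return pq.ParquetDataset(path, **basic_kwargs, **engine_kwargs)
```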
@@ -111,17 +127,28 @@ def read_parquet(
    engine_kwargs = engine_kwargs or {}
    filesystem = validate_coerce_filesystem(path, filesystem, storage_options)

+    if LEGACY_PYARROW:
+        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+    else:
+        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
    # Load using pyarrow to handle parquet files and directories across filesystems
    dataset = pq.ParquetDataset(
        path,
-        filesystem=filesystem,
-        #validate_schema=False,
-        use_legacy_dataset=False,
+        **basic_kwargs,
        **engine_kwargs,
        **kwargs,
    )

-    metadata = dataset.schema.pandas_metadata
+    if LEGACY_PYARROW:
+        metadata = _load_parquet_pandas_metadata(
+            path,
+            filesystem=filesystem,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
+    else:
+        metadata = dataset.schema.pandas_metadata

    # If columns specified, prepend index columns to it
    if columns is not None:
@@ -290,12 +317,15 @@ def _perform_read_parquet_dask(
        filesystem,
        storage_options,
    )
+    if LEGACY_PYARROW:
+        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+    else:
+        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
    datasets = [
        pa.parquet.ParquetDataset(
            path,
-            filesystem=filesystem,
-            #validate_schema=False,
-            use_legacy_dataset=False,
+            **basic_kwargs,
            **engine_kwargs,
        ) for path in paths
    ]
@@ -304,7 +334,10 @@ def _perform_read_parquet_dask(
    pieces = []
    for dataset in datasets:
        # Perform natural sort on pieces so that "part.10" comes after "part.2"
-        dataset_pieces = sorted(dataset.fragments, key=lambda piece: natural_sort_key(piece.path))
+        fragments = getattr(dataset, "fragments", None)
+        if fragments is None:
+            fragments = dataset.pieces
+        dataset_pieces = sorted(fragments, key=lambda piece: natural_sort_key(piece.path))
        pieces.extend(dataset_pieces)

    delayed_partitions = [
@@ -356,12 +389,18 @@ def _perform_read_parquet_dask(
    else:
        cols_no_index = None

+    if LEGACY_PYARROW:
+        files = paths
+    else:
+        files = getattr(datasets[0], "files", paths)
+
    meta = dd_read_parquet(
-        datasets[0].files[0],
+        files[0],
        columns=cols_no_index,
        filesystem=filesystem,
        engine='pyarrow',
        categories=categories,
+        ignore_metadata_file=True,
        storage_options=storage_options,
        **engine_kwargs,
    )._meta
@@ -441,11 +480,15 @@ def _perform_read_parquet_dask(

def _load_partition_bounds(pqds):
    partition_bounds = None
-    filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
-    try:
-        common_metadata = pq.read_metadata(filename)
-    except FileNotFoundError:
-        common_metadata = None
+
+    if LEGACY_PYARROW:
+        common_metadata = pqds.common_metadata
+    else:
+        filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+        try:
+            common_metadata = pq.read_metadata(filename)
+        except FileNotFoundError:
+            common_metadata = None

    if common_metadata is not None and b'spatialpandas' in common_metadata.metadata:
        spatial_metadata = json.loads(
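A minimal sketch of the `_common_metadata`-with-fallback lookup used on newer pyarrow in the hunks above, assuming local paths rather than the fsspec filesystems the real code handles; `load_pandas_metadata` and `first_file` are illustrative names standing in for the module's own logic and `pqds.files[0]`:

```python
import json
import pathlib
import pyarrow.parquet as pq


def load_pandas_metadata(first_file):
    """Illustrative only: read the pandas metadata for a partitioned dataset."""
    # Prefer the dataset-level _common_metadata file written alongside the parts.
    filename = pathlib.Path(first_file).parent.joinpath("_common_metadata")
    try:
        common_metadata = pq.read_metadata(filename)
    except FileNotFoundError:
        # No _common_metadata file, so fall back to the first piece's own footer.
        common_metadata = pq.read_metadata(first_file)
    metadata = common_metadata.metadata  # dict mapping bytes keys to bytes values
    return json.loads(metadata[b"pandas"]) if metadata and b"pandas" in metadata else None
```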
0 commit comments