
Commit 25fd687

Support pyarrow back to version 1.0.0 (#111)
1 parent: d102aa0

File tree

1 file changed: +68 −25 lines


spatialpandas/io/parquet.py

Lines changed: 68 additions & 25 deletions
@@ -29,6 +29,9 @@
 # improve pandas compatibility, based on geopandas _compat.py
 PANDAS_GE_12 = Version(pd.__version__) >= Version("1.2.0")
 
+# When we drop support for pyarrow < 5 all code related to this can be removed.
+LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")
+
 
 def _load_parquet_pandas_metadata(
     path,
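For context, a minimal standalone sketch of the version gate added above, assuming `Version` is `packaging.version.Version` (its import sits outside this diff); the `basic_kwargs` values mirror the hunks below:

import pyarrow as pa
from packaging.version import Version

# True on pyarrow 1.x through 4.x, where only the legacy ParquetDataset exists.
LEGACY_PYARROW = Version(pa.__version__) < Version("5.0.0")

if LEGACY_PYARROW:
    # Legacy dataset API: schema validation across pieces can be skipped.
    basic_kwargs = dict(validate_schema=False)
else:
    # pyarrow >= 5: opt in to the new Dataset-backed implementation.
    basic_kwargs = dict(use_legacy_dataset=False)
print(basic_kwargs)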
@@ -42,21 +45,34 @@ def _load_parquet_pandas_metadata(
         raise ValueError("Path not found: " + path)
 
     if filesystem.isdir(path):
+        if LEGACY_PYARROW:
+            basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+        else:
+            basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
         pqds = pq.ParquetDataset(
             path,
-            filesystem=filesystem,
-            #validate_schema=False,
-            use_legacy_dataset=False,
+            **basic_kwargs,
             **engine_kwargs,
         )
-        filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
-        try:
-            common_metadata = pq.read_metadata(filename)
-        except FileNotFoundError:
-            # Common metadata doesn't exist, so get metadata for first piece instead
-            filename = pathlib.Path(pqds.files[0])
-            common_metadata = pq.read_metadata(filename)
-        metadata = common_metadata.metadata
+
+        if LEGACY_PYARROW:
+            common_metadata = pqds.common_metadata
+            if common_metadata is None:
+                # Get metadata for first piece
+                piece = pqds.pieces[0]
+                metadata = piece.get_metadata().metadata
+            else:
+                metadata = pqds.common_metadata.metadata
+        else:
+            filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+            try:
+                common_metadata = pq.read_metadata(filename)
+            except FileNotFoundError:
+                # Common metadata doesn't exist, so get metadata for first piece instead
+                filename = pathlib.Path(pqds.files[0])
+                common_metadata = pq.read_metadata(filename)
+            metadata = common_metadata.metadata
     else:
         with filesystem.open(path) as f:
             pf = pq.ParquetFile(f)
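For reference, a hedged sketch of the non-legacy branch above in isolation: `pq.read_metadata` returns a `FileMetaData` whose `.metadata` is a bytes-keyed dict (the `b'pandas'` entry holds the pandas schema); the directory and file names here are hypothetical:

import pathlib
import pyarrow.parquet as pq

files = ["example_dir/part.0.parquet"]  # hypothetical list of dataset files
# Prefer the _common_metadata sidecar next to the first data file...
filename = pathlib.Path(files[0]).parent.joinpath("_common_metadata")
try:
    common_metadata = pq.read_metadata(filename)
except FileNotFoundError:
    # ...and fall back to the first piece when the sidecar is absent.
    common_metadata = pq.read_metadata(files[0])
metadata = common_metadata.metadata  # bytes-keyed dict, e.g. b'pandas'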
@@ -111,17 +127,28 @@ def read_parquet(
     engine_kwargs = engine_kwargs or {}
     filesystem = validate_coerce_filesystem(path, filesystem, storage_options)
 
+    if LEGACY_PYARROW:
+        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+    else:
+        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
     # Load using pyarrow to handle parquet files and directories across filesystems
     dataset = pq.ParquetDataset(
         path,
-        filesystem=filesystem,
-        #validate_schema=False,
-        use_legacy_dataset=False,
+        **basic_kwargs,
         **engine_kwargs,
         **kwargs,
     )
 
-    metadata = dataset.schema.pandas_metadata
+    if LEGACY_PYARROW:
+        metadata = _load_parquet_pandas_metadata(
+            path,
+            filesystem=filesystem,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
+    else:
+        metadata = dataset.schema.pandas_metadata
 
     # If columns specified, prepend index columns to it
     if columns is not None:
@@ -290,12 +317,15 @@ def _perform_read_parquet_dask(
         filesystem,
         storage_options,
     )
+    if LEGACY_PYARROW:
+        basic_kwargs = dict(filesystem=filesystem, validate_schema=False)
+    else:
+        basic_kwargs = dict(filesystem=filesystem, use_legacy_dataset=False)
+
     datasets = [
         pa.parquet.ParquetDataset(
             path,
-            filesystem=filesystem,
-            #validate_schema=False,
-            use_legacy_dataset=False,
+            **basic_kwargs,
             **engine_kwargs,
         ) for path in paths
     ]
@@ -304,7 +334,10 @@ def _perform_read_parquet_dask(
     pieces = []
     for dataset in datasets:
         # Perform natural sort on pieces so that "part.10" comes after "part.2"
-        dataset_pieces = sorted(dataset.fragments, key=lambda piece: natural_sort_key(piece.path))
+        fragments = getattr(dataset, "fragments", None)
+        if fragments is None:
+            fragments = dataset.pieces
+        dataset_pieces = sorted(fragments, key=lambda piece: natural_sort_key(piece.path))
         pieces.extend(dataset_pieces)
 
     delayed_partitions = [
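The natural sort in this hunk relies on spatialpandas' `natural_sort_key` helper; a hedged re-implementation (the real helper may differ) shows why plain lexicographic sorting would put "part.10" before "part.2":

import re

def natural_sort_key(s):
    # Split digit runs out of the string so they compare numerically.
    return [int(t) if t.isdigit() else t for t in re.split(r"(\d+)", s)]

paths = ["part.10.parquet", "part.2.parquet", "part.1.parquet"]
print(sorted(paths))                        # lexicographic: part.10 before part.2
print(sorted(paths, key=natural_sort_key))  # natural: part.1, part.2, part.10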
@@ -356,12 +389,18 @@ def _perform_read_parquet_dask(
     else:
         cols_no_index = None
 
+    if LEGACY_PYARROW:
+        files = paths
+    else:
+        files = getattr(datasets[0], "files", paths)
+
     meta = dd_read_parquet(
-        datasets[0].files[0],
+        files[0],
         columns=cols_no_index,
         filesystem=filesystem,
         engine='pyarrow',
         categories=categories,
+        ignore_metadata_file=True,
         storage_options=storage_options,
         **engine_kwargs,
     )._meta
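A hedged sketch of what the `meta` extraction above does, with a hypothetical single-file path: `ignore_metadata_file=True` (available in newer dask releases) keeps dask from consulting a `_metadata` sidecar, and `._meta` is the zero-row pandas DataFrame dask uses as a schema template:

import dask.dataframe as dd

meta = dd.read_parquet(
    "example_dir/part.0.parquet",  # hypothetical path
    engine="pyarrow",
    ignore_metadata_file=True,  # don't read a _metadata sidecar
)._meta
print(meta.dtypes)  # empty DataFrame carrying only the schema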
@@ -441,11 +480,15 @@ def _perform_read_parquet_dask(
 
 def _load_partition_bounds(pqds):
     partition_bounds = None
-    filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
-    try:
-        common_metadata = pq.read_metadata(filename)
-    except FileNotFoundError:
-        common_metadata = None
+
+    if LEGACY_PYARROW:
+        common_metadata = pqds.common_metadata
+    else:
+        filename = pathlib.Path(pqds.files[0]).parent.joinpath("_common_metadata")
+        try:
+            common_metadata = pq.read_metadata(filename)
+        except FileNotFoundError:
+            common_metadata = None
 
     if common_metadata is not None and b'spatialpandas' in common_metadata.metadata:
         spatial_metadata = json.loads(
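A hedged sketch of the check that closes this hunk: Parquet key/value metadata arrives as a bytes-keyed dict, so the `b'spatialpandas'` entry is decoded from JSON (the payload below is hypothetical, not the real spatialpandas schema):

import json

metadata = {b"spatialpandas": b'{"partition_bounds": null}'}  # hypothetical payload
if metadata is not None and b"spatialpandas" in metadata:
    spatial_metadata = json.loads(metadata[b"spatialpandas"].decode("utf-8"))
    print(spatial_metadata)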
