Skip to content

ENH: match geopandas sjoin and clip API #149

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jan 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion dask_geopandas/clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ def clip(gdf, mask, keep_geom_type=False):
}
divisions = [None] * (len(dsk) + 1)
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[gdf])
result = GeoDataFrame(graph, name, gdf._meta, tuple(divisions))
if isinstance(gdf, GeoDataFrame):
result = GeoDataFrame(graph, name, gdf._meta, tuple(divisions))
elif isinstance(gdf, GeoSeries):
result = GeoSeries(graph, name, gdf._meta, tuple(divisions))
result.spatial_partitions = new_spatial_partitions

return result
39 changes: 39 additions & 0 deletions dask_geopandas/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from .morton_distance import _morton_distance
from .geohash import _geohash

import dask_geopandas


def _set_crs(df, crs, allow_override):
"""Return a new object with crs set to ``crs``"""
Expand Down Expand Up @@ -449,6 +451,10 @@ def geohash(self, string=True, p=12):

return geohashes

@derived_from(geopandas.GeoDataFrame)
def clip(self, mask, keep_geom_type=False):
    # Method form of the module-level ``dask_geopandas.clip``: this object is
    # passed as the data to clip and both options are forwarded by keyword.
    # No docstring is written here; the ``derived_from`` decorator above is
    # pointed at the geopandas.GeoDataFrame method of the same name.
    clipped = dask_geopandas.clip(
        self,
        mask=mask,
        keep_geom_type=keep_geom_type,
    )
    return clipped


class GeoSeries(_Frame, dd.core.Series):
"""Parallel GeoPandas GeoSeries
Expand Down Expand Up @@ -579,6 +585,39 @@ def union(block):
)
return aggregated.set_crs(self.crs)

def sjoin(self, df, how="inner", predicate="intersects"):
    """
    Spatially join this GeoDataFrame with another GeoDataFrame.

    Method counterpart of ``dask_geopandas.sjoin``: this object is used as
    the left side of the join and ``df`` as the right side.

    Parameters
    ----------
    df : geopandas or dask_geopandas GeoDataFrame
        The right side of the join. If a geopandas.GeoDataFrame is passed,
        it is considered as a dask_geopandas.GeoDataFrame with 1 partition
        (without spatial partitioning information).
    how : string, default 'inner'
        The type of join. Currently only 'inner' is supported.
    predicate : string, default 'intersects'
        Binary predicate used to match rows of the left GeoDataFrame with
        rows of the right GeoDataFrame. Possible values: 'contains',
        'contains_properly', 'covered_by', 'covers', 'crosses',
        'intersects', 'overlaps', 'touches', 'within'.

    Returns
    -------
    dask_geopandas.GeoDataFrame

    Notes
    -----
    If both the left and right GeoDataFrame have spatial partitioning
    information available (the ``spatial_partitions`` attribute is set),
    the output partitions are determined based on the intersection of the
    spatial partitions. In all other cases, the output partitions are
    all combinations (cartesian/cross product) of all input partitions
    of the left and right GeoDataFrame.
    """
    joined = dask_geopandas.sjoin(self, df, how=how, predicate=predicate)
    return joined


from_geopandas = dd.from_pandas

Expand Down
26 changes: 21 additions & 5 deletions dask_geopandas/sjoin.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import warnings

import numpy as np
import geopandas

Expand All @@ -7,7 +9,7 @@
from .core import from_geopandas, GeoDataFrame


def sjoin(left, right, how="inner", op="intersects"):
def sjoin(left, right, how="inner", predicate="intersects", **kwargs):
"""
Spatial join of two GeoDataFrames.

Expand All @@ -19,7 +21,7 @@ def sjoin(left, right, how="inner", op="intersects"):
partitioning information).
how : string, default 'inner'
The type of join. Currently only 'inner' is supported.
op : string, default 'intersects'
predicate : string, default 'intersects'
Binary predicate how to match corresponding rows of the left and right
GeoDataFrame. Possible values: 'contains', 'contains_properly',
'covered_by', 'covers', 'crosses', 'intersects', 'overlaps',
Expand All @@ -38,6 +40,14 @@ def sjoin(left, right, how="inner", op="intersects"):
all combinations (cartesian/cross product) of all input partition
of the left and right GeoDataFrame.
"""
if "op" in kwargs:
predicate = kwargs.pop("op")
deprecation_message = (
"The `op` parameter is deprecated and will be removed"
" in a future release. Please use the `predicate` parameter"
" instead."
)
warnings.warn(deprecation_message, FutureWarning, stacklevel=2)
if how != "inner":
raise NotImplementedError("Only how='inner' is supported right now")

Expand All @@ -46,8 +56,8 @@ def sjoin(left, right, how="inner", op="intersects"):
if isinstance(right, geopandas.GeoDataFrame):
right = from_geopandas(right, npartitions=1)

name = "sjoin-" + tokenize(left, right, how, op)
meta = geopandas.sjoin(left._meta, right._meta, how=how, op=op)
name = "sjoin-" + tokenize(left, right, how, predicate)
meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate)

if left.spatial_partitions is not None and right.spatial_partitions is not None:
# Spatial partitions are known -> use them to trim down the list of
Expand All @@ -73,7 +83,13 @@ def sjoin(left, right, how="inner", op="intersects"):
dsk = {}
new_spatial_partitions = []
for i, (l, r) in enumerate(zip(parts_left, parts_right)):
dsk[(name, i)] = (geopandas.sjoin, (left._name, l), (right._name, r), how, op)
dsk[(name, i)] = (
geopandas.sjoin,
(left._name, l),
(right._name, r),
how,
predicate,
)
# TODO preserve spatial partitions of the output if only left has spatial
# partitions
if using_spatial_partitions:
Expand Down
2 changes: 1 addition & 1 deletion doc/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
geopandas
geopandas>=0.10
numpydoc
sphinx-book-theme
myst-nb
Expand Down
16 changes: 16 additions & 0 deletions doc/source/docs/reference/geodataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,22 @@ Aggregating and exploding

GeoDataFrame.explode

Spatial joins
-------------

.. autosummary::
:toctree: api/

GeoDataFrame.sjoin

Overlay operations
------------------

.. autosummary::
:toctree: api/

GeoDataFrame.clip

Indexing
--------

Expand Down
8 changes: 8 additions & 0 deletions doc/source/docs/reference/geoseries.rst
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,14 @@ Missing values
GeoSeries.fillna
GeoSeries.isna

Overlay operations
------------------

.. autosummary::
:toctree: api/

GeoSeries.clip

Indexing
--------

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import versioneer

install_requires = [
"geopandas",
"geopandas>=0.10",
"dask>=2.18.0,!=2021.05.1",
"distributed>=2.18.0,!=2021.05.1",
"numba",
Expand Down
12 changes: 11 additions & 1 deletion tests/test_clip.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import geopandas
from geopandas.testing import assert_geodataframe_equal
from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal
import pytest
import dask_geopandas
from .test_core import geodf_points # noqa: F401
Expand Down Expand Up @@ -31,3 +31,13 @@ def test_clip_dask_mask(geodf_points): # noqa: F811
NotImplementedError, match=r"Mask cannot be a Dask GeoDataFrame or GeoSeries."
):
dask_geopandas.clip(dask_obj, mask)


def test_clip_geoseries(geodf_points):  # noqa: F811
    """Clipping a dask GeoSeries gives the same result as ``geopandas.clip``."""
    dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    dask_obj.calculate_spatial_partitions()
    # Take an explicit copy before assigning the buffered geometry: setting a
    # column on a bare ``iloc`` slice is chained assignment and can emit
    # pandas' SettingWithCopyWarning.
    mask = geodf_points.iloc[:1].copy()
    mask["geometry"] = mask["geometry"].buffer(2)
    expected = geopandas.clip(geodf_points.geometry, mask)
    result = dask_geopandas.clip(dask_obj.geometry, mask).compute()
    assert_geoseries_equal(expected, result)
30 changes: 30 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,36 @@ def test_copy_none_spatial_partitions(geoseries_points):
assert ddf_copy.spatial_partitions is None


def test_sjoin():
    """The ``sjoin`` method on a dask GeoDataFrame matches geopandas' result."""
    # test only the method, functionality tested in test_sjoin.py
    cities_path = geopandas.datasets.get_path("naturalearth_cities")
    countries_path = geopandas.datasets.get_path("naturalearth_lowres")
    points = geopandas.read_file(cities_path)
    polygons = geopandas.read_file(countries_path)
    dask_points = dask_geopandas.from_geopandas(points, npartitions=4)

    # Reference result from plain geopandas, sorted so row order is comparable.
    expected = points.sjoin(polygons, predicate="within", how="inner").sort_index()

    joined = dask_points.sjoin(polygons, predicate="within", how="inner")
    computed = joined.compute().sort_index()
    assert_geodataframe_equal(expected, computed)


def test_clip(geodf_points):
    """The ``clip`` method matches geopandas for GeoDataFrame and GeoSeries."""
    # test only the method, functionality tested in test_clip.py
    dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
    dask_obj.calculate_spatial_partitions()
    # Take an explicit copy before assigning the buffered geometry: setting a
    # column on a bare ``iloc`` slice is chained assignment and can emit
    # pandas' SettingWithCopyWarning.
    mask = geodf_points.iloc[:1].copy()
    mask["geometry"] = mask["geometry"].buffer(2)

    # GeoDataFrame.clip
    expected = geodf_points.clip(mask)
    result = dask_obj.clip(mask).compute()
    assert_geodataframe_equal(expected, result)

    # GeoSeries.clip
    expected = geodf_points.geometry.clip(mask)
    result = dask_obj.geometry.clip(mask).compute()
    assert_geoseries_equal(expected, result)


class TestDissolve:
def setup_method(self):
self.world = geopandas.read_file(
Expand Down
24 changes: 19 additions & 5 deletions tests/test_sjoin.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

import geopandas
from geopandas.testing import assert_geodataframe_equal

Expand All @@ -13,24 +15,36 @@ def test_sjoin_dask_geopandas():
)
ddf_polygons = dask_geopandas.from_geopandas(df_polygons, npartitions=4)

expected = geopandas.sjoin(df_points, df_polygons, op="within", how="inner")
expected = geopandas.sjoin(df_points, df_polygons, predicate="within", how="inner")
expected = expected.sort_index()

# dask / geopandas
result = dask_geopandas.sjoin(ddf_points, df_polygons, op="within", how="inner")
result = dask_geopandas.sjoin(
ddf_points, df_polygons, predicate="within", how="inner"
)
assert_geodataframe_equal(expected, result.compute().sort_index())

# geopandas / dask
result = dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner")
result = dask_geopandas.sjoin(
df_points, ddf_polygons, predicate="within", how="inner"
)
assert_geodataframe_equal(expected, result.compute().sort_index())

# dask / dask
result = dask_geopandas.sjoin(ddf_points, ddf_polygons, op="within", how="inner")
result = dask_geopandas.sjoin(
ddf_points, ddf_polygons, predicate="within", how="inner"
)
assert_geodataframe_equal(expected, result.compute().sort_index())

# with spatial_partitions
ddf_points.calculate_spatial_partitions()
ddf_polygons.calculate_spatial_partitions()
result = dask_geopandas.sjoin(ddf_points, ddf_polygons, op="within", how="inner")
result = dask_geopandas.sjoin(
ddf_points, ddf_polygons, predicate="within", how="inner"
)
assert result.spatial_partitions is not None
assert_geodataframe_equal(expected, result.compute().sort_index())

# check warning
with pytest.warns(FutureWarning, match="The `op` parameter is deprecated"):
dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner")