diff --git a/dask_geopandas/clip.py b/dask_geopandas/clip.py index a30d01db..084d40ae 100644 --- a/dask_geopandas/clip.py +++ b/dask_geopandas/clip.py @@ -38,7 +38,10 @@ def clip(gdf, mask, keep_geom_type=False): } divisions = [None] * (len(dsk) + 1) graph = HighLevelGraph.from_collections(name, dsk, dependencies=[gdf]) - result = GeoDataFrame(graph, name, gdf._meta, tuple(divisions)) + if isinstance(gdf, GeoDataFrame): + result = GeoDataFrame(graph, name, gdf._meta, tuple(divisions)) + elif isinstance(gdf, GeoSeries): + result = GeoSeries(graph, name, gdf._meta, tuple(divisions)) result.spatial_partitions = new_spatial_partitions return result diff --git a/dask_geopandas/core.py b/dask_geopandas/core.py index 1a910217..3ce3051f 100644 --- a/dask_geopandas/core.py +++ b/dask_geopandas/core.py @@ -17,6 +17,8 @@ from .morton_distance import _morton_distance from .geohash import _geohash +import dask_geopandas + def _set_crs(df, crs, allow_override): """Return a new object with crs set to ``crs``""" @@ -449,6 +451,10 @@ def geohash(self, string=True, p=12): return geohashes + @derived_from(geopandas.GeoDataFrame) + def clip(self, mask, keep_geom_type=False): + return dask_geopandas.clip(self, mask=mask, keep_geom_type=keep_geom_type) + class GeoSeries(_Frame, dd.core.Series): """Parallel GeoPandas GeoSeries @@ -579,6 +585,39 @@ def union(block): ) return aggregated.set_crs(self.crs) + def sjoin(self, df, how="inner", predicate="intersects"): + """ + Spatial join of two GeoDataFrames. + + Parameters + ---------- + df : geopandas or dask_geopandas GeoDataFrame + If a geopandas.GeoDataFrame is passed, it is considered as a + dask_geopandas.GeoDataFrame with 1 partition (without spatial + partitioning information). + how : string, default 'inner' + The type of join. Currently only 'inner' is supported. + predicate : string, default 'intersects' + Binary predicate how to match corresponding rows of the left and right + GeoDataFrame. 
Possible values: 'contains', 'contains_properly', + 'covered_by', 'covers', 'crosses', 'intersects', 'overlaps', + 'touches', 'within'. + + Returns + ------- + dask_geopandas.GeoDataFrame + + Notes + ----- + If both the left and right GeoDataFrame have spatial partitioning + information available (the ``spatial_partitions`` attribute is set), + the output partitions are determined based on intersection of the + spatial partitions. In all other cases, the output partitions are + all combinations (cartesian/cross product) of all input partitions + of the left and right GeoDataFrame. + """ + return dask_geopandas.sjoin(self, df, how=how, predicate=predicate) + from_geopandas = dd.from_pandas diff --git a/dask_geopandas/sjoin.py b/dask_geopandas/sjoin.py index 8515dbc3..cdb4d191 100644 --- a/dask_geopandas/sjoin.py +++ b/dask_geopandas/sjoin.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import geopandas @@ -7,7 +9,7 @@ from .core import from_geopandas, GeoDataFrame -def sjoin(left, right, how="inner", op="intersects"): +def sjoin(left, right, how="inner", predicate="intersects", **kwargs): """ Spatial join of two GeoDataFrames. @@ -19,7 +21,7 @@ def sjoin(left, right, how="inner", op="intersects"): partitioning information). how : string, default 'inner' The type of join. Currently only 'inner' is supported. - op : string, default 'intersects' + predicate : string, default 'intersects' Binary predicate how to match corresponding rows of the left and right GeoDataFrame. Possible values: 'contains', 'contains_properly', 'covered_by', 'covers', 'crosses', 'intersects', 'overlaps', @@ -38,6 +40,14 @@ def sjoin(left, right, how="inner", op="intersects"): all combinations (cartesian/cross product) of all input partition of the left and right GeoDataFrame. """ + if "op" in kwargs: + predicate = kwargs.pop("op") + deprecation_message = ( + "The `op` parameter is deprecated and will be removed" + " in a future release. 
Please use the `predicate` parameter" + " instead." + ) + warnings.warn(deprecation_message, FutureWarning, stacklevel=2) if how != "inner": raise NotImplementedError("Only how='inner' is supported right now") @@ -46,8 +56,8 @@ def sjoin(left, right, how="inner", op="intersects"): if isinstance(right, geopandas.GeoDataFrame): right = from_geopandas(right, npartitions=1) - name = "sjoin-" + tokenize(left, right, how, op) - meta = geopandas.sjoin(left._meta, right._meta, how=how, op=op) + name = "sjoin-" + tokenize(left, right, how, predicate) + meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate) if left.spatial_partitions is not None and right.spatial_partitions is not None: # Spatial partitions are known -> use them to trim down the list of @@ -73,7 +83,13 @@ def sjoin(left, right, how="inner", op="intersects"): dsk = {} new_spatial_partitions = [] for i, (l, r) in enumerate(zip(parts_left, parts_right)): - dsk[(name, i)] = (geopandas.sjoin, (left._name, l), (right._name, r), how, op) + dsk[(name, i)] = ( + geopandas.sjoin, + (left._name, l), + (right._name, r), + how, + predicate, + ) # TODO preserve spatial partitions of the output if only left has spatial # partitions if using_spatial_partitions: diff --git a/doc/requirements.txt b/doc/requirements.txt index 5c5e08e3..dc3dc00c 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,4 +1,4 @@ -geopandas +geopandas>=0.10 numpydoc sphinx-book-theme myst-nb diff --git a/doc/source/docs/reference/geodataframe.rst b/doc/source/docs/reference/geodataframe.rst index 96ef988f..e3e251a4 100644 --- a/doc/source/docs/reference/geodataframe.rst +++ b/doc/source/docs/reference/geodataframe.rst @@ -47,6 +47,22 @@ Aggregating and exploding GeoDataFrame.explode +Spatial joins +------------- + +.. autosummary:: + :toctree: api/ + + GeoDataFrame.sjoin + +Overlay operations +------------------ + +.. 
autosummary:: + :toctree: api/ + + GeoDataFrame.clip + Indexing -------- diff --git a/doc/source/docs/reference/geoseries.rst b/doc/source/docs/reference/geoseries.rst index e62a1fe1..2234d496 100644 --- a/doc/source/docs/reference/geoseries.rst +++ b/doc/source/docs/reference/geoseries.rst @@ -124,6 +124,14 @@ Missing values GeoSeries.fillna GeoSeries.isna +Overlay operations +------------------ + +.. autosummary:: + :toctree: api/ + + GeoSeries.clip + Indexing -------- diff --git a/setup.py b/setup.py index 59f3be4a..a0236c52 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ import versioneer install_requires = [ - "geopandas", + "geopandas>=0.10", "dask>=2.18.0,!=2021.05.1", "distributed>=2.18.0,!=2021.05.1", "numba", diff --git a/tests/test_clip.py b/tests/test_clip.py index 1b3e3211..946a032e 100644 --- a/tests/test_clip.py +++ b/tests/test_clip.py @@ -1,5 +1,5 @@ import geopandas -from geopandas.testing import assert_geodataframe_equal +from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal import pytest import dask_geopandas from .test_core import geodf_points # noqa: F401 @@ -31,3 +31,13 @@ def test_clip_dask_mask(geodf_points): # noqa: F811 NotImplementedError, match=r"Mask cannot be a Dask GeoDataFrame or GeoSeries." 
): dask_geopandas.clip(dask_obj, mask) + + +def test_clip_geoseries(geodf_points): # noqa: F811 + dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2) + dask_obj.calculate_spatial_partitions() + mask = geodf_points.iloc[:1] + mask["geometry"] = mask["geometry"].buffer(2) + expected = geopandas.clip(geodf_points.geometry, mask) + result = dask_geopandas.clip(dask_obj.geometry, mask).compute() + assert_geoseries_equal(expected, result) diff --git a/tests/test_core.py b/tests/test_core.py index ae54f461..fa66f9ca 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -527,6 +527,36 @@ def test_copy_none_spatial_partitions(geoseries_points): assert ddf_copy.spatial_partitions is None +def test_sjoin(): + # test only the method, functionality tested in test_sjoin.py + df_points = geopandas.read_file(geopandas.datasets.get_path("naturalearth_cities")) + ddf_points = dask_geopandas.from_geopandas(df_points, npartitions=4) + + df_polygons = geopandas.read_file( + geopandas.datasets.get_path("naturalearth_lowres") + ) + expected = df_points.sjoin(df_polygons, predicate="within", how="inner") + expected = expected.sort_index() + + result = ddf_points.sjoin(df_polygons, predicate="within", how="inner") + assert_geodataframe_equal(expected, result.compute().sort_index()) + + +def test_clip(geodf_points): + # test only the method, functionality tested in test_clip.py + dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2) + dask_obj.calculate_spatial_partitions() + mask = geodf_points.iloc[:1] + mask["geometry"] = mask["geometry"].buffer(2) + expected = geodf_points.clip(mask) + result = dask_obj.clip(mask).compute() + assert_geodataframe_equal(expected, result) + + expected = geodf_points.geometry.clip(mask) + result = dask_obj.geometry.clip(mask).compute() + assert_geoseries_equal(expected, result) + + class TestDissolve: def setup_method(self): self.world = geopandas.read_file( diff --git a/tests/test_sjoin.py b/tests/test_sjoin.py index 
a8054d80..127f523a 100644 --- a/tests/test_sjoin.py +++ b/tests/test_sjoin.py @@ -1,3 +1,5 @@ +import pytest + import geopandas from geopandas.testing import assert_geodataframe_equal @@ -13,24 +15,36 @@ def test_sjoin_dask_geopandas(): ) ddf_polygons = dask_geopandas.from_geopandas(df_polygons, npartitions=4) - expected = geopandas.sjoin(df_points, df_polygons, op="within", how="inner") + expected = geopandas.sjoin(df_points, df_polygons, predicate="within", how="inner") expected = expected.sort_index() # dask / geopandas - result = dask_geopandas.sjoin(ddf_points, df_polygons, op="within", how="inner") + result = dask_geopandas.sjoin( + ddf_points, df_polygons, predicate="within", how="inner" + ) assert_geodataframe_equal(expected, result.compute().sort_index()) # geopandas / dask - result = dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner") + result = dask_geopandas.sjoin( + df_points, ddf_polygons, predicate="within", how="inner" + ) assert_geodataframe_equal(expected, result.compute().sort_index()) # dask / dask - result = dask_geopandas.sjoin(ddf_points, ddf_polygons, op="within", how="inner") + result = dask_geopandas.sjoin( + ddf_points, ddf_polygons, predicate="within", how="inner" + ) assert_geodataframe_equal(expected, result.compute().sort_index()) # with spatial_partitions ddf_points.calculate_spatial_partitions() ddf_polygons.calculate_spatial_partitions() - result = dask_geopandas.sjoin(ddf_points, ddf_polygons, op="within", how="inner") + result = dask_geopandas.sjoin( + ddf_points, ddf_polygons, predicate="within", how="inner" + ) assert result.spatial_partitions is not None assert_geodataframe_equal(expected, result.compute().sort_index()) + + # check warning + with pytest.warns(FutureWarning, match="The `op` parameter is deprecated"): + dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner")