From be1e0f9bdaad5b5dbbab86f5cc1da6e095d92dc9 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 11:35:38 +0900 Subject: [PATCH 1/5] Remove unvetted DataTree methods As [discussed](https://docs.google.com/presentation/d/1zBjEsihBhK_U972jxHwaAZBbzS1-hd3aDLnO9uu2Ob4/edit#slide=id.g3087b787633_13_0) in the last DataTree meeting, this PR deletes the many Dataset methods that were copied onto DataTree without unit tests, along with a few that are not implemented properly yet, e.g., 1. Arithmetic methods were removed, because `DataTree + Dataset` should probably raise an error. 2. Indexing and aggregation methods were removed, because these should allow for dimensions that are missing only on some nodes. 3. The untested `map_over_subtree_inplace` and `render` methods were removed. 4. A few other methods (e.g., `merge` and `plot`) that were only implemented by raising `NotImplementedError` are entirely removed instead. --- xarray/core/datatree.py | 58 ----------------------------------- xarray/tests/test_datatree.py | 35 ++++++++++++++------- 2 files changed, 24 insertions(+), 69 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 52d44bec96f..bc04b855a4c 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -24,12 +24,6 @@ check_isomorphic, map_over_subtree, ) -from xarray.core.datatree_ops import ( - DataTreeArithmeticMixin, - MappedDatasetMethodsMixin, - MappedDataWithCoords, -) -from xarray.core.datatree_render import RenderDataTree from xarray.core.formatting import datatree_repr, dims_and_coords_repr from xarray.core.formatting_html import ( datatree_repr as datatree_repr_html, @@ -404,9 +398,6 @@ def map( # type: ignore[override] class DataTree( NamedNode["DataTree"], - MappedDatasetMethodsMixin, - MappedDataWithCoords, - DataTreeArithmeticMixin, TreeAttrAccessMixin, Mapping[str, "DataArray | DataTree"], ): @@ -1413,34 +1404,6 @@ def map_over_subtree( # TODO fix this typing error return 
map_over_subtree(func)(self, *args, **kwargs) - def map_over_subtree_inplace( - self, - func: Callable, - *args: Iterable[Any], - **kwargs: Any, - ) -> None: - """ - Apply a function to every dataset in this subtree, updating data in place. - - Parameters - ---------- - func : callable - Function to apply to datasets with signature: - `func(node.dataset, *args, **kwargs) -> Dataset`. - - Function will not be applied to any nodes without datasets, - *args : tuple, optional - Positional arguments passed on to `func`. - **kwargs : Any - Keyword arguments passed on to `func`. - """ - - # TODO if func fails on some node then the previous nodes will still have been updated... - - for node in self.subtree: - if node.has_data: - node.dataset = func(node.dataset, *args, **kwargs) - def pipe( self, func: Callable | tuple[Callable, str], *args: Any, **kwargs: Any ) -> Any: @@ -1501,26 +1464,8 @@ def pipe( args = (self,) + args return func(*args, **kwargs) - def render(self): - """Print tree structure, including any data stored at each node.""" - for pre, fill, node in RenderDataTree(self): - print(f"{pre}DataTree('{self.name}')") - for ds_line in repr(node.dataset)[1:]: - print(f"{fill}{ds_line}") - - def merge(self, datatree: DataTree) -> DataTree: - """Merge all the leaves of a second DataTree into this one.""" - raise NotImplementedError - - def merge_child_nodes(self, *paths, new_path: T_Path) -> DataTree: - """Merge a set of child nodes into a single new node.""" - raise NotImplementedError - # TODO some kind of .collapse() or .flatten() method to merge a subtree - def to_dataarray(self) -> DataArray: - return self.dataset.to_dataarray() - @property def groups(self): """Return all netCDF4 groups in the tree, given as a tuple of path-like strings.""" @@ -1655,6 +1600,3 @@ def to_zarr( compute=compute, **kwargs, ) - - def plot(self): - raise NotImplementedError diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 30934f83c63..bd0b6c34e7d 100644 
--- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -668,10 +668,11 @@ def test_modify(self): actual.coords["x"] = ("x", [-1]) assert_identical(actual, dt) # should not be modified - actual = dt.copy() - del actual.coords["b"] - expected = dt.reset_coords("b", drop=True) - assert_identical(expected, actual) + # TODO: re-enable after implementing reset_coords() + # actual = dt.copy() + # del actual.coords["b"] + # expected = dt.reset_coords("b", drop=True) + # assert_identical(expected, actual) with pytest.raises(KeyError): del dt.coords["not_found"] @@ -679,14 +680,15 @@ def test_modify(self): with pytest.raises(KeyError): del dt.coords["foo"] - actual = dt.copy(deep=True) - actual.coords.update({"c": 11}) - expected = dt.assign_coords({"c": 11}) - assert_identical(expected, actual) + # TODO: re-enable after implementing assign_coords() + # actual = dt.copy(deep=True) + # actual.coords.update({"c": 11}) + # expected = dt.assign_coords({"c": 11}) + # assert_identical(expected, actual) - # regression test for GH3746 - del actual.coords["x"] - assert "x" not in actual.xindexes + # # regression test for GH3746 + # del actual.coords["x"] + # assert "x" not in actual.xindexes # test that constructors can also handle the `DataTreeCoordinates` object ds2 = Dataset(coords=dt.coords) @@ -968,6 +970,7 @@ def test_ipython_key_completions(self, create_test_datatree): var_keys = list(dt.variables.keys()) assert all(var_key in key_completions for var_key in var_keys) + @pytest.mark.xfail(reason="sel not implemented yet") def test_operation_with_attrs_but_no_data(self): # tests bug from xarray-datatree GH262 xs = xr.Dataset({"testvar": xr.DataArray(np.ones((2, 3)))}) @@ -1557,6 +1560,7 @@ def test_filter(self): class TestDSMethodInheritance: + @pytest.mark.xfail(reason="isel not implemented yet") def test_dataset_method(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}) dt = DataTree.from_dict( @@ -1576,6 +1580,7 @@ def test_dataset_method(self): result = 
dt.isel(x=1) assert_equal(result, expected) + @pytest.mark.xfail(reason="reduce methods not implemented yet") def test_reduce_method(self): ds = xr.Dataset({"a": ("x", [False, True, False])}) dt = DataTree.from_dict({"/": ds, "/results": ds}) @@ -1585,6 +1590,7 @@ def test_reduce_method(self): result = dt.any() assert_equal(result, expected) + @pytest.mark.xfail(reason="reduce methods not implemented yet") def test_nan_reduce_method(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}) dt = DataTree.from_dict({"/": ds, "/results": ds}) @@ -1594,6 +1600,7 @@ def test_nan_reduce_method(self): result = dt.mean() assert_equal(result, expected) + @pytest.mark.xfail(reason="cum methods not implemented yet") def test_cum_method(self): ds = xr.Dataset({"a": ("x", [1, 2, 3])}) dt = DataTree.from_dict({"/": ds, "/results": ds}) @@ -1610,6 +1617,7 @@ def test_cum_method(self): class TestOps: + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_binary_op_on_int(self): ds1 = xr.Dataset({"a": [5], "b": [3]}) ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) @@ -1621,6 +1629,7 @@ def test_binary_op_on_int(self): result: DataTree = dt * 5 # type: ignore[assignment,operator] assert_equal(result, expected) + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_binary_op_on_dataset(self): ds1 = xr.Dataset({"a": [5], "b": [3]}) ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) @@ -1643,6 +1652,7 @@ def test_binary_op_on_dataset(self): result = dt * other_ds assert_equal(result, expected) + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_binary_op_on_datatree(self): ds1 = xr.Dataset({"a": [5], "b": [3]}) ds2 = xr.Dataset({"x": [0.1, 0.2], "y": [10, 20]}) @@ -1655,6 +1665,7 @@ def test_binary_op_on_datatree(self): result = dt * dt # type: ignore[operator] assert_equal(result, expected) + @pytest.mark.xfail(reason="arithmetic not implemented yet") def test_arithmetic_inherited_coords(self): tree = DataTree(xr.Dataset(coords={"x": [1, 2, 
3]})) tree["/foo"] = DataTree(xr.Dataset({"bar": ("x", [4, 5, 6])})) @@ -1669,6 +1680,8 @@ def test_arithmetic_inherited_coords(self): class TestUFuncs: + + @pytest.mark.xfail(reason="__array_ufunc__ not implemented yet") def test_tree(self, create_test_datatree): dt = create_test_datatree() expected = create_test_datatree(modify=lambda ds: np.sin(ds)) From 2ddf24115db5462a78ba86b570a73c7c474ced93 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 11:56:41 +0900 Subject: [PATCH 2/5] groups docstring --- xarray/core/datatree.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index bc04b855a4c..57b7b087d95 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -1468,7 +1468,7 @@ def pipe( @property def groups(self): - """Return all netCDF4 groups in the tree, given as a tuple of path-like strings.""" + """Return all groups in the tree, given as a tuple of path-like strings.""" return tuple(node.path for node in self.subtree) def to_netcdf( From b3be8781a31a1304b8fb9a0f404c202df6e47e1d Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 21:47:45 +0900 Subject: [PATCH 3/5] comment out removed DataTree methods --- doc/api.rst | 269 ++++++++++++++++++++++++++-------------------------- 1 file changed, 135 insertions(+), 134 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 87f116514cc..c1e3c09c77b 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -725,17 +725,18 @@ Manipulate the contents of all nodes in a ``DataTree`` simultaneously. :toctree: generated/ DataTree.copy - DataTree.assign_coords - DataTree.merge - DataTree.rename - DataTree.rename_vars - DataTree.rename_dims - DataTree.swap_dims - DataTree.expand_dims - DataTree.drop_vars - DataTree.drop_dims - DataTree.set_coords - DataTree.reset_coords + + .. DataTree.assign_coords + .. DataTree.merge + .. DataTree.rename + .. DataTree.rename_vars + .. DataTree.rename_dims + .. DataTree.swap_dims + .. 
DataTree.expand_dims + .. DataTree.drop_vars + .. DataTree.drop_dims + .. DataTree.set_coords + .. DataTree.reset_coords DataTree Node Contents ---------------------- @@ -760,129 +761,129 @@ Compare one ``DataTree`` object to another. DataTree.equals DataTree.identical -Indexing --------- - -Index into all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.isel - DataTree.sel - DataTree.drop_sel - DataTree.drop_isel - DataTree.head - DataTree.tail - DataTree.thin - DataTree.squeeze - DataTree.interp - DataTree.interp_like - DataTree.reindex - DataTree.reindex_like - DataTree.set_index - DataTree.reset_index - DataTree.reorder_levels - DataTree.query - -.. - - Missing: - ``DataTree.loc`` - - -Missing Value Handling ----------------------- - -.. autosummary:: - :toctree: generated/ - - DataTree.isnull - DataTree.notnull - DataTree.combine_first - DataTree.dropna - DataTree.fillna - DataTree.ffill - DataTree.bfill - DataTree.interpolate_na - DataTree.where - DataTree.isin - -Computation ------------ - -Apply a computation to the data in all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.map - DataTree.reduce - DataTree.diff - DataTree.quantile - DataTree.differentiate - DataTree.integrate - DataTree.map_blocks - DataTree.polyfit - DataTree.curvefit - -Aggregation ------------ - -Aggregate data in all nodes in the subtree simultaneously. - -.. autosummary:: - :toctree: generated/ - - DataTree.all - DataTree.any - DataTree.argmax - DataTree.argmin - DataTree.idxmax - DataTree.idxmin - DataTree.max - DataTree.min - DataTree.mean - DataTree.median - DataTree.prod - DataTree.sum - DataTree.std - DataTree.var - DataTree.cumsum - DataTree.cumprod - -ndarray methods ---------------- - -Methods copied from :py:class:`numpy.ndarray` objects, here applying to the data in all nodes in the subtree. - -.. 
autosummary:: - :toctree: generated/ - - DataTree.argsort - DataTree.astype - DataTree.clip - DataTree.conj - DataTree.conjugate - DataTree.round - DataTree.rank - -Reshaping and reorganising --------------------------- - -Reshape or reorganise the data in all nodes in the subtree. - -.. autosummary:: - :toctree: generated/ - - DataTree.transpose - DataTree.stack - DataTree.unstack - DataTree.shift - DataTree.roll - DataTree.pad - DataTree.sortby - DataTree.broadcast_like +.. Indexing +.. -------- + +.. Index into all nodes in the subtree simultaneously. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.isel +.. DataTree.sel +.. DataTree.drop_sel +.. DataTree.drop_isel +.. DataTree.head +.. DataTree.tail +.. DataTree.thin +.. DataTree.squeeze +.. DataTree.interp +.. DataTree.interp_like +.. DataTree.reindex +.. DataTree.reindex_like +.. DataTree.set_index +.. DataTree.reset_index +.. DataTree.reorder_levels +.. DataTree.query + +.. .. + +.. Missing: +.. ``DataTree.loc`` + + +.. Missing Value Handling +.. ---------------------- + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.isnull +.. DataTree.notnull +.. DataTree.combine_first +.. DataTree.dropna +.. DataTree.fillna +.. DataTree.ffill +.. DataTree.bfill +.. DataTree.interpolate_na +.. DataTree.where +.. DataTree.isin + +.. Computation +.. ----------- + +.. Apply a computation to the data in all nodes in the subtree simultaneously. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.map +.. DataTree.reduce +.. DataTree.diff +.. DataTree.quantile +.. DataTree.differentiate +.. DataTree.integrate +.. DataTree.map_blocks +.. DataTree.polyfit +.. DataTree.curvefit + +.. Aggregation +.. ----------- + +.. Aggregate data in all nodes in the subtree simultaneously. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.all +.. DataTree.any +.. DataTree.argmax +.. DataTree.argmin +.. DataTree.idxmax +.. DataTree.idxmin +.. DataTree.max +.. DataTree.min +.. DataTree.mean +.. 
DataTree.median +.. DataTree.prod +.. DataTree.sum +.. DataTree.std +.. DataTree.var +.. DataTree.cumsum +.. DataTree.cumprod + +.. ndarray methods +.. --------------- + +.. Methods copied from :py:class:`numpy.ndarray` objects, here applying to the data in all nodes in the subtree. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.argsort +.. DataTree.astype +.. DataTree.clip +.. DataTree.conj +.. DataTree.conjugate +.. DataTree.round +.. DataTree.rank + +.. Reshaping and reorganising +.. -------------------------- + +.. Reshape or reorganise the data in all nodes in the subtree. + +.. .. autosummary:: +.. :toctree: generated/ + +.. DataTree.transpose +.. DataTree.stack +.. DataTree.unstack +.. DataTree.shift +.. DataTree.roll +.. DataTree.pad +.. DataTree.sortby +.. DataTree.broadcast_like IO / Conversion =============== From 4259f1e09584cd49ae1d313ef87ab3f699ce3620 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 6 Oct 2024 21:59:05 +0900 Subject: [PATCH 4/5] update quick overview on DataTree --- doc/getting-started-guide/quick-overview.rst | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/getting-started-guide/quick-overview.rst b/doc/getting-started-guide/quick-overview.rst index 5efe3acc609..050de3dcb1c 100644 --- a/doc/getting-started-guide/quick-overview.rst +++ b/doc/getting-started-guide/quick-overview.rst @@ -314,19 +314,23 @@ And you can get a copy of just the node local values of :py:class:`~xarray.Datas ds_node_local = dt["simulation/coarse"].to_dataset(inherited=False) ds_node_local -Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by: +We intend to eventually implement most :py:class:`~xarray.Dataset` methods +(indexing, aggregation, arithmetic, etc) on :py:class:`~xarray.DataTree` +objects, but many methods have not been implemented yet. -.. ipython:: python +.. 
Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by: + +.. .. ipython:: python - avg = dt["simulation"].mean(dim="x") - avg +.. avg = dt["simulation"].mean(dim="x") +.. avg -Here the ``"x"`` dimension used is always the one local to that subgroup. +.. Here the ``"x"`` dimension used is always the one local to that subgroup. -You can do almost everything you can do with :py:class:`~xarray.Dataset` objects with :py:class:`~xarray.DataTree` objects -(including indexing and arithmetic), as operations will be mapped over every subgroup in the tree. -This allows you to work with multiple groups of non-alignable variables at once. +.. You can do almost everything you can do with :py:class:`~xarray.Dataset` objects with :py:class:`~xarray.DataTree` objects +.. (including indexing and arithmetic), as operations will be mapped over every subgroup in the tree. +.. This allows you to work with multiple groups of non-alignable variables at once. .. note:: From a523d59f052fbd0fbd0ec980a835325101de8670 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 7 Oct 2024 20:17:23 +0900 Subject: [PATCH 5/5] doc fixes suggested by Tom --- doc/api.rst | 28 ++++++++++---------- doc/getting-started-guide/quick-overview.rst | 12 +++++---- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index c1e3c09c77b..63fb59bc5e0 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -705,16 +705,16 @@ Pathlib-like Interface DataTree.parents DataTree.relative_to -Missing: +.. Missing: -.. +.. .. - ``DataTree.glob`` - ``DataTree.joinpath`` - ``DataTree.with_name`` - ``DataTree.walk`` - ``DataTree.rename`` - ``DataTree.replace`` +.. ``DataTree.glob`` +.. ``DataTree.joinpath`` +.. ``DataTree.with_name`` +.. ``DataTree.walk`` +.. ``DataTree.rename`` +.. ``DataTree.replace`` DataTree Contents ----------------- @@ -962,10 +962,10 @@ DataTree methods DataTree.to_netcdf DataTree.to_zarr -.. +.. .. 
- Missing: - ``open_mfdatatree`` +.. Missing: +.. ``open_mfdatatree`` Coordinates objects =================== @@ -1477,10 +1477,10 @@ Advanced API backends.list_engines backends.refresh_engines -.. +.. .. - Missing: - ``DataTree.set_close`` +.. Missing: +.. ``DataTree.set_close`` Default, pandas-backed indexes built-in Xarray: diff --git a/doc/getting-started-guide/quick-overview.rst b/doc/getting-started-guide/quick-overview.rst index 050de3dcb1c..fbe81b2e895 100644 --- a/doc/getting-started-guide/quick-overview.rst +++ b/doc/getting-started-guide/quick-overview.rst @@ -314,9 +314,11 @@ And you can get a copy of just the node local values of :py:class:`~xarray.Datas ds_node_local = dt["simulation/coarse"].to_dataset(inherited=False) ds_node_local -We intend to eventually implement most :py:class:`~xarray.Dataset` methods -(indexing, aggregation, arithmetic, etc) on :py:class:`~xarray.DataTree` -objects, but many methods have not been implemented yet. +.. note:: + + We intend to eventually implement most :py:class:`~xarray.Dataset` methods + (indexing, aggregation, arithmetic, etc) on :py:class:`~xarray.DataTree` + objects, but many methods have not been implemented yet. .. Operations map over subtrees, so we can take a mean over the ``x`` dimension of both the ``fine`` and ``coarse`` groups just by: @@ -332,9 +334,9 @@ objects, but many methods have not been implemented yet. .. (including indexing and arithmetic), as operations will be mapped over every subgroup in the tree. .. This allows you to work with multiple groups of non-alignable variables at once. -.. note:: +.. tip:: - If all of your variables are mutually alignable (i.e. they live on the same + If all of your variables are mutually alignable (i.e., they live on the same grid, such that every common dimension name maps to the same length), then you probably don't need :py:class:`xarray.DataTree`, and should consider just sticking with :py:class:`xarray.Dataset`.