Skip to content

Commit 3967351

Browse files
aladinorTomNicholaskmuehlbauer
authored
open_datatree performance improvement on NetCDF, H5, and Zarr files (#9014)
* open_datatree performance improvement on NetCDF files
* fixing issue with forward slashes
* fixing issue with pytest
* open datatree in zarr format improvement
* fixing incompatibility in returned object
* passing group parameter to opendatatree method and reducing duplicated code
* passing group parameter to opendatatree method - NetCDF
* Update xarray/backends/netCDF4_.py renaming variables
  Co-authored-by: Tom Nicholas <[email protected]>
* renaming variables
* renaming variables
* renaming group_store variable
* removing _open_datatree_netcdf function not used anymore in open_datatree implementations
* improving performance of open_datatree method
* renaming 'i' variable within list comprehension in open_store method for zarr datatree
* using the default generator instead of loading zarr groups in memory
* fixing issue with group path to avoid using group[1:] notation. Adding group variable typing hints (str | Iterable[str] | callable) under the open_datatree for h5 files. Finally, separating positional from keyword args
* fixing issue with group path to avoid using group[1:] notation and adding group variable typing hints (str | Iterable[str] | callable) under the open_datatree method for netCDF files
* fixing issue with group path to avoid using group[1:] notation and adding group variable typing hints (str | Iterable[str] | callable) under the open_datatree method for zarr files
* adding 'mode' parameter to open_datatree method
* adding 'mode' parameter to H5NetCDFStore.open method
* adding new entry related to open_datatree performance improvement
* adding new entry related to open_datatree performance improvement
* Getting rid of unnecessary parameters for 'open_datatree' method for netCDF4 and Hdf5 backends
---------
Co-authored-by: Tom Nicholas <[email protected]>
Co-authored-by: Kai Mühlbauer <[email protected]>
1 parent cb3663d commit 3967351

File tree

5 files changed

+287
-128
lines changed

5 files changed

+287
-128
lines changed

doc/whats-new.rst

+2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ Performance
3030
By `Deepak Cherian <https://github.com/dcherian>`_.
3131
- Small optimizations to help reduce the indexing time of datasets (:pull:`9002`).
3232
By `Mark Harfouche <https://github.com/hmaarrfk>`_.
33+
- Performance improvement in the ``open_datatree`` method for Zarr, netCDF4 and h5netcdf backends (:issue:`8994`, :pull:`9014`).
34+
By `Alfonso Ladino <https://github.com/aladinor>`_.
3335

3436

3537
Breaking changes

xarray/backends/common.py

-30
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,6 @@
1919
if TYPE_CHECKING:
2020
from io import BufferedIOBase
2121

22-
from h5netcdf.legacyapi import Dataset as ncDatasetLegacyH5
23-
from netCDF4 import Dataset as ncDataset
24-
2522
from xarray.core.dataset import Dataset
2623
from xarray.core.datatree import DataTree
2724
from xarray.core.types import NestedSequence
@@ -131,33 +128,6 @@ def _decode_variable_name(name):
131128
return name
132129

133130

134-
def _open_datatree_netcdf(
135-
ncDataset: ncDataset | ncDatasetLegacyH5,
136-
filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
137-
**kwargs,
138-
) -> DataTree:
139-
from xarray.backends.api import open_dataset
140-
from xarray.core.datatree import DataTree
141-
from xarray.core.treenode import NodePath
142-
143-
ds = open_dataset(filename_or_obj, **kwargs)
144-
tree_root = DataTree.from_dict({"/": ds})
145-
with ncDataset(filename_or_obj, mode="r") as ncds:
146-
for path in _iter_nc_groups(ncds):
147-
subgroup_ds = open_dataset(filename_or_obj, group=path, **kwargs)
148-
149-
# TODO refactor to use __setitem__ once creation of new nodes by assigning Dataset works again
150-
node_name = NodePath(path).name
151-
new_node: DataTree = DataTree(name=node_name, data=subgroup_ds)
152-
tree_root._set_item(
153-
path,
154-
new_node,
155-
allow_overwrite=False,
156-
new_nodes_along_path=True,
157-
)
158-
return tree_root
159-
160-
161131
def _iter_nc_groups(root, parent="/"):
162132
from xarray.core.treenode import NodePath
163133

xarray/backends/h5netcdf_.py

+50-4
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,14 @@
33
import functools
44
import io
55
import os
6-
from collections.abc import Iterable
6+
from collections.abc import Callable, Iterable
77
from typing import TYPE_CHECKING, Any
88

99
from xarray.backends.common import (
1010
BACKEND_ENTRYPOINTS,
1111
BackendEntrypoint,
1212
WritableCFDataStore,
1313
_normalize_path,
14-
_open_datatree_netcdf,
1514
find_root_and_group,
1615
)
1716
from xarray.backends.file_manager import CachingFileManager, DummyFileManager
@@ -431,11 +430,58 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
431430
def open_datatree(
    self,
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    *,
    mask_and_scale=True,
    decode_times=True,
    concat_characters=True,
    decode_coords=True,
    drop_variables: str | Iterable[str] | None = None,
    use_cftime=None,
    decode_timedelta=None,
    group: str | Iterable[str] | Callable | None = None,
    **kwargs,
) -> DataTree:
    """Open a file through h5netcdf and return its groups as a ``DataTree``.

    The file is opened once with ``H5NetCDFStore.open``; every sub-group is
    then read through a new ``H5NetCDFStore`` built on the file manager of
    that first store, instead of re-opening the file for each group.

    Parameters
    ----------
    filename_or_obj
        Target to open; it is passed through ``_normalize_path`` first.
    mask_and_scale, decode_times, concat_characters, decode_coords, \
drop_variables, use_cftime, decode_timedelta
        Decoding options, applied identically to the root dataset and to
        every sub-group dataset.
    group
        Optional group to use as the root of the returned tree. When it is
        truthy, the tree is rooted at ``NodePath("/") / NodePath(group)``.
    **kwargs
        Forwarded to ``open_dataset`` for the root dataset and to the
        ``H5NetCDFStore`` constructor for each sub-group.

    Returns
    -------
    DataTree
        Tree holding the root dataset plus one node per group yielded by
        ``_iter_nc_groups``.
    """
    from xarray.backends.api import open_dataset
    from xarray.backends.common import _iter_nc_groups
    from xarray.core.datatree import DataTree
    from xarray.core.treenode import NodePath
    from xarray.core.utils import close_on_error

    # Collected once so that the root and all sub-groups are decoded with
    # exactly the same options.
    decode_options = dict(
        mask_and_scale=mask_and_scale,
        decode_times=decode_times,
        concat_characters=concat_characters,
        decode_coords=decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )

    filename_or_obj = _normalize_path(filename_or_obj)
    store = H5NetCDFStore.open(
        filename_or_obj,
        group=group,
    )
    if group:
        parent = NodePath("/") / NodePath(group)
    else:
        parent = NodePath("/")

    manager = store._manager
    # Guard the root store the same way the sub-group stores are guarded
    # below, and hand the decoding options to the root dataset as well;
    # previously only the sub-groups received them.
    with close_on_error(store):
        ds = open_dataset(store, **decode_options, **kwargs)
    tree_root = DataTree.from_dict({str(parent): ds})

    # Loop-invariant: a single entrypoint instance serves every group.
    store_entrypoint = StoreBackendEntrypoint()
    for path_group in _iter_nc_groups(store.ds, parent=parent):
        # NOTE(review): ``kwargs`` goes both to ``open_dataset`` above and to
        # the store constructor here -- confirm that every accepted keyword
        # is valid for both call sites.
        group_store = H5NetCDFStore(manager, group=path_group, **kwargs)
        with close_on_error(group_store):
            ds = store_entrypoint.open_dataset(group_store, **decode_options)
        new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
        tree_root._set_item(
            path_group,
            new_node,
            allow_overwrite=False,
            new_nodes_along_path=True,
        )
    return tree_root
439485

440486

441487
BACKEND_ENTRYPOINTS["h5netcdf"] = ("h5netcdf", H5netcdfBackendEntrypoint)

xarray/backends/netCDF4_.py

+49-4
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import functools
44
import operator
55
import os
6-
from collections.abc import Iterable
6+
from collections.abc import Callable, Iterable
77
from contextlib import suppress
88
from typing import TYPE_CHECKING, Any
99

@@ -16,7 +16,6 @@
1616
BackendEntrypoint,
1717
WritableCFDataStore,
1818
_normalize_path,
19-
_open_datatree_netcdf,
2019
find_root_and_group,
2120
robust_getitem,
2221
)
@@ -672,11 +671,57 @@ def open_dataset( # type: ignore[override] # allow LSP violation, not supporti
672671
def open_datatree(
    self,
    filename_or_obj: str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore,
    *,
    mask_and_scale=True,
    decode_times=True,
    concat_characters=True,
    decode_coords=True,
    drop_variables: str | Iterable[str] | None = None,
    use_cftime=None,
    decode_timedelta=None,
    group: str | Iterable[str] | Callable | None = None,
    **kwargs,
) -> DataTree:
    """Open a file through netCDF4 and return its groups as a ``DataTree``.

    The file is opened once with ``NetCDF4DataStore.open``; every sub-group
    is then read through a new ``NetCDF4DataStore`` built on the file manager
    of that first store, instead of re-opening the file for each group.

    Parameters
    ----------
    filename_or_obj
        Target to open; it is passed through ``_normalize_path`` first.
    mask_and_scale, decode_times, concat_characters, decode_coords, \
drop_variables, use_cftime, decode_timedelta
        Decoding options, applied identically to the root dataset and to
        every sub-group dataset.
    group
        Optional group to use as the root of the returned tree. When it is
        truthy, the tree is rooted at ``NodePath("/") / NodePath(group)``.
    **kwargs
        Forwarded to ``open_dataset`` for the root dataset and to the
        ``NetCDF4DataStore`` constructor for each sub-group.

    Returns
    -------
    DataTree
        Tree holding the root dataset plus one node per group yielded by
        ``_iter_nc_groups``.
    """
    from xarray.backends.api import open_dataset
    from xarray.backends.common import _iter_nc_groups
    from xarray.core.datatree import DataTree
    from xarray.core.treenode import NodePath

    # Imported at function scope so this method does not rely on a
    # module-level import of the helper.
    from xarray.core.utils import close_on_error

    # Collected once so that the root and all sub-groups are decoded with
    # exactly the same options.
    decode_options = dict(
        mask_and_scale=mask_and_scale,
        decode_times=decode_times,
        concat_characters=concat_characters,
        decode_coords=decode_coords,
        drop_variables=drop_variables,
        use_cftime=use_cftime,
        decode_timedelta=decode_timedelta,
    )

    filename_or_obj = _normalize_path(filename_or_obj)
    store = NetCDF4DataStore.open(
        filename_or_obj,
        group=group,
    )
    if group:
        parent = NodePath("/") / NodePath(group)
    else:
        parent = NodePath("/")

    manager = store._manager
    # Guard the root store the same way the sub-group stores are guarded
    # below, and hand the decoding options to the root dataset as well;
    # previously only the sub-groups received them.
    with close_on_error(store):
        ds = open_dataset(store, **decode_options, **kwargs)
    tree_root = DataTree.from_dict({str(parent): ds})

    # Loop-invariant: a single entrypoint instance serves every group.
    store_entrypoint = StoreBackendEntrypoint()
    for path_group in _iter_nc_groups(store.ds, parent=parent):
        # NOTE(review): ``kwargs`` goes both to ``open_dataset`` above and to
        # the store constructor here -- confirm that every accepted keyword
        # is valid for both call sites.
        group_store = NetCDF4DataStore(manager, group=path_group, **kwargs)
        with close_on_error(group_store):
            ds = store_entrypoint.open_dataset(group_store, **decode_options)
        new_node: DataTree = DataTree(name=NodePath(path_group).name, data=ds)
        tree_root._set_item(
            path_group,
            new_node,
            allow_overwrite=False,
            new_nodes_along_path=True,
        )
    return tree_root
680725

681726

682727
BACKEND_ENTRYPOINTS["netcdf4"] = ("netCDF4", NetCDF4BackendEntrypoint)

0 commit comments

Comments
 (0)