Skip to content

Commit c525715

Browse files
add read-in functionality for deeply nested variables (e.g. ATL08) (#281)
Co-authored-by: GitHub Action <[email protected]>
1 parent aa61e3f commit c525715

File tree

7 files changed

+261
-196
lines changed

7 files changed

+261
-196
lines changed

doc/source/example_notebooks/IS2_data_read-in.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@
426426
"\n",
427427
"***ATTENTION: icepyx loads your data by creating an Xarray DataSet for each input granule and then merging them. In some cases, the automatic merge fails and needs to be handled manually. In these cases, icepyx will return a warning with the error message from the failed Xarray merge and a list of per-granule DataSets***\n",
428428
"\n",
429-
"This can happen if you unintentionally provide the same granule multiple times with different filenames."
429+
"This can happen if you unintentionally provide the same granule multiple times with different filenames or in segmented products where the rgt+cycle automatically generated `gran_idx` values match. In this latter case, you can simply provide unique `gran_idx` values for each DataSet in `ds` and run `import xarray as xr` and `ds_merged = xr.merge(ds)` to create one merged DataSet."
430430
]
431431
},
432432
{

doc/source/user_guide/documentation/classes_dev_uml.svg

+172-171
Loading

doc/source/user_guide/documentation/classes_user_uml.svg

+3-3
Loading

icepyx/core/is2ref.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def _default_varlists(product):
259259

260260
else:
261261
print(
262-
"THE REQUESTED PRODUCT DOES NOT YET HAVE A DEFAULT LIST SET UP. ONLY DELTA_TIME, LATITUTDE, AND LONGITUDE WILL BE RETURNED"
262+
"THE REQUESTED PRODUCT DOES NOT YET HAVE A DEFAULT LIST SET UP. ONLY DELTA_TIME, LATITUDE, AND LONGITUDE WILL BE RETURNED"
263263
)
264264
return common_list
265265

icepyx/core/query.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ def download_granules(
10171017
by default when subset=True, but additional subsetting options are available.
10181018
Spatial subsetting returns all data that are within the area of interest (but not complete
10191019
granules. This eliminates false-positive granules returned by the metadata-level search)
1020-
restart: boolean, default false
1020+
restart : boolean, default false
10211021
If previous download was terminated unexpectedly. Run again with restart set to True to continue.
10221022
**kwargs : key-value pairs
10231023
Additional parameters to be passed to the subsetter.

icepyx/core/read.py

+71-17
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,9 @@ def _check_source_for_pattern(source, filename_pattern):
311311
return False, None
312312

313313
@staticmethod
314-
def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
314+
def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
315315
"""
316-
Add the new variable group to the dataset template.
316+
Add the new variables in the group to the dataset template.
317317
318318
Parameters
319319
----------
@@ -336,11 +336,9 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
336336
Xarray Dataset with variables from the ds variable group added.
337337
"""
338338

339-
wanted_vars = list(wanted_dict.keys())
340-
341339
if grp_path in ["orbit_info", "ancillary_data"]:
342340
grp_spec_vars = [
343-
wanted_vars[i]
341+
wanted_groups_tiered[-1][i]
344342
for i, x in enumerate(wanted_groups_tiered[0])
345343
if x == grp_path
346344
]
@@ -389,9 +387,10 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
389387
# add a test for the new function (called here)!
390388

391389
grp_spec_vars = [
392-
k for k, v in wanted_dict.items() if any(grp_path in x for x in v)
390+
k
391+
for k, v in wanted_dict.items()
392+
if any(f"{grp_path}/{k}" in x for x in v)
393393
]
394-
# print(grp_spec_vars)
395394

396395
ds = (
397396
ds.reset_coords(drop=False)
@@ -400,17 +399,57 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
400399
.assign(gt=(("gran_idx", "spot"), [[gt_str]]))
401400
)
402401

403-
# print(ds)
404402
grp_spec_vars.append("gt")
405403
is2ds = is2ds.merge(
406404
ds[grp_spec_vars], join="outer", combine_attrs="no_conflicts"
407405
)
408-
# print(is2ds)
409406

410407
# re-cast some dtypes to make array smaller
411408
is2ds["gt"] = is2ds.gt.astype(str)
412409
is2ds["spot"] = is2ds.spot.astype(np.uint8)
413410

411+
return is2ds, ds[grp_spec_vars]
412+
413+
@staticmethod
414+
def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict):
415+
"""
416+
Add the deeply nested variables to a dataset with appropriate coordinate information.
417+
418+
Parameters
419+
----------
420+
is2ds : Xarray dataset
421+
Dataset to add deeply nested variables to.
422+
ds : Xarray dataset
423+
Dataset containing proper dimensions for the variables being added
424+
grp_path : str
425+
hdf5 group path read into ds
426+
wanted_dict : dict
427+
Dictionary with variable names as keys and a list of group + variable paths containing those variables as values.
428+
429+
Returns
430+
-------
431+
Xarray Dataset with variables from the ds variable group added.
432+
"""
433+
434+
grp_spec_vars = [
435+
k for k, v in wanted_dict.items() if any(f"{grp_path}/{k}" in x for x in v)
436+
]
437+
438+
# # Use this to handle issues specific to group paths that are more nested
439+
# tiers = len(wanted_groups_tiered)
440+
# if tiers > 3 and grp_path.count("/") == tiers - 2:
441+
# # Handle attribute conflicts that arose from data descriptions during merging
442+
# for var in grp_spec_vars:
443+
# ds[var].attrs = ds.attrs
444+
# for k in ds[var].attrs.keys():
445+
# ds.attrs.pop(k)
446+
# # warnings.warn(
447+
# # "Due to the number of layers of variable group paths, some attributes have been dropped from your DataSet during merging",
448+
# # UserWarning,
449+
# # )
450+
451+
is2ds = is2ds.assign(ds[grp_spec_vars])
452+
414453
return is2ds
415454

416455
def load(self):
@@ -485,7 +524,7 @@ def _build_dataset_template(self, file):
485524
)
486525
return is2ds
487526

488-
def _read_single_var(self, file, grp_path):
527+
def _read_single_grp(self, file, grp_path):
489528
"""
490529
For a given file and variable group path, construct an Intake catalog and use it to read in the data.
491530
@@ -519,12 +558,10 @@ def _read_single_var(self, file, grp_path):
519558
grp_paths=grp_path,
520559
extra_engine_kwargs={"phony_dims": "access"},
521560
)
522-
523561
ds = grpcat[self._source_type].read()
524562

525563
return ds
526564

527-
# NOTE: for non-gridded datasets only
528565
def _build_single_file_dataset(self, file, groups_list):
529566
"""
530567
Create a single xarray dataset with all of the wanted variables/groups from the wanted var list for a single data file/url.
@@ -544,7 +581,7 @@ def _build_single_file_dataset(self, file, groups_list):
544581
Xarray Dataset
545582
"""
546583

547-
file_product = self._read_single_var(file, "/").attrs["identifier_product_type"]
584+
file_product = self._read_single_grp(file, "/").attrs["identifier_product_type"]
548585
assert (
549586
file_product == self._prod
550587
), "Your product specification does not match the product specification within your files."
@@ -577,13 +614,30 @@ def _build_single_file_dataset(self, file, groups_list):
577614
wanted_groups_set = set(wanted_groups)
578615
# orbit_info is used automatically as the first group path so the info is available for the rest of the groups
579616
wanted_groups_set.remove("orbit_info")
617+
# Note: the sorting is critical for datasets with highly nested groups
618+
wanted_groups_list = ["orbit_info"] + sorted(wanted_groups_set)
580619
# returns the wanted groups as a list of lists with group path string elements separated
581-
_, wanted_groups_tiered = Variables.parse_var_list(groups_list, tiered=True)
620+
_, wanted_groups_tiered = Variables.parse_var_list(
621+
groups_list, tiered=True, tiered_vars=True
622+
)
582623

583-
for grp_path in ["orbit_info"] + list(wanted_groups_set):
584-
ds = self._read_single_var(file, grp_path)
585-
is2ds = Read._add_var_to_ds(
624+
while wanted_groups_list:
625+
grp_path = wanted_groups_list[0]
626+
wanted_groups_list = wanted_groups_list[1:]
627+
ds = self._read_single_grp(file, grp_path)
628+
is2ds, ds = Read._add_vars_to_ds(
586629
is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict
587630
)
588631

632+
# if there are any deeper nested variables, get those so they have actual coordinates and add them
633+
if any(grp_path in grp_path2 for grp_path2 in wanted_groups_list):
634+
for grp_path2 in wanted_groups_list:
635+
if grp_path in grp_path2:
636+
sub_ds = self._read_single_grp(file, grp_path2)
637+
ds = Read._combine_nested_vars(
638+
ds, sub_ds, grp_path2, wanted_dict
639+
)
640+
wanted_groups_list.remove(grp_path2)
641+
is2ds = is2ds.merge(ds, join="outer", combine_attrs="no_conflicts")
642+
589643
return is2ds

icepyx/core/variables.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def visitor_func(name, node):
135135
return self._avail
136136

137137
@staticmethod
138-
def parse_var_list(varlist, tiered=True):
138+
def parse_var_list(varlist, tiered=True, tiered_vars=False):
139139
"""
140140
Parse a list of path strings into tiered lists and names of variables
141141
@@ -149,6 +149,11 @@ def parse_var_list(varlist, tiered=True):
149149
(e.g. [['orbit_info', 'ancillary_data', 'gt1l'],['none','none','land_ice_segments']])
150150
or a single list of path strings (e.g. ['orbit_info','ancillary_data','gt1l/land_ice_segments'])
151151
152+
tiered_vars : boolean, default False
153+
Whether or not to append a list of the variable names to the nested list of component strings
154+
(e.g. [['orbit_info', 'ancillary_data', 'gt1l'],['none','none','land_ice_segments'],
155+
['sc_orient','atlas_sdp_gps_epoch','h_li']]))
156+
152157
Examples
153158
--------
154159
>>> reg_a = ipx.Query('ATL06',[-55, 68, -48, 71],['2019-02-20','2019-02-28'], version='1') # doctest: +SKIP
@@ -215,7 +220,10 @@ def parse_var_list(varlist, tiered=True):
215220
else:
216221
num = np.max([v.count("/") for v in varlist])
217222
# print('max needed: ' + str(num))
218-
paths = [[] for i in range(num)]
223+
if tiered_vars == True:
224+
paths = [[] for i in range(num + 1)]
225+
else:
226+
paths = [[] for i in range(num)]
219227

220228
# print(self._cust_options['variables'])
221229
for vn in varlist:
@@ -237,6 +245,8 @@ def parse_var_list(varlist, tiered=True):
237245
for i in range(j, num):
238246
paths[i].append("none")
239247
i = i + 1
248+
if tiered_vars == True:
249+
paths[num].append(vkey)
240250

241251
return vgrp, paths
242252

0 commit comments

Comments (0)