Skip to content

Commit c525715

Browse files
add read-in functionality for deeply nested variables (e.g. ATL08) (#281)
Co-authored-by: GitHub Action <[email protected]>
1 parent aa61e3f commit c525715

File tree

7 files changed

+261
-196
lines changed

7 files changed

+261
-196
lines changed

doc/source/example_notebooks/IS2_data_read-in.ipynb

+1-1
Original file line numberDiff line numberDiff line change
@@ -426,7 +426,7 @@
426426
"\n",
427427
"***ATTENTION: icepyx loads your data by creating an Xarray DataSet for each input granule and then merging them. In some cases, the automatic merge fails and needs to be handled manually. In these cases, icepyx will return a warning with the error message from the failed Xarray merge and a list of per-granule DataSets***\n",
428428
"\n",
429-
"This can happen if you unintentionally provide the same granule multiple times with different filenames."
429+
"This can happen if you unintentionally provide the same granule multiple times with different filenames or in segmented products where the rgt+cycle automatically generated `gran_idx` values match. In this latter case, you can simply provide unique `gran_idx` values for each DataSet in `ds` and run `import xarray as xr` and `ds_merged = xr.merge(ds)` to create one merged DataSet."
430430
]
431431
},
432432
{

doc/source/user_guide/documentation/classes_dev_uml.svg

+172-171
Loading

doc/source/user_guide/documentation/classes_user_uml.svg

+3-3
Loading

icepyx/core/is2ref.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def _default_varlists(product):
259259

260260
else:
261261
print(
262-
"THE REQUESTED PRODUCT DOES NOT YET HAVE A DEFAULT LIST SET UP. ONLY DELTA_TIME, LATITUTDE, AND LONGITUDE WILL BE RETURNED"
262+
"THE REQUESTED PRODUCT DOES NOT YET HAVE A DEFAULT LIST SET UP. ONLY DELTA_TIME, LATITUDE, AND LONGITUDE WILL BE RETURNED"
263263
)
264264
return common_list
265265

icepyx/core/query.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,7 @@ def download_granules(
10171017
by default when subset=True, but additional subsetting options are available.
10181018
Spatial subsetting returns all data that are within the area of interest (but not complete
10191019
granules. This eliminates false-positive granules returned by the metadata-level search)
1020-
restart: boolean, default false
1020+
restart : boolean, default false
10211021
If previous download was terminated unexpectedly. Run again with restart set to True to continue.
10221022
**kwargs : key-value pairs
10231023
Additional parameters to be passed to the subsetter.

icepyx/core/read.py

+71-17
Original file line numberDiff line numberDiff line change
@@ -311,9 +311,9 @@ def _check_source_for_pattern(source, filename_pattern):
311311
return False, None
312312

313313
@staticmethod
314-
def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
314+
def _add_vars_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
315315
"""
316-
Add the new variable group to the dataset template.
316+
Add the new variables in the group to the dataset template.
317317
318318
Parameters
319319
----------
@@ -336,11 +336,9 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
336336
Xarray Dataset with variables from the ds variable group added.
337337
"""
338338

339-
wanted_vars = list(wanted_dict.keys())
340-
341339
if grp_path in ["orbit_info", "ancillary_data"]:
342340
grp_spec_vars = [
343-
wanted_vars[i]
341+
wanted_groups_tiered[-1][i]
344342
for i, x in enumerate(wanted_groups_tiered[0])
345343
if x == grp_path
346344
]
@@ -389,9 +387,10 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
389387
# add a test for the new function (called here)!
390388

391389
grp_spec_vars = [
392-
k for k, v in wanted_dict.items() if any(grp_path in x for x in v)
390+
k
391+
for k, v in wanted_dict.items()
392+
if any(f"{grp_path}/{k}" in x for x in v)
393393
]
394-
# print(grp_spec_vars)
395394

396395
ds = (
397396
ds.reset_coords(drop=False)
@@ -400,17 +399,57 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
400399
.assign(gt=(("gran_idx", "spot"), [[gt_str]]))
401400
)
402401

403-
# print(ds)
404402
grp_spec_vars.append("gt")
405403
is2ds = is2ds.merge(
406404
ds[grp_spec_vars], join="outer", combine_attrs="no_conflicts"
407405
)
408-
# print(is2ds)
409406

410407
# re-cast some dtypes to make array smaller
411408
is2ds["gt"] = is2ds.gt.astype(str)
412409
is2ds["spot"] = is2ds.spot.astype(np.uint8)
413410

411+
return is2ds, ds[grp_spec_vars]
412+
413+
@staticmethod
414+
def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict):
415+
"""
416+
Add the deeply nested variables to a dataset with appropriate coordinate information.
417+
418+
Parameters
419+
----------
420+
is2ds : Xarray dataset
421+
Dataset to add deeply nested variables to.
422+
ds : Xarray dataset
423+
Dataset containing proper dimensions for the variables being added
424+
grp_path : str
425+
hdf5 group path read into ds
426+
wanted_dict : dict
427+
Dictionary with variable names as keys and a list of group + variable paths containing those variables as values.
428+
429+
Returns
430+
-------
431+
Xarray Dataset with variables from the ds variable group added.
432+
"""
433+
434+
grp_spec_vars = [
435+
k for k, v in wanted_dict.items() if any(f"{grp_path}/{k}" in x for x in v)
436+
]
437+
438+
# # Use this to handle issues specific to group paths that are more nested
439+
# tiers = len(wanted_groups_tiered)
440+
# if tiers > 3 and grp_path.count("/") == tiers - 2:
441+
# # Handle attribute conflicts that arose from data descriptions during merging
442+
# for var in grp_spec_vars:
443+
# ds[var].attrs = ds.attrs
444+
# for k in ds[var].attrs.keys():
445+
# ds.attrs.pop(k)
446+
# # warnings.warn(
447+
# # "Due to the number of layers of variable group paths, some attributes have been dropped from your DataSet during merging",
448+
# # UserWarning,
449+
# # )
450+
451+
is2ds = is2ds.assign(ds[grp_spec_vars])
452+
414453
return is2ds
415454

416455
def load(self):
@@ -485,7 +524,7 @@ def _build_dataset_template(self, file):
485524
)
486525
return is2ds
487526

488-
def _read_single_var(self, file, grp_path):
527+
def _read_single_grp(self, file, grp_path):
489528
"""
490529
For a given file and variable group path, construct an Intake catalog and use it to read in the data.
491530
@@ -519,12 +558,10 @@ def _read_single_var(self, file, grp_path):
519558
grp_paths=grp_path,
520559
extra_engine_kwargs={"phony_dims": "access"},
521560
)
522-
523561
ds = grpcat[self._source_type].read()
524562

525563
return ds
526564

527-
# NOTE: for non-gridded datasets only
528565
def _build_single_file_dataset(self, file, groups_list):
529566
"""
530567
Create a single xarray dataset with all of the wanted variables/groups from the wanted var list for a single data file/url.
@@ -544,7 +581,7 @@ def _build_single_file_dataset(self, file, groups_list):
544581
Xarray Dataset
545582
"""
546583

547-
file_product = self._read_single_var(file, "/").attrs["identifier_product_type"]
584+
file_product = self._read_single_grp(file, "/").attrs["identifier_product_type"]
548585
assert (
549586
file_product == self._prod
550587
), "Your product specification does not match the product specification within your files."
@@ -577,13 +614,30 @@ def _build_single_file_dataset(self, file, groups_list):
577614
wanted_groups_set = set(wanted_groups)
578615
# orbit_info is used automatically as the first group path so the info is available for the rest of the groups
579616
wanted_groups_set.remove("orbit_info")
617+
# Note: the sorting is critical for datasets with highly nested groups
618+
wanted_groups_list = ["orbit_info"] + sorted(wanted_groups_set)
580619
# returns the wanted groups as a list of lists with group path string elements separated
581-
_, wanted_groups_tiered = Variables.parse_var_list(groups_list, tiered=True)
620+
_, wanted_groups_tiered = Variables.parse_var_list(
621+
groups_list, tiered=True, tiered_vars=True
622+
)
582623

583-
for grp_path in ["orbit_info"] + list(wanted_groups_set):
584-
ds = self._read_single_var(file, grp_path)
585-
is2ds = Read._add_var_to_ds(
624+
while wanted_groups_list:
625+
grp_path = wanted_groups_list[0]
626+
wanted_groups_list = wanted_groups_list[1:]
627+
ds = self._read_single_grp(file, grp_path)
628+
is2ds, ds = Read._add_vars_to_ds(
586629
is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict
587630
)
588631

632+
# if there are any deeper nested variables, get those so they have actual coordinates and add them
633+
if any(grp_path in grp_path2 for grp_path2 in wanted_groups_list):
634+
for grp_path2 in wanted_groups_list:
635+
if grp_path in grp_path2:
636+
sub_ds = self._read_single_grp(file, grp_path2)
637+
ds = Read._combine_nested_vars(
638+
ds, sub_ds, grp_path2, wanted_dict
639+
)
640+
wanted_groups_list.remove(grp_path2)
641+
is2ds = is2ds.merge(ds, join="outer", combine_attrs="no_conflicts")
642+
589643
return is2ds

icepyx/core/variables.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def visitor_func(name, node):
135135
return self._avail
136136

137137
@staticmethod
138-
def parse_var_list(varlist, tiered=True):
138+
def parse_var_list(varlist, tiered=True, tiered_vars=False):
139139
"""
140140
Parse a list of path strings into tiered lists and names of variables
141141
@@ -149,6 +149,11 @@ def parse_var_list(varlist, tiered=True):
149149
(e.g. [['orbit_info', 'ancillary_data', 'gt1l'],['none','none','land_ice_segments']])
150150
or a single list of path strings (e.g. ['orbit_info','ancillary_data','gt1l/land_ice_segments'])
151151
152+
tiered_vars : boolean, default False
153+
Whether or not to append a list of the variable names to the nested list of component strings
154+
(e.g. [['orbit_info', 'ancillary_data', 'gt1l'],['none','none','land_ice_segments'],
155+
['sc_orient','atlas_sdp_gps_epoch','h_li']]))
156+
152157
Examples
153158
--------
154159
>>> reg_a = ipx.Query('ATL06',[-55, 68, -48, 71],['2019-02-20','2019-02-28'], version='1') # doctest: +SKIP
@@ -215,7 +220,10 @@ def parse_var_list(varlist, tiered=True):
215220
else:
216221
num = np.max([v.count("/") for v in varlist])
217222
# print('max needed: ' + str(num))
218-
paths = [[] for i in range(num)]
223+
if tiered_vars == True:
224+
paths = [[] for i in range(num + 1)]
225+
else:
226+
paths = [[] for i in range(num)]
219227

220228
# print(self._cust_options['variables'])
221229
for vn in varlist:
@@ -237,6 +245,8 @@ def parse_var_list(varlist, tiered=True):
237245
for i in range(j, num):
238246
paths[i].append("none")
239247
i = i + 1
248+
if tiered_vars == True:
249+
paths[num].append(vkey)
240250

241251
return vgrp, paths
242252

0 commit comments

Comments (0)