@@ -311,9 +311,9 @@ def _check_source_for_pattern(source, filename_pattern):
311
311
return False , None
312
312
313
313
@staticmethod
314
- def _add_var_to_ds (is2ds , ds , grp_path , wanted_groups_tiered , wanted_dict ):
314
+ def _add_vars_to_ds (is2ds , ds , grp_path , wanted_groups_tiered , wanted_dict ):
315
315
"""
316
- Add the new variable group to the dataset template.
316
+ Add the new variables in the group to the dataset template.
317
317
318
318
Parameters
319
319
----------
@@ -336,11 +336,9 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
336
336
Xarray Dataset with variables from the ds variable group added.
337
337
"""
338
338
339
- wanted_vars = list (wanted_dict .keys ())
340
-
341
339
if grp_path in ["orbit_info" , "ancillary_data" ]:
342
340
grp_spec_vars = [
343
- wanted_vars [i ]
341
+ wanted_groups_tiered [ - 1 ] [i ]
344
342
for i , x in enumerate (wanted_groups_tiered [0 ])
345
343
if x == grp_path
346
344
]
@@ -389,9 +387,10 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
389
387
# add a test for the new function (called here)!
390
388
391
389
grp_spec_vars = [
392
- k for k , v in wanted_dict .items () if any (grp_path in x for x in v )
390
+ k
391
+ for k , v in wanted_dict .items ()
392
+ if any (f"{ grp_path } /{ k } " in x for x in v )
393
393
]
394
- # print(grp_spec_vars)
395
394
396
395
ds = (
397
396
ds .reset_coords (drop = False )
@@ -400,17 +399,57 @@ def _add_var_to_ds(is2ds, ds, grp_path, wanted_groups_tiered, wanted_dict):
400
399
.assign (gt = (("gran_idx" , "spot" ), [[gt_str ]]))
401
400
)
402
401
403
- # print(ds)
404
402
grp_spec_vars .append ("gt" )
405
403
is2ds = is2ds .merge (
406
404
ds [grp_spec_vars ], join = "outer" , combine_attrs = "no_conflicts"
407
405
)
408
- # print(is2ds)
409
406
410
407
# re-cast some dtypes to make array smaller
411
408
is2ds ["gt" ] = is2ds .gt .astype (str )
412
409
is2ds ["spot" ] = is2ds .spot .astype (np .uint8 )
413
410
411
+ return is2ds , ds [grp_spec_vars ]
412
+
413
@staticmethod
def _combine_nested_vars(is2ds, ds, grp_path, wanted_dict):
    """
    Attach the wanted variables found under a nested group path to a dataset.

    Parameters
    ----------
    is2ds : Xarray dataset
        Dataset that receives the deeply nested variables.
    ds : Xarray dataset
        Dataset read from ``grp_path``; the selected variables are taken from it.
    grp_path : str
        hdf5 group path that was read into ds.
    wanted_dict : dict
        Dictionary with variable names as keys and, as values, a list of
        group + variable paths containing those variables.

    Returns
    -------
    Xarray Dataset
        Result of assigning the selected variables from ds onto is2ds.
    """
    # A variable belongs to this group when "<grp_path>/<variable name>"
    # occurs inside at least one of the paths listed for that variable.
    nested_vars = []
    for var_name, var_paths in wanted_dict.items():
        target = f"{grp_path}/{var_name}"
        for var_path in var_paths:
            if target in var_path:
                nested_vars.append(var_name)
                break

    # NOTE(review): an earlier draft sketched extra per-variable attribute
    # handling for the most deeply nested group paths (to sidestep attribute
    # conflicts when merging); nothing of that kind is applied here.
    subset = ds[nested_vars]
    return is2ds.assign(subset)
415
454
416
455
def load (self ):
@@ -485,7 +524,7 @@ def _build_dataset_template(self, file):
485
524
)
486
525
return is2ds
487
526
488
- def _read_single_var (self , file , grp_path ):
527
+ def _read_single_grp (self , file , grp_path ):
489
528
"""
490
529
For a given file and variable group path, construct an Intake catalog and use it to read in the data.
491
530
@@ -519,12 +558,10 @@ def _read_single_var(self, file, grp_path):
519
558
grp_paths = grp_path ,
520
559
extra_engine_kwargs = {"phony_dims" : "access" },
521
560
)
522
-
523
561
ds = grpcat [self ._source_type ].read ()
524
562
525
563
return ds
526
564
527
- # NOTE: for non-gridded datasets only
528
565
def _build_single_file_dataset (self , file , groups_list ):
529
566
"""
530
567
Create a single xarray dataset with all of the wanted variables/groups from the wanted var list for a single data file/url.
@@ -544,7 +581,7 @@ def _build_single_file_dataset(self, file, groups_list):
544
581
Xarray Dataset
545
582
"""
546
583
547
- file_product = self ._read_single_var (file , "/" ).attrs ["identifier_product_type" ]
584
+ file_product = self ._read_single_grp (file , "/" ).attrs ["identifier_product_type" ]
548
585
assert (
549
586
file_product == self ._prod
550
587
), "Your product specification does not match the product specification within your files."
@@ -577,13 +614,30 @@ def _build_single_file_dataset(self, file, groups_list):
577
614
wanted_groups_set = set (wanted_groups )
578
615
# orbit_info is used automatically as the first group path so the info is available for the rest of the groups
579
616
wanted_groups_set .remove ("orbit_info" )
617
+ # Note: the sorting is critical for datasets with highly nested groups
618
+ wanted_groups_list = ["orbit_info" ] + sorted (wanted_groups_set )
580
619
# returns the wanted groups as a list of lists with group path string elements separated
581
- _ , wanted_groups_tiered = Variables .parse_var_list (groups_list , tiered = True )
620
+ _ , wanted_groups_tiered = Variables .parse_var_list (
621
+ groups_list , tiered = True , tiered_vars = True
622
+ )
582
623
583
- for grp_path in ["orbit_info" ] + list (wanted_groups_set ):
584
- ds = self ._read_single_var (file , grp_path )
585
- is2ds = Read ._add_var_to_ds (
624
+ while wanted_groups_list :
625
+ grp_path = wanted_groups_list [0 ]
626
+ wanted_groups_list = wanted_groups_list [1 :]
627
+ ds = self ._read_single_grp (file , grp_path )
628
+ is2ds , ds = Read ._add_vars_to_ds (
586
629
is2ds , ds , grp_path , wanted_groups_tiered , wanted_dict
587
630
)
588
631
632
+ # if there are any deeper nested variables, get those so they have actual coordinates and add them
633
+ if any (grp_path in grp_path2 for grp_path2 in wanted_groups_list ):
634
+ for grp_path2 in wanted_groups_list :
635
+ if grp_path in grp_path2 :
636
+ sub_ds = self ._read_single_grp (file , grp_path2 )
637
+ ds = Read ._combine_nested_vars (
638
+ ds , sub_ds , grp_path2 , wanted_dict
639
+ )
640
+ wanted_groups_list .remove (grp_path2 )
641
+ is2ds = is2ds .merge (ds , join = "outer" , combine_attrs = "no_conflicts" )
642
+
589
643
return is2ds
0 commit comments