From 4605c780bb16909ca55c4136ea46c8f0c28f881d Mon Sep 17 00:00:00 2001 From: Stefano Piani Date: Thu, 5 Oct 2023 19:28:21 +0200 Subject: [PATCH] Improved DataExtractor class This commit fixes a bug in the DataExtractor class and improves its performance in some corner cases. Before this commit, the DataExtractor class failed if a variable had a dimension of size 1 on an axis with index i and there was an axis with index j of size > 1 such that j < i. This should be fixed by this commit. Moreover, it cleans up the code a little, making some statements more readable. Finally, it also improves the performance of the class when reading the first layer of a 3D variable (i.e., when the argument dimvar=2 is passed). Previously, it read the whole variable and then discarded all the layers other than layer 0. Now the routine reads only the required data. --- .gitignore | 1 + commons/dataextractor.py | 105 ++++++++++++++++++++++++--------------- 2 files changed, 65 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index a37b12ca..01e1c8c9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ tutorial/ .*project commitpico* +.idea diff --git a/commons/dataextractor.py b/commons/dataextractor.py index 2331b5ca..895fe8bd 100644 --- a/commons/dataextractor.py +++ b/commons/dataextractor.py @@ -4,6 +4,8 @@ import os import numpy as np import netCDF4 +import warnings + #Mask object from commons.mask import Mask @@ -39,14 +41,18 @@ def __init__(self, mask, filename=None, varname=None, rawdata=None, rawdatafill= Either rawdata or filename plus varname must be defined. 
""" - if not ((filename is None) ^ (rawdata is None)): - raise ValueError("Either rawdata or filename plus varname must be defined") - elif (not (filename is None)) and (varname is None): - raise ValueError("filename and varname must be both defined") + if filename is None and varname is None: + raise ValueError( + "At least one among filename and varname must be both defined" + ) + if filename is not None and rawdata is not None: + raise ValueError( + "filename and rawdata can not be submitted at the same time" + ) self.fill_value = fill_value - if (filename is None): + if filename is None: #Use rawdata self.__shape = rawdata.shape self.dims = len(rawdata.shape) @@ -71,45 +77,62 @@ def __init__(self, mask, filename=None, varname=None, rawdata=None, rawdatafill= if one_dimensions_ind[0]==0: self.__values = np.array(dset.variables[v])[0,:] - if one_dimensions_ind.size==2 : - if one_dimensions_ind[0] ==0 : - if one_dimensions_ind[1] ==0 : - self.__values = np.array(dset.variables[v])[0,0,:] - - self.__shape = self.__values.shape - self.dims = len(self.__values.shape) - - for attr_name in ["missing_value","fillvalue","fillValue","FillValue"]: - if attr_name in dset.variables[v].ncattrs(): - fv = getattr(dset.variables[v], attr_name) - self.__dset_fillvalue = fv - dset.close() - self.__filename = fn - self.__varname = v - if ((self.dims==3) & (dimvar==2)) : self.__values=self.__values[0,:] - except: - raise + nc_declared_shape = dset.variables[v].shape + + v_slices = [slice(None) for _ in range(len(nc_declared_shape))] + + # Remove useless dimension at the beginning of the vector + k = 0 + while k < len(v_slices): + if nc_declared_shape[k] == 1: + v_slices[k] = 0 + else: + break + k += 1 + + new_shape = nc_declared_shape[k:] + + if len(new_shape) == 3 and dimvar == 2: + v_slices[-3] = 0 + + self.__values = np.array(dset.variables[v][tuple(v_slices)]) + + self.__shape = self.__values.shape + self.dims = len(self.__values.shape) + + for attr_name in 
["missing_value","fillvalue","fillValue","FillValue"]: + if attr_name in dset.variables[v].ncattrs(): + fv = getattr(dset.variables[v], attr_name) + self.__dset_fillvalue = fv + + self.__filename = fn + self.__varname = v if not isinstance(mask, (Mask,)): raise ValueError("mask must be a Mask object") - else: - #test dimensions - if self.dims==3: - if self.__shape[1:] != mask.shape[1:]: raise ValueError("mask must have the same shape of the data") - data_jpk=self.__shape[0] - mask_jpk= mask.shape[0] - if data_jpk > mask_jpk: # working with reduced mask - if verbose: print('WARNING: slicing 3D field in range 0 -',mask_jpk ) - self.__values=self.values[:mask_jpk,:,:] - - if data_jpk < mask_jpk: - if (verbose) : print('WARNING: 3D file is a subset of mask domain') - appval = self.__values - self.__values = np.zeros(mask.shape) - self.__values[:self.__shape[0],:,:] = appval - self.__shape = self.__values.shape - if self.dims==2: - if self.__shape != mask.shape[1:] : raise ValueError("mask must have the same shape of the data") + + #test dimensions + if self.dims==3: + if self.__shape[1:] != mask.shape[1:]: + raise ValueError("mask must have the same shape of the data") + data_jpk=self.__shape[0] + mask_jpk= mask.shape[0] + if data_jpk > mask_jpk: # working with reduced mask + if verbose: + warnings.warn('WARNING: slicing 3D field in range 0 - {}'.format(mask_jpk)) + self.__values=self.values[:mask_jpk,:,:] + + if data_jpk < mask_jpk: + if verbose: + warnings.warn('WARNING: 3D file is a subset of mask domain') + appval = self.__values + self.__values = np.zeros(mask.shape) + self.__values[:self.__shape[0],:,:] = appval + self.__shape = self.__values.shape + if self.dims==2: + if self.__shape != mask.shape[1:]: + raise ValueError("mask must have the same shape of the data") + #Preserve mask reference self._mask = mask