next-exp · carhc · Dec 30, 2024 · Dec 30, 2024 · Jan 3, 2025 · Jan 3, 2025
diff --git a/invisible_cities/reco/krmap_functions.py b/invisible_cities/reco/krmap_functions.py
@@ -1,9 +1,17 @@
-import numpy  as np
-import pandas as pd
+import warnings
+import itertools
 
+import numpy        as np
+import pandas       as pd
+import scipy.stats  as stats
+
+
+from   typing               import Tuple, Optional
 from .. types.symbols       import KrFitFunction
 from .. evm.ic_containers   import FitFunction
 from .. core.fit_functions  import polynom, expo
+from ..core.core_functions  import in_range, shift_to_bin_centers
+from ..core.fit_functions   import fit, get_chi2_and_pvalue
 
 
 def lin_seed(x : np.array, y : np.array):
@@ -149,3 +157,326 @@ def transform_parameters(fit_output : FitFunction):
 
     return par, err, cov
 
+
+def create_df_kr_map(bins_x : np.array,
+                     bins_y : np.array,
+                     counts : np.array,
+                     n_min  : int,
+                     r_max  : float)->pd.DataFrame:
+    '''
+    This function creates the dataframe in which the map parameters are stored.
+
+    Parameters
+    ----------
+    bins_x : np.array
+        Bins in x axis
+    bins_y : np.array
+        Bins in y axis
+    counts : np.array
+        Number of events falling into each bin
+    n_min : int
+        Min number of events per bin required to perform fits
+    r_max : float
+        Radius defining the active area of the detector
+
+    Returns
+    -------
+    kr_map : pd.DataFrame
+        Kr map dataframe with all the info prior to the fits: bin label,
+        events per bin, bin in/outside the active volume, bin position
+        (X, Y, R), etc.
+    '''
+    n_xbins   = len(bins_x)-1
+    n_ybins   = len(bins_y)-1
+    b_center  = [shift_to_bin_centers(axis) for axis in [bins_x, bins_y]]
+
+    bin_index = range(n_xbins*n_ybins)
+    geom_comb = np.array(list(itertools.product(*b_center)))
+    r_values  = np.sum(geom_comb**2, axis=1)**0.5
+
+    kr_map    = pd.DataFrame(dict(bin            = bin_index,
+                                  counts         = counts,
+                                  X              = geom_comb.T[0],
+                                  Y              = geom_comb.T[1],
+                                  R              = r_values,
+                                  in_active      = r_values <= r_max,
+                                  has_min_counts = counts   >= n_min,
+                                  fit_success    = False,
+                                  valid          = False,
+                                  e0             = np.nan,
+                                  ue0            = np.nan,
+                                  lt             = np.nan,
+                                  ult            = np.nan,
+                                  cov            = np.nan,
+                                  res_std        = np.nan,
+                                  chi2           = np.nan,
+                                  pval           = np.nan))
+
+    return kr_map
+
+
+def get_number_of_bins(nevents : Optional[int] = None,
+                       thr     : Optional[int] = 1e6,
+                       n_bins  : Optional[np.array] = None)->np.array:
+    '''
+    Computes the number of XY bins to be used in the creation
+    of correction map regarding the number of selected events.
+
+    Parameters
+    ---------
+    nevents: int (optional)
+        Total number of provided events for the map computation.
+    thr: int (optional)
+        Event threshold to use 50x50 or 100x100 binning.
+    n_bins: int (optional)
+        The number of bins to use can be chosen a priori. If given,
+        the returned number of bins is the one provided by the user.
+        However, if no number of bins is given in advance, this will
+        automatically select a value depending on the amount of events
+        contained in the dst and the threshold.
+
+    Returns
+    -------
+    n_bins: int
+        Number of bins in each direction (X,Y) (square map).
+    '''
+    if    n_bins is not None: return n_bins;
+    elif  nevents < thr: return np.array([50, 50]);
+    else: return  np.array([100, 100]);
+
+
+def get_bin_counts_and_event_bin_id(dst    : pd.DataFrame,
+                                    bins_x : np.array,
+                                    bins_y : np.array):
+    '''
+    This function distributes all the events in the DST into the selected
+    bins. Given a certain binning, it computes which events fall into each
+    square of the grid formed by the bins arrays. binned_statistic_2d returns
+    counts (matrix here each matrix element is the number of events falling
+    into that specific bin) and bin_indexes (a 2-D array labeling each event
+    with the bin it belongs). Then counts is flattened into 1-D, bin_indexes
+    is transformed into 1-D using the number of bins on each axis.
+
+    Parameters
+    ----------
+    dst    : pd.DataFrame
+         Krypton dataframe.
+    bins_x : np.array
+         Binning in x axis
+    bins_y : np.array
+         Binning in y axis
+
+    Returns
+    -------
+    counts     : np.array
+         Total number of events falling into each bin
+    bin_labels : np.array
+         1D bin label for each event
+
+    Further info:
+    -------------
+    Why set expand_binnumbers to True (2D binning) if then we transform it to 1D?
+    Because even though expand_binnumbers = False returns 1-D labels, it also adds
+    two additional bins (per axis), taking into account out-of-range events which
+    dont fall into the the binning passed to the binned_statistic_2d function. But
+    since the dst is going to be previously selected and filtered with the desired
+    binning, it's not convenient to use that. Maybe a visual example is more useful:
+
+    2x2 binning (4 bins), natural index values shown both as 1D and 2D:
+
+    || 0 | 1 ||          || (0, 0) | (0, 1) ||
+    || 2 | 3 || (1D)  =  || (1, 0) | (1, 1) || (2D)
+
+    Using expand_binnumbers = False, the 1D index values instead of (0, ..., 3)
+    would be (0, ..., 15):
+
+    || 0 | 1 | 2 | 3 ||
+    || 4 | 5 | 6 | 7 ||  The bins that we "care about" (inner part) have indexes
+    || 8 | 9 |10 |11 ||  5, 6, 9, 10 which I believe is not convenient at all.
+    ||12 |13 |14 |15 ||  This creates (nx+2)*(ny+2) bins.
+    '''
+    n_xbins    = len(bins_x)-1
+    n_ybins    = len(bins_y)-1
+
+    counts, _, _, bin_indexes = stats.binned_statistic_2d(x=dst.X, y=dst.Y, values=None,
+                                                          bins=[bins_x, bins_y], statistic='count',
+                                                          expand_binnumbers=True)
+
+    counts       = counts.flatten()
+    bin_indexes -= 1
+    bin_labels   = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
+                                        mode='clip', order = 'F')
+
+    return counts, bin_labels
-    counts       = counts.flatten()
-    bin_indexes -= 1
-    bin_labels   = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
-                                        mode='clip', order = 'F')
-
-    return counts, bin_labels
+    # indexes 0 and len+1 represent underflow and overflow, thus we subtract 1
+    bin_labels   = np.ravel_multi_index(bin_indexes - 1, dims=(n_xbins, n_ybins),
+                                        mode='clip', order = 'F')
+
+    return counts.flatten(), bin_labels
-    counts       = counts.flatten()
-    bin_indexes -= 1
-    bin_labels   = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
-                                        mode='clip', order = 'F')
-
-    return counts, bin_labels
+    counts       = counts.flatten()
+    bin_indexes -= 1
+    valid_mask = ( in_range(bin_indexes[0], 0, n_xbins, right_closed=True) &
+                   in_range(bin_indexes[1], 0, n_ybins, right_closed=True) )
+
+    bin_labels             = np.full(bin_indexes.shape[1], fill_value=np.nan)
+    bin_labels[valid_mask] = np.ravel_multi_index((bin_indexes[0, valid_mask],
+                                                   bin_indexes[1, valid_mask]),
+                                                   dims=(n_xbins, n_ybins),
+                                                   order='F')
-    counts       = counts.flatten()
-    bin_indexes -= 1
-    bin_labels   = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
-                                        mode='clip', order = 'F')
-
-    return counts, bin_labels
+    # indexes 0 and len+1 represent underflow and overflow, thus we subtract 1
+    bin_labels   = np.ravel_multi_index(bin_indexes - 1, dims=(n_xbins, n_ybins),
+                                        mode='clip', order = 'F')
+
+    return counts.flatten(), bin_labels
-    counts       = counts.flatten()
-    bin_indexes -= 1
-    bin_labels   = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
-                                        mode='clip', order = 'F')
-
-    return counts, bin_labels
+    counts       = counts.flatten()
+    bin_indexes -= 1
+    valid_mask = ( in_range(bin_indexes[0], 0, n_xbins, right_closed=True) &
+                   in_range(bin_indexes[1], 0, n_ybins, right_closed=True) )
+
+    bin_labels             = np.full(bin_indexes.shape[1], fill_value=np.nan)
+    bin_labels[valid_mask] = np.ravel_multi_index((bin_indexes[0, valid_mask],
+                                                   bin_indexes[1, valid_mask]),
+                                                   dims=(n_xbins, n_ybins),
+                                                   order='F')
+
+
+def calculate_residuals(x          : np.array,
+                        y          : np.array,
+                        fit_output : FitFunction):
+    '''
+    Calculate residuals and their standard deviation for the fitted data.
+
+    Parameters
+    ----------
+    x : pd.array
+        Independent variable
+    y : pd.array
+        Dependent variable
+    fit_output : FitFunction
+        Container for IC's fit function result
+
+    Returns
+    -------
+    res : np.array
+        Residuals.
+    std : float
+        Standard deviation of residuals.
+    '''
+
+    res = y - fit_output.fn(x); std = res.std()
-    res = y - fit_output.fn(x); std = res.std()
+    res = y - fit_output.fn(x)
+    std = res.std()
-    res = y - fit_output.fn(x); std = res.std()
+    res = y - fit_output.fn(x)
+    std = res.std()
+
+    return res, std
+
+
+def calculate_pval(residuals : np.array):
+    '''
+    Calculate the p-value for the Shapiro-Wilk normality test of residuals.
+
+    Parameters
+    ----------
+    residuals : np.array
+        Residuals from the fitted model.
+
+    Returns
+    -------
+    pval : float
+        p-value for the Shapiro-Wilk normality test.
+    '''
+
+    pval = stats.shapiro(residuals)[1] if (len(residuals) > 3) else 0.
+
+    return pval
+
+
+def valid_bin_counter(map_df             : pd.DataFrame,
+                      validity_parameter : Optional[float] = 0.9):
+    '''
+    Count the number of valid bins in the map DataFrame and issue a warning
+    if the validity threshold is not met.
+
+    Parameters
+    ----------
+    map_df : pd.DataFrame
+        DataFrame containing map data.
+    validity_parameter : float
+        Threshold for the ratio of valid bins (default set to 0.9).
+
+    Returns
+    -------
+    valid_per : float
+        Percentage of valid bins.
+    '''
+
+    inside = np.count_nonzero(map_df.in_active)
+    valid  = np.count_nonzero(map_df.in_active & map_df.valid)
+
+    valid_per = valid / inside if inside else 0
+
+    if valid_per <= validity_parameter:
+        warnings.warn(f"{inside-valid} inner bins are not valid. {100*valid_per:.1f}% are successful.", UserWarning)
+
+    return valid_per
+
+
+def fit_and_fill_map(map_bin : pd.DataFrame,
+                     dst     : pd.DataFrame,
+                     fittype : KrFitFunction):
+    '''
+    This function is the core of the map computation. It's the one in charge of looping
+    over all the bins, performing the calculations of the lifetime fits and filling the
+    krypton map dataframe with all the obtained parameters.
+
+    Basically, it's applied to the dataframe, and bin by bin, it checks if the conditions
+    for the map computation are met. If it's the case, it will call the corresponding fit
+    function specified in the fittype parameter, calculate all the different map values
+    and fill the dataframe with them.
+
+    Parameters
+    ----------
+    map_bin : pd.DataFrame
+         KrMap dataframe
+    dst     : pd.DataFrame
+         kdst dataframe.
+    fittype : str
+         Type of fit to perform.
+
+    Returns
+    -------
+    kr_map : pd.DataFrame
+         DataFrame containing map parameters.
+    '''
+    if not map_bin.in_active or not map_bin.has_min_counts: return map_bin
+
+    dst_bin  = dst.loc[dst.bin_index == map_bin.bin]
+
+    fit_func, seed = get_function_and_seed_lt(fittype = fittype)
+    x, y           = select_fit_variables    (fittype = fittype,
+                                              dst     = dst_bin)
+
+    fit_output = fit(func        = fit_func,
+                     x           = x,
+                     y           = y,
+                     seed        = seed(x, y))
+
+    par, err, cov = transform_parameters(fit_output = fit_output)
+
+    res  = y - fit_output.fn(x)
+    std  = res.std()
+    chi2 = get_chi2_and_pvalue(y, fit_output.fn(x), len(x)-len(par), std)[0]
+    pval = stats.shapiro(res)[1] if (len(res) > 3) else 0.
+
+    map_bin['e0']          = par[0]
+    map_bin['ue0']         = err[0]
+    map_bin['lt']          = par[1]
+    map_bin['ult']         = err[1]
+    map_bin['cov']         = cov
+    map_bin['res_std']     = std
+    map_bin['chi2']        = chi2
+    map_bin['pval']        = pval
+    map_bin['fit_success'] = fit_output.ier in range(1, 5)
+    map_bin['valid']       = map_bin['fit_success'] and map_bin['has_min_counts'] and map_bin['in_active']
+
+    return map_bin
+
+
+def regularize_map(maps    : pd.DataFrame,
+                   x2range : Tuple[float, float] = None):
+    '''
+    Given a certain chi2 range, this function checks which bins are outside that
+    range and substitutes the (probably wrong) values of the map (e0, lt, etc) for
+    the mean value of the whole map.
+
+    Parameters
+    ----------
+    maps: pd.DataFrame
+        Map to check the outliers
+    x2range : Tuple[float, float]
+        Range for chi2
+
+    Returns
+    ---------
+    new_map: pd.DataFrame
+        Regularized map.'''
+    new_map   = maps.copy()
+    outliers  = new_map.in_active & ~new_map.valid
+
+    if isinstance(x2range, Tuple):
+          outliers &= ~in_range(new_map.chi2, *x2range)
+    else: outliers = outliers
+
+    new_map['e0'] [outliers] = np.nanmean(maps['e0'])
+    new_map['lt'] [outliers] = np.nanmean(maps['lt'])
+    new_map['ue0'][outliers] = np.nanmean(maps['ue0'])
+    new_map['ult'][outliers] = np.nanmean(maps['ult'])
+
+    return new_map