Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 333 additions & 2 deletions invisible_cities/reco/krmap_functions.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
import numpy as np
import pandas as pd
import warnings
import itertools

import numpy as np
import pandas as pd
import scipy.stats as stats


from typing import Tuple, Optional
from .. types.symbols import KrFitFunction
from .. evm.ic_containers import FitFunction
from .. core.fit_functions import polynom, expo
from ..core.core_functions import in_range, shift_to_bin_centers
from ..core.fit_functions import fit, get_chi2_and_pvalue


def lin_seed(x : np.array, y : np.array):
Expand Down Expand Up @@ -149,3 +157,326 @@ def transform_parameters(fit_output : FitFunction):

return par, err, cov


def create_df_kr_map(bins_x : np.array,
bins_y : np.array,
counts : np.array,
n_min : int,
r_max : float)->pd.DataFrame:
'''
This function creates the dataframe in which the map parameters are stored.

Parameters
----------
bins_x : np.array
Bins in x axis
bins_y : np.array
Bins in y axis
counts : np.array
Number of events falling into each bin
n_min : int
Min number of events per bin required to perform fits
r_max : float
Radius defining the active area of the detector

Returns
-------
kr_map : pd.DataFrame
Kr map dataframe with all the info prior to the fits: bin label,
events per bin, bin in/outside the active volume, bin position
(X, Y, R), etc.
'''
n_xbins = len(bins_x)-1
n_ybins = len(bins_y)-1
b_center = [shift_to_bin_centers(axis) for axis in [bins_x, bins_y]]

bin_index = range(n_xbins*n_ybins)
geom_comb = np.array(list(itertools.product(*b_center)))
r_values = np.sum(geom_comb**2, axis=1)**0.5

kr_map = pd.DataFrame(dict(bin = bin_index,
counts = counts,
X = geom_comb.T[0],
Y = geom_comb.T[1],
R = r_values,
in_active = r_values <= r_max,
has_min_counts = counts >= n_min,
fit_success = False,
valid = False,
e0 = np.nan,
ue0 = np.nan,
lt = np.nan,
ult = np.nan,
cov = np.nan,
res_std = np.nan,
chi2 = np.nan,
pval = np.nan))

return kr_map


def get_number_of_bins(nevents : Optional[int] = None,
thr : Optional[int] = 1e6,
n_bins : Optional[np.array] = None)->np.array:
'''
Computes the number of XY bins to be used in the creation
of correction map regarding the number of selected events.

Parameters
---------
nevents: int (optional)
Total number of provided events for the map computation.
thr: int (optional)
Event threshold to use 50x50 or 100x100 binning.
n_bins: int (optional)
The number of bins to use can be chosen a priori. If given,
the returned number of bins is the one provided by the user.
However, if no number of bins is given in advance, this will
automatically select a value depending on the amount of events
contained in the dst and the threshold.

Returns
-------
n_bins: int
Number of bins in each direction (X,Y) (square map).
'''
if n_bins is not None: return n_bins;
elif nevents < thr: return np.array([50, 50]);
else: return np.array([100, 100]);


def get_bin_counts_and_event_bin_id(dst : pd.DataFrame,
bins_x : np.array,
bins_y : np.array):
'''
This function distributes all the events in the DST into the selected
bins. Given a certain binning, it computes which events fall into each
square of the grid formed by the bins arrays. binned_statistic_2d returns
counts (matrix here each matrix element is the number of events falling
into that specific bin) and bin_indexes (a 2-D array labeling each event
with the bin it belongs). Then counts is flattened into 1-D, bin_indexes
is transformed into 1-D using the number of bins on each axis.

Parameters
----------
dst : pd.DataFrame
Krypton dataframe.
bins_x : np.array
Binning in x axis
bins_y : np.array
Binning in y axis

Returns
-------
counts : np.array
Total number of events falling into each bin
bin_labels : np.array
1D bin label for each event

Further info:
-------------
Why set expand_binnumbers to True (2D binning) if then we transform it to 1D?
Because even though expand_binnumbers = False returns 1-D labels, it also adds
two additional bins (per axis), taking into account out-of-range events which
dont fall into the the binning passed to the binned_statistic_2d function. But
since the dst is going to be previously selected and filtered with the desired
binning, it's not convenient to use that. Maybe a visual example is more useful:

2x2 binning (4 bins), natural index values shown both as 1D and 2D:

|| 0 | 1 || || (0, 0) | (0, 1) ||
|| 2 | 3 || (1D) = || (1, 0) | (1, 1) || (2D)

Using expand_binnumbers = False, the 1D index values instead of (0, ..., 3)
would be (0, ..., 15):

|| 0 | 1 | 2 | 3 ||
|| 4 | 5 | 6 | 7 || The bins that we "care about" (inner part) have indexes
|| 8 | 9 |10 |11 || 5, 6, 9, 10 which I believe is not convenient at all.
||12 |13 |14 |15 || This creates (nx+2)*(ny+2) bins.
'''
n_xbins = len(bins_x)-1
n_ybins = len(bins_y)-1

counts, _, _, bin_indexes = stats.binned_statistic_2d(x=dst.X, y=dst.Y, values=None,
bins=[bins_x, bins_y], statistic='count',
expand_binnumbers=True)

counts = counts.flatten()
bin_indexes -= 1
bin_labels = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
mode='clip', order = 'F')

return counts, bin_labels
Comment on lines +305 to +310
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
counts = counts.flatten()
bin_indexes -= 1
bin_labels = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
mode='clip', order = 'F')
return counts, bin_labels
# indexes 0 and len+1 represent underflow and overflow, thus we subtract 1
bin_labels = np.ravel_multi_index(bin_indexes - 1, dims=(n_xbins, n_ybins),
mode='clip', order = 'F')
return counts.flatten(), bin_labels

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if I understand correctly mode="clip" means that bins out of range will be assigned the closest bin. Is this what we used to do @bpalmeiro ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In our case, we had three values for "fiducial":

  • The R max for the selection (just a quality one to ensure everything was inside)

  • The R max for the map, that defines the maximum extension of it. In this case, if the event didn't happen to fall inside the bins, it was disregarded.

  • Also (not sure here it's the most appropriate place to mention, tho), we had a posterior step where the peripheral bins (those further than a fiducial r -but within said Rmax for map production-) were set to nan.

The latter was used to ensure that, in the posterior analysis, all the hits reconstructed outside the volume were automatically flagged as nan.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That said, the best approach would be to disregard events outside boundaries instead of fakely merging them in the border bins, even if we rely on a previous selection. It's not a very strong opinion, but I think it adds redundancy (and, therefore, a bit of robustness) to the process.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, my bad... I assumed that there would not be any events outside the boundaries here since I was expecting a "clean" dst in this part of the code, so when I defined bin_labels like that I didn't think of the consequences in case some event was actually out of the range.

I am thinking in a solution like this one:

Suggested change
counts = counts.flatten()
bin_indexes -= 1
bin_labels = np.ravel_multi_index(bin_indexes, dims=(n_xbins, n_ybins),
mode='clip', order = 'F')
return counts, bin_labels
counts = counts.flatten()
bin_indexes -= 1
valid_mask = ( in_range(bin_indexes[0], 0, n_xbins, right_closed=True) &
in_range(bin_indexes[1], 0, n_ybins, right_closed=True) )
bin_labels = np.full(bin_indexes.shape[1], fill_value=np.nan)
bin_labels[valid_mask] = np.ravel_multi_index((bin_indexes[0, valid_mask],
bin_indexes[1, valid_mask]),
dims=(n_xbins, n_ybins),
order='F')

In case some events are outside the desired range, their label would be a NaN instead of a numerical index, so when grouping the events bin by bin, none would be assigned to those events...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok



def calculate_residuals(x : np.array,
y : np.array,
fit_output : FitFunction):
'''
Calculate residuals and their standard deviation for the fitted data.

Parameters
----------
x : pd.array
Independent variable
y : pd.array
Dependent variable
fit_output : FitFunction
Container for IC's fit function result

Returns
-------
res : np.array
Residuals.
std : float
Standard deviation of residuals.
'''

res = y - fit_output.fn(x); std = res.std()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
res = y - fit_output.fn(x); std = res.std()
res = y - fit_output.fn(x)
std = res.std()

I think I told you to make it a single line, but I didn't realize you needed the two variables. In any case, this function is so trivial, that if it is only used in fit_and_fill_map, you can remove it.


return res, std


def calculate_pval(residuals : np.array):
'''
Calculate the p-value for the Shapiro-Wilk normality test of residuals.

Parameters
----------
residuals : np.array
Residuals from the fitted model.

Returns
-------
pval : float
p-value for the Shapiro-Wilk normality test.
'''

pval = stats.shapiro(residuals)[1] if (len(residuals) > 3) else 0.

return pval
Comment on lines +341 to +358
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can also remove this function.



def valid_bin_counter(map_df : pd.DataFrame,
validity_parameter : Optional[float] = 0.9):
Comment on lines +361 to +362
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this per one or percent? The docsstring suggests per one, the usage suggests percent. I suggest renaming it to make it more clear (validity_ratio or validity_percent)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, need tests for this function

'''
Count the number of valid bins in the map DataFrame and issue a warning
if the validity threshold is not met.

Parameters
----------
map_df : pd.DataFrame
DataFrame containing map data.
validity_parameter : float
Threshold for the ratio of valid bins (default set to 0.9).

Returns
-------
valid_per : float
Percentage of valid bins.
'''

inside = np.count_nonzero(map_df.in_active)
valid = np.count_nonzero(map_df.in_active & map_df.valid)

valid_per = valid / inside if inside else 0

if valid_per <= validity_parameter:
warnings.warn(f"{inside-valid} inner bins are not valid. {100*valid_per:.1f}% are successful.", UserWarning)

return valid_per


def fit_and_fill_map(map_bin : pd.DataFrame,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand correctly, this is a pd.Series, not a pd.DataFrame

dst : pd.DataFrame,
fittype : KrFitFunction):
Comment on lines +391 to +393
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is not tested

'''
This function is the core of the map computation. It's the one in charge of looping
over all the bins, performing the calculations of the lifetime fits and filling the
krypton map dataframe with all the obtained parameters.

Basically, it's applied to the dataframe, and bin by bin, it checks if the conditions
for the map computation are met. If it's the case, it will call the corresponding fit
function specified in the fittype parameter, calculate all the different map values
and fill the dataframe with them.

Parameters
----------
map_bin : pd.DataFrame
KrMap dataframe
dst : pd.DataFrame
kdst dataframe.
fittype : str
Type of fit to perform.

Returns
-------
kr_map : pd.DataFrame
DataFrame containing map parameters.
'''
if not map_bin.in_active or not map_bin.has_min_counts: return map_bin

dst_bin = dst.loc[dst.bin_index == map_bin.bin]

fit_func, seed = get_function_and_seed_lt(fittype = fittype)
x, y = select_fit_variables (fittype = fittype,
dst = dst_bin)

fit_output = fit(func = fit_func,
x = x,
y = y,
seed = seed(x, y))

par, err, cov = transform_parameters(fit_output = fit_output)

res = y - fit_output.fn(x)
std = res.std()
chi2 = get_chi2_and_pvalue(y, fit_output.fn(x), len(x)-len(par), std)[0]
pval = stats.shapiro(res)[1] if (len(res) > 3) else 0.

map_bin['e0'] = par[0]
map_bin['ue0'] = err[0]
map_bin['lt'] = par[1]
map_bin['ult'] = err[1]
map_bin['cov'] = cov
map_bin['res_std'] = std
map_bin['chi2'] = chi2
map_bin['pval'] = pval
map_bin['fit_success'] = fit_output.ier in range(1, 5)
map_bin['valid'] = map_bin['fit_success'] and map_bin['has_min_counts'] and map_bin['in_active']

return map_bin


def regularize_map(maps : pd.DataFrame,
x2range : Tuple[float, float] = None):
'''
Given a certain chi2 range, this function checks which bins are outside that
range and substitutes the (probably wrong) values of the map (e0, lt, etc) for
the mean value of the whole map.

Parameters
----------
maps: pd.DataFrame
Map to check the outliers
x2range : Tuple[float, float]
Range for chi2

Returns
---------
new_map: pd.DataFrame
Regularized map.'''
new_map = maps.copy()
outliers = new_map.in_active & ~new_map.valid

if isinstance(x2range, Tuple):
outliers &= ~in_range(new_map.chi2, *x2range)
Comment on lines +471 to +474
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the logic is wrong here. The outliers should be those that are in the active and (not valid OR not in range). Truth table:

in_active  valid   x2_in_range  outlier  expected
--------------------------------------------------
True       True    True         False    False   <--- satisfied
True       True    False        False    True    <--- not satisfied
True       False   True         False    True    <--- not satisfied
True       False   False        True     True    <--- satisfied
False      x       x            False    False   <--- satisfied

else: outliers = outliers

new_map['e0'] [outliers] = np.nanmean(maps['e0'])
new_map['lt'] [outliers] = np.nanmean(maps['lt'])
new_map['ue0'][outliers] = np.nanmean(maps['ue0'])
new_map['ult'][outliers] = np.nanmean(maps['ult'])

return new_map
Loading