diff --git a/doc/src/parts/user_guide.rst b/doc/src/parts/user_guide.rst index 137c2dbe..5517cb67 100644 --- a/doc/src/parts/user_guide.rst +++ b/doc/src/parts/user_guide.rst @@ -389,6 +389,29 @@ A typical dictionary returned by the :py:meth:`~.FitBase.do_fit` method looks li called with the corresponding keyword :code:`fit.do_fit(asymmetric_parameter_errors=True)`. Otherwise they will be :py:obj:`None`. +Histogram Fits +--------------- + +In physics experiments data is frequently histogrammed in order to reduce the data to a manageable number of bins. +*kafe2* provides a dedicated fitting class for histogram fits, intended to be used when datapoints are obtained +from a random distribution. Especially when large numbers of datapoints are present it is more efficient to treat +the data as a histogram. To perform a histogram fit using raw data, the datapoints have to be filled into a :py:obj:`HistContainer`. +Then the procedure is similar to the above. By default, the :py:obj:`HistFit` class will use a normal distribution as model +function and a poisson likelihood as cost function. Both can be changed using the `model_function` and +`cost_function` keywords. + +.. code-block:: python + + from kafe2 import Fit, HistContainer + + histogram = HistContainer(n_bins=10, bin_range=(-5, 5), + fill_data=[-7.5, 1.23, 5.74, 1.9, -0.2, 3.1, -2.75, ...]) + hist_fit = Fit(histogram) + hist_fit.do_fit() + +By default it is assumed that the model function for a `HistFit` object is a probability density function normalized to 1. +As a consequence the bin contents are also being normalized to 1. +To disable this behavior, set `density=False` in the `HistFit` constructor. .. _plotting: diff --git a/examples/009_histogram_fit/04_pitfalls.py b/examples/009_histogram_fit/04_pitfalls.py new file mode 100644 index 00000000..4df7df85 --- /dev/null +++ b/examples/009_histogram_fit/04_pitfalls.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python + +""" +kafe2 example: Histogram Fit (Pitfalls) +======================================= + +This example demonstrates a scenario in which it is more convenient to use the +HistFit class rather than the XYFit class. + +While it is in principle possible to perform such fits correctly using XYFit, +more care must be taken to avoid unusable or biased fit results. +This example shows common problems that can occur when using XYFit and how to fix them. +HistFit handles these problems automatically. + +We will try to build the equivalent of the kafe2 HistFit by hand using an XYFit. +In general histogram Fits are useful when handling large amounts of data. +The computational complexity of e.g. the chi2 cost function scales with the number of inputs cubed +(when considering correlations). So reducing the raw data to a comparatively +small number of bins is crucial to make the fit computationally feasible. + +The default XYFit has some problems that have to be addressed, when histogrammed data is +processed. The XYFit does not have any built-in functionality to transform raw data into +histogrammed data. This means if we want to use histogrammed data, this has to be done manually. +In the following, 30 datapoints, sampled from a normal distribution, are used. +""" +import numpy as np +import matplotlib.pyplot as plt +from kafe2 import XYContainer, Fit, HistContainer, Plot + + +def normal_distribution(x, mu=0, sigma=1): + return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) + + +# Fill the datapoints into bins (generated uniformly with mu = 0, sigma = 1) +data = np.array( + [ + 0.15452608, + 0.15814163, + 1.84110142, + -0.27350108, + -1.89289518, + 0.21379812, + -0.27967822, + 0.21451591, + -0.07221073, + -0.51282405, + 0.39306818, + 1.12542323, + -0.71070046, + 1.86784967, + -0.17510427, + 1.72786353, + 0.95243581, + -0.71476994, + -0.0816407, + -0.38590102, + 0.92506023, + 0.4333063, + 0.86662424, + -0.78075927, + 1.36575012, + -0.49828474, + 0.14649399, + 1.40194082, + -1.50842127, + 1.21646781, + ] +) + +bin_edges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) +bin_counts, bin_edges = np.histogram(data, bins=bin_edges, density=True) + +# Now naively perform a default XYFit +x_data = bincenters = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0) # use bincenters as x values +y_data = bin_counts # use normalized histogram as y data +y_error = np.sqrt(bin_counts) / (np.sum(bin_counts) * np.diff(bin_edges)) # assume Poisson errors on our bin_counts + +xy_data = XYContainer(x_data=x_data, y_data=y_data) +xy_data.add_error(err_val=y_error, axis="y") + +XYFit_1 = Fit(xy_data, normal_distribution) +XYFit_1.do_fit() +# create a plot +Plot1 = Plot(XYFit_1) +Plot1.plot() +plt.show() +""" +When performing the fit, errors like "The cost function was evaluated as infinite" appear in the output. +Furthermore, when looking at the plot of our fit result, it is clear that this fit didn't return the result we expected. +The starting values for the fit are returned as best fit value, and the uncertainties are reported as NaN. +What happened? The problem arises because Poisson uncertainties were assumed for the bin counts. +If a bin is empty, the uncertainty is treated as zero by the fit. Thus the model function is forced to pass this datapoint +exactly or the cost function will be infinite. +This occures because the XYFit uses a χ²-cost function by default, which is only valid +for Gaussian uncertainties, but not in the case of Poisson uncertainties. To fix this, the +cost function of the fit is changed to a Poisson negative log-likelhoood (NLL). +""" + + +# rescale normal distribution by bin counts and bin width +def normal_distribution_scaled(x, mu=0, sigma=1): + return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) * np.sum(bin_counts) * 0.5 + + +bin_counts, bin_edges = np.histogram(data, bins=bin_edges) + +x_data = bincenters = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0) # use bincenters as x values +y_data = bin_counts # use bin_counts as y data (not normalized now) + +xy_data = XYContainer(x_data=x_data, y_data=y_data) + +XYFit_2 = Fit(xy_data, normal_distribution_scaled, cost_function="poisson") +XYFit_2.do_fit() +# create a plot +Plot2 = Plot(XYFit_2) +Plot2.plot() +plt.show() + +""" +Now it wasn't even necessary to explicitly specify the y-errors. By using a Poisson NLL, +the y-errors are no longer calculated from the measured bin counts, but instead from the model expectation. +This handles empty bins correctly and also prevents getting biased uncertainties. + +Another subtlety is the definition of the y_data: So far, simply the midpoint of each bin was used. This is only +a linear approximation of the behaviour of the model function between the bin edges. The HistFit class of kafe2 +on the other hand uses "Simpson's rule", a method to approximate the behaviour quadratically for more accuracy. +The most accurate albeit computationally expensive method would be to integrate the model function over each bin. + +The implementation of Simpson's rule in our procedure using a XYFit will not be done here, +since the influence is rather small in this case. Instead, it is shown how it is much easier to just use the HistFit +class of kafe2. The binning is automatically done by the HistContainer, the Poisson NLL is the default cost function +and the Simpson rule is already implemented. +""" +# We will use our initial data and binning +hist_data = HistContainer(bin_edges=bin_edges, fill_data=data) + + +# This is everything we have to prepare befor performing the fit +Histfit = Fit(hist_data, model_function=normal_distribution, density=True) +Histfit.do_fit() + +Plot3 = Plot(Histfit) +Plot3.plot() +plt.show() + +""" +Now consider the case, where systematical errors next to the statistical ones are present. Suppose each bin has an additional +Gaussian uncertainty of 1. Now there are two different types of uncertainties in the Fit, that can't be simply added. +However, due to the central limit theorem, for sufficiently large event counts the Poisson distribution approaches +a normal distrbution. Therefore, by switching the cost function from a Poisson NLL to the Gaussian approximation, +the fit can be performed correctly again. Note that this gaussian approximation is different to the one that was falsly used +in the beginning of the example. Here statistical uncertainties are calculated from the model function, while above, +they were calculated from the measured data. +""" +# We will use our initial data and binning +hist_data = HistContainer(bin_edges=bin_edges, fill_data=data) +hist_data.add_error(err_val=1) + +# This is everything we have to prepare befor performing the fit +Histfit = Fit(hist_data, model_function=normal_distribution, density=True, cost_function="gauss-approximation") +Histfit.do_fit() + +Plot4 = Plot(Histfit) +Plot4.plot() +plt.show() diff --git a/kafe2/fit/_base/cost.py b/kafe2/fit/_base/cost.py index 483e59cb..040d7620 100644 --- a/kafe2/fit/_base/cost.py +++ b/kafe2/fit/_base/cost.py @@ -354,7 +354,7 @@ def _chi2(self, data, model, cov_mat_qr=None, cov_mat_cholesky=None, err=None): raise ValueError("'err' must not contain any zero values!") if self.needs_errors: # There are other warnings that notify the user about singular cov mat, etc. - warnings.warn("Setting all data errors to 1 as a fallback.") + warnings.warn("'err' contains zero values. Setting all data errors to 1 as a fallback.") else: _res = _res / err diff --git a/kafe2/fit/xy/cost.py b/kafe2/fit/xy/cost.py index 1583eced..80981555 100644 --- a/kafe2/fit/xy/cost.py +++ b/kafe2/fit/xy/cost.py @@ -1,3 +1,5 @@ +import numpy as np + from .._base import ( CostFunction_Chi2, CostFunction_GaussApproximation, @@ -67,6 +69,11 @@ def __init__(self, data_point_distribution="poisson", ratio=False, axes_to_use=" raise ValueError("Unknown value '%s' for 'axes_to_use': must be one of ('xy', 'y')") super(XYCostFunction_NegLogLikelihood, self).__init__(data_point_distribution=data_point_distribution, ratio=ratio) + def is_data_compatible(self, data): + if self._cost_function_handle in [self.nll_poisson, self.nllr_poisson] and (np.count_nonzero(data[1] % 1) > 0 or np.any(data[1] < 0)): + return False, "poisson distribution can only have non-negative integers as y data." + return True, None + class XYCostFunction_GaussApproximation(CostFunction_GaussApproximation): def __init__(