From 29d82ea60f59786ef7154eea0f5d9465bcb90f36 Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Thu, 18 Sep 2025 11:32:11 +0200 Subject: [PATCH 01/10] added documentation for histogram fits, added an example showing why to use histogram fits --- doc/src/parts/user_guide.rst | 22 +++++ examples/009_histogram_fit/04_pitfalls.py | 115 ++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 examples/009_histogram_fit/04_pitfalls.py diff --git a/doc/src/parts/user_guide.rst b/doc/src/parts/user_guide.rst index 137c2dbe..63fae57f 100644 --- a/doc/src/parts/user_guide.rst +++ b/doc/src/parts/user_guide.rst @@ -389,6 +389,28 @@ A typical dictionary returned by the :py:meth:`~.FitBase.do_fit` method looks li called with the corresponding keyword :code:`fit.do_fit(asymmetric_parameter_errors=True)`. Otherwise they will be :py:obj:`None`. +Histogram Fits +--------------- + +A very common fit type is the histogram fit. *kafe2* provides a dedicated fitting class for histogram fits, +intendet to be used when datapoints are obtained from a random distribution. Especially when large +numbers of datapoints are present it is more efficient to treat the data as a histogram. To perform +a histogram fit, the datapoints have to be filled into a :py:obj:`HistContainer`. Then the procedure +is similar to the above. By default, the :py:obj:`HistFit` class will use a normal distribution as model +function and a poisson likelihood as cost function. Both can be changed using the `model_function` and +`cost_function` keywords. + +.. code-block:: python + from kafe2 import Fit, HistContainer + + histogram = HistContainer(n_bins=10, bin_range=(-5, 5), + fill_data=[-7.5, 1.23, 5.74, 1.9, -0.2, 3.1, -2.75, ...]) + hist_fit = Fit(histogram) + hist_fit.do_fit() + +Depending on whether the modelfunction is already normalised, or has a normalisation constant, +that is also supposed to be estimated in the fit the `density` keyword can be used, during creation +of the `HistFit` object. .. _plotting: diff --git a/examples/009_histogram_fit/04_pitfalls.py b/examples/009_histogram_fit/04_pitfalls.py new file mode 100644 index 00000000..767f9e55 --- /dev/null +++ b/examples/009_histogram_fit/04_pitfalls.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +""" +kafe2 example: Histogram Fit (Pitfalls) +======================================= + +This example demonstrates why it is more convenient to use the HistFit class +instead of XYFit when dealing with histogrammed data. + +While it is in principle possible to perform such fits correctly using XYFit, +it requires much more care. This example shows common mistakes that can occur +when that necessary care is not taken and how this makes the fit results worse. + +We will especially look at two scenarios that typically arise from having only small amounts of data. +This is, for example, a problem in the search for new rare processes in high-energy physics. +Assume we are looking for a Gaussian signal peak, free from background for simplicity +(the treatment of a signal over background is explained in example 03_SpluBfit.py). + +The first problem arises if zero events are present in a bin. Since we assume Poisson uncertainties +on the data points, an empty bin will be assumed to have an uncertainty of zero when using a Gaussian approximation. +By default, XYFit uses a χ² cost function, which describes data with Gaussian uncertainties. +This means empty bins will be assumed to be known with perfect precision, and +the model function is forced to pass this point exactly. Of course, this only happens when no other (systematic) +uncertainties are given. It can be seen in the following example script how this makes an interpretable +result impossible. Since the HistFit class uses a Poisson likelihood by default, this first problem is avoided. +""" + +import numpy as np +import matplotlib.pyplot as plt +from kafe2 import XYContainer, Fit, HistContainer, Plot + +def normal_distribution(x, mu = 0, sigma = 1): + return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma ** 2) + +#Manually specify already binned data, to provoke empty bins (generated uniformly with mu = 0, sigma = 1) +bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) +binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) + +#Now naively perform a default XYFit +x_data = binmids = np.mean([binedges[:-1], binedges[1:]], axis=0) #use binmids as x values +y_data = bincounts/(np.sum(bincounts)*np.diff(binedges)) +print(y_data) #use normalized histogram as y data +x_error = 0.25 #use half binwidht as x_error +y_error = np.sqrt(y_data) + +xy_data = XYContainer(x_data=x_data, y_data=y_data) +xy_data.add_error(err_val=x_error, axis="x") +xy_data.add_error(err_val=y_error, axis="y") + +XYFit_1 = Fit(xy_data, normal_distribution) +XYFit_1.do_fit() +#create a plot +Plot1 = Plot(XYFit_1) +Plot1.plot() +plt.show() + + + +""" +This first fit should give you warnings about the cost function being evaluated as infinite, which comes from the empty bin. +Also, the result of the fit should not return any good values. This is the result of the empty bin. + +Another problem in the code above is the use of incorrect y-values for the fit. +Naively, the y-value of the model function at the position of the bin midpoints was used. +In fact, since probability densities and histogrammed data are being treated, the correct way +would be to integrate the model function over each bin to obtain the correct expected count of events in the bin. +It can be seen in the following that even if we combine bins in order to get rid of the empty bin, +the fit will still not yield perfect results. + +""" +bincounts = np.array([1, 3, 9, 7, 8, 2]) +binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) + +#Now again perform a default XYFit +x_data = binmids = np.mean([binedges[:-1], binedges[1:]], axis=0) #use binmids as x values +y_data = bincounts/(np.sum(bincounts)*np.diff(binedges)) #use normalized histogram as y data +x_error = np.diff(binedges)/2 #use half binwidht as x_error +y_error = np.sqrt(y_data) + +xy_data = XYContainer(x_data=x_data, y_data=y_data) +xy_data.add_error(err_val=x_error, axis="x") +xy_data.add_error(err_val=y_error, axis="y") + +XYFit_2 = Fit(xy_data, normal_distribution) +XYFit_2.do_fit() +#create a plot +Plot2 = Plot(XYFit_2) +Plot2.plot() +plt.show() + +''' +Now you should see that the fit will at least converge, but the results have large uncertainties. +This is, among other things, because we still don't use the integral over our bins as the y-value for our model in the fit. +Next, the HistContainer, which automatically triggers kafe2 to use the HistFit class for fitting, will be used. +This will yield the best results without any further effort, even without eliminating the empty bin beforehand. +''' + +#We will use our initial bincounts and binning +bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) +binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) + +hist_data = HistContainer(bin_edges=binedges) +hist_data.set_bins(bincounts) + +#This is everything we have to prepare befor performing the fit +Histfit = Fit(hist_data, model_function=normal_distribution) +Histfit.do_fit() + +Plot3 = Plot(Histfit) +Plot3.plot() +plt.show() + + + + From 3dcaf2c4fb1c2dea686eb3c3806fb50010cb5783 Mon Sep 17 00:00:00 2001 From: jonas-kleck Date: Thu, 18 Sep 2025 11:39:47 +0200 Subject: [PATCH 02/10] edited Histogram Fit section --- doc/src/parts/user_guide.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/src/parts/user_guide.rst b/doc/src/parts/user_guide.rst index 63fae57f..c66233e1 100644 --- a/doc/src/parts/user_guide.rst +++ b/doc/src/parts/user_guide.rst @@ -401,6 +401,7 @@ function and a poisson likelihood as cost function. Both can be changed using th `cost_function` keywords. .. code-block:: python + from kafe2 import Fit, HistContainer histogram = HistContainer(n_bins=10, bin_range=(-5, 5), From a548b9bbd39259eba538a7135efe11a979d6b169 Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Thu, 18 Sep 2025 20:07:59 +0200 Subject: [PATCH 03/10] Added prefit as solution to the incorrectly specified uncertainties to example --- examples/009_histogram_fit/04_pitfalls.py | 77 +++++++++++++++++++---- 1 file changed, 64 insertions(+), 13 deletions(-) diff --git a/examples/009_histogram_fit/04_pitfalls.py b/examples/009_histogram_fit/04_pitfalls.py index 767f9e55..3fc4e2e0 100644 --- a/examples/009_histogram_fit/04_pitfalls.py +++ b/examples/009_histogram_fit/04_pitfalls.py @@ -11,7 +11,7 @@ it requires much more care. This example shows common mistakes that can occur when that necessary care is not taken and how this makes the fit results worse. -We will especially look at two scenarios that typically arise from having only small amounts of data. +We will especially look at two scenarios that can arise from having only small amounts of data. This is, for example, a problem in the search for new rare processes in high-energy physics. Assume we are looking for a Gaussian signal peak, free from background for simplicity (the treatment of a signal over background is explained in example 03_SpluBfit.py). @@ -60,14 +60,11 @@ def normal_distribution(x, mu = 0, sigma = 1): This first fit should give you warnings about the cost function being evaluated as infinite, which comes from the empty bin. Also, the result of the fit should not return any good values. This is the result of the empty bin. -Another problem in the code above is the use of incorrect y-values for the fit. -Naively, the y-value of the model function at the position of the bin midpoints was used. -In fact, since probability densities and histogrammed data are being treated, the correct way -would be to integrate the model function over each bin to obtain the correct expected count of events in the bin. It can be seen in the following that even if we combine bins in order to get rid of the empty bin, -the fit will still not yield perfect results. - +the fit will converge and not throw errors, but still not yield perfect results. """ + + bincounts = np.array([1, 3, 9, 7, 8, 2]) binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) @@ -88,12 +85,66 @@ def normal_distribution(x, mu = 0, sigma = 1): Plot2.plot() plt.show() -''' -Now you should see that the fit will at least converge, but the results have large uncertainties. -This is, among other things, because we still don't use the integral over our bins as the y-value for our model in the fit. + + +""" +In the code above, the uncertainties on each bin were simply specified as the square root of the counts in each bin. +In reality, the measured bin counts are just one random experiment. It is the outcome of a random draw +from a Poisson distribution. The mean, and thus also the Poisson uncertainty of these bin counts, stems from the +expected number of events in that bin given by the model function. That means calculating the uncertainties like +it was done above introduces a bias. If there are more events than expected in a bin, the uncertainties are overestimated. +On the other hand, they are underestimated if the observed number of events in a bin is lower than expected. + +This problem can be reduced by doing a so-called prefit. The idea is to use the fit from above as a first approximation +to our true distribution. Then uncertainties are obtained from the expected number of events in each bin as predicted +by the prefit. +""" + +bincounts = np.array([1, 3, 9, 7, 8, 2]) +binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) + +#Now again perform a default XYFit +x_data = binmids = np.mean([binedges[:-1], binedges[1:]], axis=0) #use binmids as x values +y_data = bincounts/(np.sum(bincounts)*np.diff(binedges)) #use normalized histogram as y data +x_error = np.diff(binedges)/2 #use half binwidht as x_error + +prefit_result = XYFit_2.parameter_values +y_error_model = np.sqrt(normal_distribution(binmids, *prefit_result))#Now instead of the measured data, use the prefit to give us the expected bincounts + +#print both y_errors, from the prefit and from this fit for comparison +print("The y errors used in the prefit, relative to the datapoints are: " + str(y_error)) +print("The y errors used now, relative to the model with parameters from the prefit: " + str(y_error_model)) + + +xy_data = XYContainer(x_data=x_data, y_data=y_data) +xy_data.add_error(err_val=x_error, axis="x") +xy_data.add_error(err_val=y_error_model, axis="y") + +XYFit_3 = Fit(xy_data, normal_distribution) +XYFit_3.do_fit() +#create a plot +Plot3 = Plot(XYFit_3) +Plot3.plot() +plt.show() + +""" +The uncertainties on the result were already reduced slightly in comparison to the prefit. And by looking at the printed +y-errors for the prefit and the final fit, differences up to 20% can be observed. +Still the uncertainties are larger than they should be. This is also due to one final problem that +is discussed in this example script. + +One problem in all the code above is the use of incorrect y-values for the fit. +Naively, the y-value of the model function at the position of the bin midpoints was used. +In fact, since probability densities and histogrammed data are being treated, the correct way +would be to integrate the model function over each bin to obtain the correct expected count of events in the bin. +This will not be shown here. + +We can see that the result of our fit gets better with every improvement we do to our fitting procedure. +But a lot of effort has to be put in to get reasonable results. Next, the HistContainer, which automatically triggers kafe2 to use the HistFit class for fitting, will be used. This will yield the best results without any further effort, even without eliminating the empty bin beforehand. -''' +Furthermore, the Plot shows nicely the model function prediction for the histogram. +""" #We will use our initial bincounts and binning bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) @@ -106,8 +157,8 @@ def normal_distribution(x, mu = 0, sigma = 1): Histfit = Fit(hist_data, model_function=normal_distribution) Histfit.do_fit() -Plot3 = Plot(Histfit) -Plot3.plot() +Plot4 = Plot(Histfit) +Plot4.plot() plt.show() From 222df5523edb65f4092df8d730b8de28c7bb40cd Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Mon, 22 Sep 2025 19:36:45 +0200 Subject: [PATCH 04/10] reformatting and reformulation of histogram fit example and documentation --- doc/src/parts/user_guide.rst | 13 ++-- examples/009_histogram_fit/04_pitfalls.py | 79 +++++++++++------------ 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/doc/src/parts/user_guide.rst b/doc/src/parts/user_guide.rst index c66233e1..c8a814ab 100644 --- a/doc/src/parts/user_guide.rst +++ b/doc/src/parts/user_guide.rst @@ -392,9 +392,10 @@ A typical dictionary returned by the :py:meth:`~.FitBase.do_fit` method looks li Histogram Fits --------------- -A very common fit type is the histogram fit. *kafe2* provides a dedicated fitting class for histogram fits, -intendet to be used when datapoints are obtained from a random distribution. Especially when large -numbers of datapoints are present it is more efficient to treat the data as a histogram. To perform +A very common fit type is the histogram fit. In physics experiments data is frequently histogrammed +in order to reduce the data to a manageable number of bins. *kafe2* provides a dedicated fitting class +for histogram fits, intendet to be used when datapoints are obtained from a random distribution. Especially +when large numbers of datapoints are present it is more efficient to treat the data as a histogram. To perform a histogram fit, the datapoints have to be filled into a :py:obj:`HistContainer`. Then the procedure is similar to the above. By default, the :py:obj:`HistFit` class will use a normal distribution as model function and a poisson likelihood as cost function. Both can be changed using the `model_function` and @@ -409,9 +410,9 @@ function and a poisson likelihood as cost function. Both can be changed using th hist_fit = Fit(histogram) hist_fit.do_fit() -Depending on whether the modelfunction is already normalised, or has a normalisation constant, -that is also supposed to be estimated in the fit the `density` keyword can be used, during creation -of the `HistFit` object. +By default it is assumed that the model function for a `HistFit` object is a probability density function normalised to 1. +As a consequence the bin contents are also being normalized to 1. +To disable this behavior, set `density=False` in the `HistFit` constructor. .. _plotting: diff --git a/examples/009_histogram_fit/04_pitfalls.py b/examples/009_histogram_fit/04_pitfalls.py index 3fc4e2e0..280634b0 100644 --- a/examples/009_histogram_fit/04_pitfalls.py +++ b/examples/009_histogram_fit/04_pitfalls.py @@ -4,17 +4,18 @@ kafe2 example: Histogram Fit (Pitfalls) ======================================= -This example demonstrates why it is more convenient to use the HistFit class -instead of XYFit when dealing with histogrammed data. +This example demonstrates a scenario in which it is more convenient to use the +HistFit class rather than the XYFit class. While it is in principle possible to perform such fits correctly using XYFit, -it requires much more care. This example shows common mistakes that can occur -when that necessary care is not taken and how this makes the fit results worse. +more care must be taken to avoid unusable or biased results. +This example shows common problems that can occur when using XYFit and how to fix them. +HistFit handles these problems automatically. We will especially look at two scenarios that can arise from having only small amounts of data. This is, for example, a problem in the search for new rare processes in high-energy physics. Assume we are looking for a Gaussian signal peak, free from background for simplicity -(the treatment of a signal over background is explained in example 03_SpluBfit.py). +(the treatment of a signal over background is explained in example 03_SplusBfit.py). The first problem arises if zero events are present in a bin. Since we assume Poisson uncertainties on the data points, an empty bin will be assumed to have an uncertainty of zero when using a Gaussian approximation. @@ -29,18 +30,20 @@ import matplotlib.pyplot as plt from kafe2 import XYContainer, Fit, HistContainer, Plot -def normal_distribution(x, mu = 0, sigma = 1): - return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma ** 2) -#Manually specify already binned data, to provoke empty bins (generated uniformly with mu = 0, sigma = 1) +def normal_distribution(x, mu=0, sigma=1): + return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) + + +# Manually specify already binned data, to provoke empty bins (generated uniformly with mu = 0, sigma = 1) bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) -#Now naively perform a default XYFit -x_data = binmids = np.mean([binedges[:-1], binedges[1:]], axis=0) #use binmids as x values -y_data = bincounts/(np.sum(bincounts)*np.diff(binedges)) -print(y_data) #use normalized histogram as y data -x_error = 0.25 #use half binwidht as x_error +# Now naively perform a default XYFit +x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values +y_data = bincounts / (np.sum(bincounts) * np.diff(binedges)) +print(y_data) # use normalized histogram as y data +x_error = 0.25 # use half binwidth as x_error y_error = np.sqrt(y_data) xy_data = XYContainer(x_data=x_data, y_data=y_data) @@ -49,29 +52,28 @@ def normal_distribution(x, mu = 0, sigma = 1): XYFit_1 = Fit(xy_data, normal_distribution) XYFit_1.do_fit() -#create a plot +# create a plot Plot1 = Plot(XYFit_1) Plot1.plot() plt.show() - """ -This first fit should give you warnings about the cost function being evaluated as infinite, which comes from the empty bin. -Also, the result of the fit should not return any good values. This is the result of the empty bin. +This first fit raises warnings about the cost function being evaluated as infinite, which originates from the empty bin. +Also, the result of the fit does not return any good values. This is the result of the empty bin. -It can be seen in the following that even if we combine bins in order to get rid of the empty bin, +It can be seen in the following that even if the bins are combined in order to get rid of the empty bin, the fit will converge and not throw errors, but still not yield perfect results. """ -bincounts = np.array([1, 3, 9, 7, 8, 2]) +bincounts = np.array([1, 3, 9, 7, 8, 2]) binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) -#Now again perform a default XYFit -x_data = binmids = np.mean([binedges[:-1], binedges[1:]], axis=0) #use binmids as x values -y_data = bincounts/(np.sum(bincounts)*np.diff(binedges)) #use normalized histogram as y data -x_error = np.diff(binedges)/2 #use half binwidht as x_error +# Now again perform a default XYFit +x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values +y_data = bincounts / (np.sum(bincounts) * np.diff(binedges)) # use normalized histogram as y data +x_error = np.diff(binedges) / 2 # use half binwidth as x_error y_error = np.sqrt(y_data) xy_data = XYContainer(x_data=x_data, y_data=y_data) @@ -80,13 +82,12 @@ def normal_distribution(x, mu = 0, sigma = 1): XYFit_2 = Fit(xy_data, normal_distribution) XYFit_2.do_fit() -#create a plot +# create a plot Plot2 = Plot(XYFit_2) Plot2.plot() plt.show() - """ In the code above, the uncertainties on each bin were simply specified as the square root of the counts in each bin. In reality, the measured bin counts are just one random experiment. It is the outcome of a random draw @@ -100,18 +101,20 @@ def normal_distribution(x, mu = 0, sigma = 1): by the prefit. """ -bincounts = np.array([1, 3, 9, 7, 8, 2]) +bincounts = np.array([1, 3, 9, 7, 8, 2]) binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) -#Now again perform a default XYFit -x_data = binmids = np.mean([binedges[:-1], binedges[1:]], axis=0) #use binmids as x values -y_data = bincounts/(np.sum(bincounts)*np.diff(binedges)) #use normalized histogram as y data -x_error = np.diff(binedges)/2 #use half binwidht as x_error +# Now again perform a default XYFit +x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values +y_data = bincounts / (np.sum(bincounts) * np.diff(binedges)) # use normalized histogram as y data +x_error = np.diff(binedges) / 2 # use half binwidth as x_error prefit_result = XYFit_2.parameter_values -y_error_model = np.sqrt(normal_distribution(binmids, *prefit_result))#Now instead of the measured data, use the prefit to give us the expected bincounts +y_error_model = np.sqrt( + normal_distribution(bincenters, *prefit_result) +) # Now instead of the measured data, use the prefit to give us the expected bincounts -#print both y_errors, from the prefit and from this fit for comparison +# print both y_errors, from the prefit and from this fit for comparison print("The y errors used in the prefit, relative to the datapoints are: " + str(y_error)) print("The y errors used now, relative to the model with parameters from the prefit: " + str(y_error_model)) @@ -122,13 +125,13 @@ def normal_distribution(x, mu = 0, sigma = 1): XYFit_3 = Fit(xy_data, normal_distribution) XYFit_3.do_fit() -#create a plot +# create a plot Plot3 = Plot(XYFit_3) Plot3.plot() plt.show() """ -The uncertainties on the result were already reduced slightly in comparison to the prefit. And by looking at the printed +The uncertainties on the result were already reduced slightly in comparison to the prefit. And by looking at the y-errors for the prefit and the final fit, differences up to 20% can be observed. Still the uncertainties are larger than they should be. This is also due to one final problem that is discussed in this example script. @@ -146,21 +149,17 @@ def normal_distribution(x, mu = 0, sigma = 1): Furthermore, the Plot shows nicely the model function prediction for the histogram. """ -#We will use our initial bincounts and binning +# We will use our initial bincounts and binning bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) hist_data = HistContainer(bin_edges=binedges) hist_data.set_bins(bincounts) -#This is everything we have to prepare befor performing the fit +# This is everything we have to prepare befor performing the fit Histfit = Fit(hist_data, model_function=normal_distribution) Histfit.do_fit() Plot4 = Plot(Histfit) Plot4.plot() plt.show() - - - - From ca710c873b16713a1c1f9c13120484bf542890a7 Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Wed, 24 Sep 2025 21:09:05 +0200 Subject: [PATCH 05/10] incorpoate feedback into histogram fit section section --- doc/src/parts/user_guide.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/src/parts/user_guide.rst b/doc/src/parts/user_guide.rst index c8a814ab..a4ea21a9 100644 --- a/doc/src/parts/user_guide.rst +++ b/doc/src/parts/user_guide.rst @@ -392,12 +392,11 @@ A typical dictionary returned by the :py:meth:`~.FitBase.do_fit` method looks li Histogram Fits --------------- -A very common fit type is the histogram fit. In physics experiments data is frequently histogrammed -in order to reduce the data to a manageable number of bins. *kafe2* provides a dedicated fitting class -for histogram fits, intendet to be used when datapoints are obtained from a random distribution. Especially -when large numbers of datapoints are present it is more efficient to treat the data as a histogram. To perform -a histogram fit, the datapoints have to be filled into a :py:obj:`HistContainer`. Then the procedure -is similar to the above. By default, the :py:obj:`HistFit` class will use a normal distribution as model +In physics experiments data is frequently histogrammed in order to reduce the data to a manageable number of bins. +*kafe2* provides a dedicated fitting class for histogram fits, intendet to be used when datapoints are obtained +from a random distribution. Especiallywhen large numbers of datapoints are present it is more efficient to treat +the data as a histogram. To perform a histogram fit, the datapoints have to be filled into a :py:obj:`HistContainer`. +Then the procedure is similar to the above. By default, the :py:obj:`HistFit` class will use a normal distribution as model function and a poisson likelihood as cost function. Both can be changed using the `model_function` and `cost_function` keywords. From 337e4276190cee68582d5c0d7eba5d1ebc7e9bea Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Wed, 1 Oct 2025 10:01:34 +0200 Subject: [PATCH 06/10] changed the example. The focus lies more on 'recreating' a HistFit using XYFit --- examples/009_histogram_fit/04_pitfalls.py | 193 +++++++++++----------- 1 file changed, 95 insertions(+), 98 deletions(-) diff --git a/examples/009_histogram_fit/04_pitfalls.py b/examples/009_histogram_fit/04_pitfalls.py index 280634b0..5648c97f 100644 --- a/examples/009_histogram_fit/04_pitfalls.py +++ b/examples/009_histogram_fit/04_pitfalls.py @@ -12,20 +12,15 @@ This example shows common problems that can occur when using XYFit and how to fix them. HistFit handles these problems automatically. -We will especially look at two scenarios that can arise from having only small amounts of data. -This is, for example, a problem in the search for new rare processes in high-energy physics. -Assume we are looking for a Gaussian signal peak, free from background for simplicity -(the treatment of a signal over background is explained in example 03_SplusBfit.py). - -The first problem arises if zero events are present in a bin. Since we assume Poisson uncertainties -on the data points, an empty bin will be assumed to have an uncertainty of zero when using a Gaussian approximation. -By default, XYFit uses a χ² cost function, which describes data with Gaussian uncertainties. -This means empty bins will be assumed to be known with perfect precision, and -the model function is forced to pass this point exactly. Of course, this only happens when no other (systematic) -uncertainties are given. It can be seen in the following example script how this makes an interpretable -result impossible. Since the HistFit class uses a Poisson likelihood by default, this first problem is avoided. +We will try to build the kafe2 HistFit by hand using an XYFit. In general Histogram Fits are useful, +when large amounts of data are obtained, since the binning of a histogram reduces the +amount of computation. + +The default XYFit has some problems that have to be addressed, when histogrammed data is +processed. First of all, when using an XYFit, the data is passed to the Fit object in a +XYContainer. This container does not automatically fill the datapoints in bins. So this first step +has to be done manually. In the following, 30 datapoints, sampled from a normal distribution, are used. """ - import numpy as np import matplotlib.pyplot as plt from kafe2 import XYContainer, Fit, HistContainer, Plot @@ -35,19 +30,51 @@ def normal_distribution(x, mu=0, sigma=1): return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) -# Manually specify already binned data, to provoke empty bins (generated uniformly with mu = 0, sigma = 1) -bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) +# Fill the datapoints into bins (generated uniformly with mu = 0, sigma = 1) +data = np.array( + [ + 0.15452608, + 0.15814163, + 1.84110142, + -0.27350108, + -1.89289518, + 0.21379812, + -0.27967822, + 0.21451591, + -0.07221073, + -0.51282405, + 0.39306818, + 1.12542323, + -0.71070046, + 1.86784967, + -0.17510427, + 1.72786353, + 0.95243581, + -0.71476994, + -0.0816407, + -0.38590102, + 0.92506023, + 0.4333063, + 0.86662424, + -0.78075927, + 1.36575012, + -0.49828474, + 0.14649399, + 1.40194082, + -1.50842127, + 1.21646781, + ] +) + binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) +bincounts, binedges = np.histogram(data, bins=binedges, density=True) # Now naively perform a default XYFit x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values -y_data = bincounts / (np.sum(bincounts) * np.diff(binedges)) -print(y_data) # use normalized histogram as y data -x_error = 0.25 # use half binwidth as x_error -y_error = np.sqrt(y_data) +y_data = bincounts # use normalized histogram as y data +y_error = np.sqrt(bincounts) / (np.sum(bincounts) * np.diff(binedges)) # assume Poisson errors on our bincounts xy_data = XYContainer(x_data=x_data, y_data=y_data) -xy_data.add_error(err_val=x_error, axis="x") xy_data.add_error(err_val=y_error, axis="y") XYFit_1 = Fit(xy_data, normal_distribution) @@ -56,110 +83,80 @@ def normal_distribution(x, mu=0, sigma=1): Plot1 = Plot(XYFit_1) Plot1.plot() plt.show() - - """ -This first fit raises warnings about the cost function being evaluated as infinite, which originates from the empty bin. -Also, the result of the fit does not return any good values. This is the result of the empty bin. - -It can be seen in the following that even if the bins are combined in order to get rid of the empty bin, -the fit will converge and not throw errors, but still not yield perfect results. +When performing the fit, errors like "The cost function was evaluated as infinite" appear in the output. +Furthermore, when looking at the plot of our fit result, it is clear that this fit didn't return the result we expected. +The starting values for the fit are returned as best fit value, and the uncertainties are reported as NaN. +What happened? The problem arises because Poisson uncertainties were assumed for the bin counts. +If a bin is empty, the uncertainty is treated as zero by the fit. Thus the model function is forced to pass this datapoint +exactly or the cost function will be infinite. +This occures because the XYFit uses a χ²-cost function by default, which is only valid +for Gaussian uncertainties, but not in the case of Poisson uncertainties. To fix this, the +cost function of the fit is changed to a Poisson negative log-likelhoood (NLL). """ -bincounts = np.array([1, 3, 9, 7, 8, 2]) -binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) +# rescale normal distribution by bin counts and bin width +def normal_distribution_scaled(x, mu=0, sigma=1): + return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) * np.sum(bincounts) * 0.5 + + +bincounts, binedges = np.histogram(data, bins=binedges) -# Now again perform a default XYFit x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values -y_data = bincounts / (np.sum(bincounts) * np.diff(binedges)) # use normalized histogram as y data -x_error = np.diff(binedges) / 2 # use half binwidth as x_error -y_error = np.sqrt(y_data) +y_data = bincounts # use bincounts as y data (not normalized now) xy_data = XYContainer(x_data=x_data, y_data=y_data) -xy_data.add_error(err_val=x_error, axis="x") -xy_data.add_error(err_val=y_error, axis="y") -XYFit_2 = Fit(xy_data, normal_distribution) +XYFit_2 = Fit(xy_data, normal_distribution_scaled, cost_function="poisson") XYFit_2.do_fit() # create a plot Plot2 = Plot(XYFit_2) Plot2.plot() plt.show() - """ -In the code above, the uncertainties on each bin were simply specified as the square root of the counts in each bin. -In reality, the measured bin counts are just one random experiment. It is the outcome of a random draw -from a Poisson distribution. The mean, and thus also the Poisson uncertainty of these bin counts, stems from the -expected number of events in that bin given by the model function. That means calculating the uncertainties like -it was done above introduces a bias. If there are more events than expected in a bin, the uncertainties are overestimated. -On the other hand, they are underestimated if the observed number of events in a bin is lower than expected. - -This problem can be reduced by doing a so-called prefit. The idea is to use the fit from above as a first approximation -to our true distribution. Then uncertainties are obtained from the expected number of events in each bin as predicted -by the prefit. +Now it wasn't even necessary to explicitly specify the y-errors. By using a Poisson NLL, +the y-errors are no longer calculated from the measured bin counts, but instead from the model expectation. +This handles empty bins correctly and also prevents getting biased uncertainties. + +Another subtlety is the definition of the y_data: So far, simply the midpoint of each bin was used. This is only +a linear approximation of the behaviour of the model function between the bin edges. The HistFit class of kafe2 +on the other hand uses "Simpsons rule", a method to approximate the behaviour quadratically for more accuracy. +The most accurate albeit computationally expensive method would be to integrate the model function over each bin. + +The implementation of Simpsons rule in our procedure using a XYFit will not be done here, +since the influence is rather small in this case. Instead, it is shown how it is much easier to just use the HistFit +class of kafe2. The binning is automatically done by the HistContainer, the Poisson NLL is the default cost function +and the Simpson rule is already implemented. """ +# We will use our initial data and binning +hist_data = HistContainer(bin_edges=binedges, fill_data=data) -bincounts = np.array([1, 3, 9, 7, 8, 2]) -binedges = np.array([-2, -1, -0.5, 0, 0.5, 1, 2]) - -# Now again perform a default XYFit -x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values -y_data = bincounts / (np.sum(bincounts) * np.diff(binedges)) # use normalized histogram as y data -x_error = np.diff(binedges) / 2 # use half binwidth as x_error -prefit_result = XYFit_2.parameter_values -y_error_model = np.sqrt( - normal_distribution(bincenters, *prefit_result) -) # Now instead of the measured data, use the prefit to give us the expected bincounts - -# print both y_errors, from the prefit and from this fit for comparison -print("The y errors used in the prefit, relative to the datapoints are: " + str(y_error)) -print("The y errors used now, relative to the model with parameters from the prefit: " + str(y_error_model)) - - -xy_data = XYContainer(x_data=x_data, y_data=y_data) -xy_data.add_error(err_val=x_error, axis="x") -xy_data.add_error(err_val=y_error_model, axis="y") +# This is everything we have to prepare befor performing the fit +Histfit = Fit(hist_data, model_function=normal_distribution, density=True) +Histfit.do_fit() -XYFit_3 = Fit(xy_data, normal_distribution) -XYFit_3.do_fit() -# create a plot -Plot3 = Plot(XYFit_3) -Plot3.plot() +Plot5 = Plot(Histfit) +Plot5.plot() plt.show() """ -The uncertainties on the result were already reduced slightly in comparison to the prefit. And by looking at the -y-errors for the prefit and the final fit, differences up to 20% can be observed. -Still the uncertainties are larger than they should be. This is also due to one final problem that -is discussed in this example script. - -One problem in all the code above is the use of incorrect y-values for the fit. -Naively, the y-value of the model function at the position of the bin midpoints was used. -In fact, since probability densities and histogrammed data are being treated, the correct way -would be to integrate the model function over each bin to obtain the correct expected count of events in the bin. -This will not be shown here. - -We can see that the result of our fit gets better with every improvement we do to our fitting procedure. -But a lot of effort has to be put in to get reasonable results. -Next, the HistContainer, which automatically triggers kafe2 to use the HistFit class for fitting, will be used. -This will yield the best results without any further effort, even without eliminating the empty bin beforehand. -Furthermore, the Plot shows nicely the model function prediction for the histogram. +Now consider the case, where systematical errors next to the statistical ones are present. Suppose each bin has an additional +Gaussian uncertainty of 1. Now there are two different types of uncertainties in the Fit, that can't be simply added. +However, due to the central limit theorem, for sufficiently large event counts the Poisson distribution approaches +a normal distrbution. Therefore, by switching the cost function from a Poisson NLL to the Gaussian approximation, +the fit can be performed correctly again. """ - -# We will use our initial bincounts and binning -bincounts = np.array([1, 0, 3, 9, 7, 8, 1, 1]) -binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) - -hist_data = HistContainer(bin_edges=binedges) -hist_data.set_bins(bincounts) +# We will use our initial data and binning +hist_data = HistContainer(bin_edges=binedges, fill_data=data) +hist_data.add_error(err_val=1) # This is everything we have to prepare befor performing the fit -Histfit = Fit(hist_data, model_function=normal_distribution) +Histfit = Fit(hist_data, model_function=normal_distribution, density=True, cost_function="gauss-approximation") Histfit.do_fit() -Plot4 = Plot(Histfit) -Plot4.plot() +Plot5 = Plot(Histfit) +Plot5.plot() plt.show() From 46c9e44ceeb2c1079613b5ab45df51df150edbff Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Wed, 1 Oct 2025 10:03:16 +0200 Subject: [PATCH 07/10] changed the data compatibility check for XYFits using the poisson NLL to accept also non integer binedges --- kafe2/fit/xy/cost.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kafe2/fit/xy/cost.py b/kafe2/fit/xy/cost.py index 1583eced..80981555 100644 --- a/kafe2/fit/xy/cost.py +++ b/kafe2/fit/xy/cost.py @@ -1,3 +1,5 @@ +import numpy as np + from .._base import ( CostFunction_Chi2, CostFunction_GaussApproximation, @@ -67,6 +69,11 @@ def __init__(self, data_point_distribution="poisson", ratio=False, axes_to_use=" raise ValueError("Unknown value '%s' for 'axes_to_use': must be one of ('xy', 'y')") super(XYCostFunction_NegLogLikelihood, self).__init__(data_point_distribution=data_point_distribution, ratio=ratio) + def is_data_compatible(self, data): + if self._cost_function_handle in [self.nll_poisson, self.nllr_poisson] and (np.count_nonzero(data[1] % 1) > 0 or np.any(data[1] < 0)): + return False, "poisson distribution can only have non-negative integers as y data." + return True, None + class XYCostFunction_GaussApproximation(CostFunction_GaussApproximation): def __init__( From c68cbe4d274008b4af0e58bef95584561c8a06bc Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Tue, 7 Oct 2025 11:02:14 +0200 Subject: [PATCH 08/10] revised histogram fit section --- doc/src/parts/user_guide.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/src/parts/user_guide.rst b/doc/src/parts/user_guide.rst index a4ea21a9..5517cb67 100644 --- a/doc/src/parts/user_guide.rst +++ b/doc/src/parts/user_guide.rst @@ -393,9 +393,9 @@ Histogram Fits --------------- In physics experiments data is frequently histogrammed in order to reduce the data to a manageable number of bins. -*kafe2* provides a dedicated fitting class for histogram fits, intendet to be used when datapoints are obtained -from a random distribution. Especiallywhen large numbers of datapoints are present it is more efficient to treat -the data as a histogram. To perform a histogram fit, the datapoints have to be filled into a :py:obj:`HistContainer`. +*kafe2* provides a dedicated fitting class for histogram fits, intended to be used when datapoints are obtained +from a random distribution. Especially when large numbers of datapoints are present it is more efficient to treat +the data as a histogram. To perform a histogram fit using raw data, the datapoints have to be filled into a :py:obj:`HistContainer`. Then the procedure is similar to the above. By default, the :py:obj:`HistFit` class will use a normal distribution as model function and a poisson likelihood as cost function. Both can be changed using the `model_function` and `cost_function` keywords. @@ -409,7 +409,7 @@ function and a poisson likelihood as cost function. Both can be changed using th hist_fit = Fit(histogram) hist_fit.do_fit() -By default it is assumed that the model function for a `HistFit` object is a probability density function normalised to 1. +By default it is assumed that the model function for a `HistFit` object is a probability density function normalized to 1. As a consequence the bin contents are also being normalized to 1. To disable this behavior, set `density=False` in the `HistFit` constructor. From a0958b70d929283af7974cd2a25c219d1787be6d Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Tue, 7 Oct 2025 11:04:28 +0200 Subject: [PATCH 09/10] improved explanations in example --- examples/009_histogram_fit/04_pitfalls.py | 54 ++++++++++++----------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/examples/009_histogram_fit/04_pitfalls.py b/examples/009_histogram_fit/04_pitfalls.py index 5648c97f..4df7df85 100644 --- a/examples/009_histogram_fit/04_pitfalls.py +++ b/examples/009_histogram_fit/04_pitfalls.py @@ -8,18 +8,20 @@ HistFit class rather than the XYFit class. While it is in principle possible to perform such fits correctly using XYFit, -more care must be taken to avoid unusable or biased results. +more care must be taken to avoid unusable or biased fit results. This example shows common problems that can occur when using XYFit and how to fix them. HistFit handles these problems automatically. -We will try to build the kafe2 HistFit by hand using an XYFit. In general Histogram Fits are useful, -when large amounts of data are obtained, since the binning of a histogram reduces the -amount of computation. +We will try to build the equivalent of the kafe2 HistFit by hand using an XYFit. +In general histogram Fits are useful when handling large amounts of data. +The computational complexity of e.g. the chi2 cost function scales with the number of inputs cubed +(when considering correlations). So reducing the raw data to a comparatively +small number of bins is crucial to make the fit computationally feasible. The default XYFit has some problems that have to be addressed, when histogrammed data is -processed. First of all, when using an XYFit, the data is passed to the Fit object in a -XYContainer. This container does not automatically fill the datapoints in bins. So this first step -has to be done manually. In the following, 30 datapoints, sampled from a normal distribution, are used. +processed. The XYFit does not have any built-in functionality to transform raw data into +histogrammed data. This means if we want to use histogrammed data, this has to be done manually. +In the following, 30 datapoints, sampled from a normal distribution, are used. """ import numpy as np import matplotlib.pyplot as plt @@ -66,13 +68,13 @@ def normal_distribution(x, mu=0, sigma=1): ] ) -binedges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) -bincounts, binedges = np.histogram(data, bins=binedges, density=True) +bin_edges = np.array([-2, -1.5, -1, -0.5, 0, 0.5, 1, 1.5, 2]) +bin_counts, bin_edges = np.histogram(data, bins=bin_edges, density=True) # Now naively perform a default XYFit -x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values -y_data = bincounts # use normalized histogram as y data -y_error = np.sqrt(bincounts) / (np.sum(bincounts) * np.diff(binedges)) # assume Poisson errors on our bincounts +x_data = bincenters = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0) # use bincenters as x values +y_data = bin_counts # use normalized histogram as y data +y_error = np.sqrt(bin_counts) / (np.sum(bin_counts) * np.diff(bin_edges)) # assume Poisson errors on our bin_counts xy_data = XYContainer(x_data=x_data, y_data=y_data) xy_data.add_error(err_val=y_error, axis="y") @@ -98,13 +100,13 @@ def normal_distribution(x, mu=0, sigma=1): # rescale normal distribution by bin counts and bin width def normal_distribution_scaled(x, mu=0, sigma=1): - return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) * np.sum(bincounts) * 0.5 + return np.exp(-0.5 * ((x - mu) / sigma) ** 2) / np.sqrt(2.0 * np.pi * sigma**2) * np.sum(bin_counts) * 0.5 -bincounts, binedges = np.histogram(data, bins=binedges) +bin_counts, bin_edges = np.histogram(data, bins=bin_edges) -x_data = bincenters = np.mean([binedges[:-1], binedges[1:]], axis=0) # use bincenters as x values -y_data = bincounts # use bincounts as y data (not normalized now) +x_data = bincenters = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0) # use bincenters as x values +y_data = bin_counts # use bin_counts as y data (not normalized now) xy_data = XYContainer(x_data=x_data, y_data=y_data) @@ -122,24 +124,24 @@ def normal_distribution_scaled(x, mu=0, sigma=1): Another subtlety is the definition of the y_data: So far, simply the midpoint of each bin was used. This is only a linear approximation of the behaviour of the model function between the bin edges. The HistFit class of kafe2 -on the other hand uses "Simpsons rule", a method to approximate the behaviour quadratically for more accuracy. +on the other hand uses "Simpson's rule", a method to approximate the behaviour quadratically for more accuracy. The most accurate albeit computationally expensive method would be to integrate the model function over each bin. -The implementation of Simpsons rule in our procedure using a XYFit will not be done here, +The implementation of Simpson's rule in our procedure using a XYFit will not be done here, since the influence is rather small in this case. Instead, it is shown how it is much easier to just use the HistFit class of kafe2. The binning is automatically done by the HistContainer, the Poisson NLL is the default cost function and the Simpson rule is already implemented. """ # We will use our initial data and binning -hist_data = HistContainer(bin_edges=binedges, fill_data=data) +hist_data = HistContainer(bin_edges=bin_edges, fill_data=data) # This is everything we have to prepare befor performing the fit Histfit = Fit(hist_data, model_function=normal_distribution, density=True) Histfit.do_fit() -Plot5 = Plot(Histfit) -Plot5.plot() +Plot3 = Plot(Histfit) +Plot3.plot() plt.show() """ @@ -147,16 +149,18 @@ class of kafe2. The binning is automatically done by the HistContainer, the Pois Gaussian uncertainty of 1. Now there are two different types of uncertainties in the Fit, that can't be simply added. However, due to the central limit theorem, for sufficiently large event counts the Poisson distribution approaches a normal distrbution. Therefore, by switching the cost function from a Poisson NLL to the Gaussian approximation, -the fit can be performed correctly again. +the fit can be performed correctly again. Note that this gaussian approximation is different to the one that was falsly used +in the beginning of the example. Here statistical uncertainties are calculated from the model function, while above, +they were calculated from the measured data. """ # We will use our initial data and binning -hist_data = HistContainer(bin_edges=binedges, fill_data=data) +hist_data = HistContainer(bin_edges=bin_edges, fill_data=data) hist_data.add_error(err_val=1) # This is everything we have to prepare befor performing the fit Histfit = Fit(hist_data, model_function=normal_distribution, density=True, cost_function="gauss-approximation") Histfit.do_fit() -Plot5 = Plot(Histfit) -Plot5.plot() +Plot4 = Plot(Histfit) +Plot4.plot() plt.show() From 08779cec0a773d291200248a05e527aa568c79d3 Mon Sep 17 00:00:00 2001 From: Jonas Kleck Date: Tue, 7 Oct 2025 11:05:49 +0200 Subject: [PATCH 10/10] edited warning message for zero errors in chi2 cost function --- kafe2/fit/_base/cost.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kafe2/fit/_base/cost.py b/kafe2/fit/_base/cost.py index 483e59cb..040d7620 100644 --- a/kafe2/fit/_base/cost.py +++ b/kafe2/fit/_base/cost.py @@ -354,7 +354,7 @@ def _chi2(self, data, model, cov_mat_qr=None, cov_mat_cholesky=None, err=None): raise ValueError("'err' must not contain any zero values!") if self.needs_errors: # There are other warnings that notify the user about singular cov mat, etc. - warnings.warn("Setting all data errors to 1 as a fallback.") + warnings.warn("'err' contains zero values. Setting all data errors to 1 as a fallback.") else: _res = _res / err