From 16236b817ae3db7548888c1a8f416a5bffe8eff7 Mon Sep 17 00:00:00 2001
From: Florian Mausolf <florian.mausolf@rwth-aachen.de>
Date: Fri, 14 Nov 2025 11:16:53 +0100
Subject: [PATCH] include bkg 4 and 5 in 1D data generation

---
 examples/1D_example/run_gmm_example.py     | 67 +++++++---------------
 examples/1D_example/run_sigmoid_example.py | 67 +++++++---------------
 src/gatohep/data_generation.py             | 39 ++++++++++---
 3 files changed, 76 insertions(+), 97 deletions(-)

diff --git a/examples/1D_example/run_gmm_example.py b/examples/1D_example/run_gmm_example.py
index c9bb5a7..8327ddf 100644
--- a/examples/1D_example/run_gmm_example.py
+++ b/examples/1D_example/run_gmm_example.py
@@ -149,34 +149,21 @@ def main():
         high=high,
         name="Signal",
     )
-    hist_bkg1 = create_hist(
-        data["bkg1"]["NN_output"],
-        weights=data["bkg1"]["weight"],
-        bins=n_bins,
-        low=low,
-        high=high,
-        name="Bkg1",
-    )
-    hist_bkg2 = create_hist(
-        data["bkg2"]["NN_output"],
-        weights=data["bkg2"]["weight"],
-        bins=n_bins,
-        low=low,
-        high=high,
-        name="Bkg2",
-    )
-    hist_bkg3 = create_hist(
-        data["bkg3"]["NN_output"],
-        weights=data["bkg3"]["weight"],
-        bins=n_bins,
-        low=low,
-        high=high,
-        name="Bkg3",
-    )
-    bkg_hists = [hist_bkg1, hist_bkg2, hist_bkg3]
+    bkg_processes = [f"bkg{i}" for i in range(1, 6)]
+    bkg_hists = [
+        create_hist(
+            data[proc]["NN_output"],
+            weights=data[proc]["weight"],
+            bins=n_bins,
+            low=low,
+            high=high,
+            name=f"{proc.capitalize()}",
+        )
+        for proc in bkg_processes
+    ]
 
     # plot the backgrounds:
-    process_labels = ["Background 1", "Background 2", "Background 3"]
+    process_labels = [f"Background {i}" for i in range(1, len(bkg_processes) + 1)]
     signal_labels = ["Signal x 100"]
 
     # For demonstration, we compare multiple binning schemes.
@@ -356,25 +343,15 @@ def train_step(
             bins=opt_bin_edges,
             name="Signal_opt",
         )
-        h_bkg1_opt = create_hist(
-            data["bkg1"]["NN_output"],
-            weights=data["bkg1"]["weight"],
-            bins=opt_bin_edges,
-            name="Bkg1_opt",
-        )
-        h_bkg2_opt = create_hist(
-            data["bkg2"]["NN_output"],
-            weights=data["bkg2"]["weight"],
-            bins=opt_bin_edges,
-            name="Bkg2_opt",
-        )
-        h_bkg3_opt = create_hist(
-            data["bkg3"]["NN_output"],
-            weights=data["bkg3"]["weight"],
-            bins=opt_bin_edges,
-            name="Bkg3_opt",
-        )
-        opt_bkg_hists = [h_bkg1_opt, h_bkg2_opt, h_bkg3_opt]
+        opt_bkg_hists = [
+            create_hist(
+                data[proc]["NN_output"],
+                weights=data[proc]["weight"],
+                bins=opt_bin_edges,
+                name=f"{proc}_opt",
+            )
+            for proc in bkg_processes
+        ]
 
         # Compute significance from these optimized histograms.
         Z_opt = compute_significance_from_hists(h_signal_opt, opt_bkg_hists)
diff --git a/examples/1D_example/run_sigmoid_example.py b/examples/1D_example/run_sigmoid_example.py
index eefd7e5..b9dcd45 100644
--- a/examples/1D_example/run_sigmoid_example.py
+++ b/examples/1D_example/run_sigmoid_example.py
@@ -174,34 +174,21 @@ def main():
         high=high,
         name="Signal",
     )
-    hist_bkg1 = create_hist(
-        data["bkg1"]["NN_output"],
-        weights=data["bkg1"]["weight"],
-        bins=n_bins,
-        low=low,
-        high=high,
-        name="Bkg1",
-    )
-    hist_bkg2 = create_hist(
-        data["bkg2"]["NN_output"],
-        weights=data["bkg2"]["weight"],
-        bins=n_bins,
-        low=low,
-        high=high,
-        name="Bkg2",
-    )
-    hist_bkg3 = create_hist(
-        data["bkg3"]["NN_output"],
-        weights=data["bkg3"]["weight"],
-        bins=n_bins,
-        low=low,
-        high=high,
-        name="Bkg3",
-    )
-    bkg_hists = [hist_bkg1, hist_bkg2, hist_bkg3]
+    bkg_processes = [f"bkg{i}" for i in range(1, 6)]
+    bkg_hists = [
+        create_hist(
+            data[proc]["NN_output"],
+            weights=data[proc]["weight"],
+            bins=n_bins,
+            low=low,
+            high=high,
+            name=f"{proc.capitalize()}",
+        )
+        for proc in bkg_processes
+    ]
 
     # plot the backgrounds:
-    process_labels = ["Background 1", "Background 2", "Background 3"]
+    process_labels = [f"Background {i}" for i in range(1, len(bkg_processes) + 1)]
     signal_labels = ["Signal x 100"]
 
     # For demonstration, we compare multiple binning schemes.
@@ -385,25 +372,15 @@ def train_step(
             bins=opt_bin_edges,
             name="Signal_opt",
         )
-        h_bkg1_opt = create_hist(
-            data["bkg1"]["NN_output"],
-            weights=data["bkg1"]["weight"],
-            bins=opt_bin_edges,
-            name="Bkg1_opt",
-        )
-        h_bkg2_opt = create_hist(
-            data["bkg2"]["NN_output"],
-            weights=data["bkg2"]["weight"],
-            bins=opt_bin_edges,
-            name="Bkg2_opt",
-        )
-        h_bkg3_opt = create_hist(
-            data["bkg3"]["NN_output"],
-            weights=data["bkg3"]["weight"],
-            bins=opt_bin_edges,
-            name="Bkg3_opt",
-        )
-        opt_bkg_hists = [h_bkg1_opt, h_bkg2_opt, h_bkg3_opt]
+        opt_bkg_hists = [
+            create_hist(
+                data[proc]["NN_output"],
+                weights=data[proc]["weight"],
+                bins=opt_bin_edges,
+                name=f"{proc}_opt",
+            )
+            for proc in bkg_processes
+        ]
 
         # Compute significance from these optimized histograms.
         Z_opt = compute_significance_from_hists(h_signal_opt, opt_bkg_hists)
diff --git a/src/gatohep/data_generation.py b/src/gatohep/data_generation.py
index eac9019..6fc33e0 100644
--- a/src/gatohep/data_generation.py
+++ b/src/gatohep/data_generation.py
@@ -148,6 +148,8 @@ def generate_toy_data_1D(
     xs_bkg1: float = 100,
     xs_bkg2: float = 80,
     xs_bkg3: float = 50,
+    xs_bkg4: float = 20,
+    xs_bkg5: float = 10,
     lumi: float = 100.0,
     noise_scale: float = 0.3,
     seed: int | None = None,
@@ -169,6 +171,10 @@ def generate_toy_data_1D(
-        Cross-section for the second background component. Default is 15.
+        Cross-section for the second background component. Default is 80.
     xs_bkg3 : float, optional
-        Cross-section for the third background component. Default is 10.
+        Cross-section for the third background component. Default is 50.
+    xs_bkg4 : float, optional
+        Cross-section for the fourth background component. Default is 20.
+    xs_bkg5 : float, optional
+        Cross-section for the fifth background component. Default is 10.
     lumi : float, optional
         Luminosity for scaling event weights. Default is 100.
     seed : int or None, optional
@@ -184,32 +190,51 @@ def generate_toy_data_1D(
     if seed is not None:
         np.random.seed(seed)
 
-    tot_xs_bkg = xs_bkg1 + xs_bkg2 + xs_bkg3
+    tot_xs_bkg = xs_bkg1 + xs_bkg2 + xs_bkg3 + xs_bkg4 + xs_bkg5
     n_bkg1 = int(n_bkg * xs_bkg1 / tot_xs_bkg)
     n_bkg2 = int(n_bkg * xs_bkg2 / tot_xs_bkg)
-    n_bkg3 = n_bkg - (n_bkg1 + n_bkg2)
+    n_bkg3 = int(n_bkg * xs_bkg3 / tot_xs_bkg)
+    n_bkg4 = int(n_bkg * xs_bkg4 / tot_xs_bkg)
+    n_bkg5 = n_bkg - (n_bkg1 + n_bkg2 + n_bkg3 + n_bkg4)
 
-    counts = dict(signal=n_signal, bkg1=n_bkg1, bkg2=n_bkg2, bkg3=n_bkg3)
-    xs = dict(signal=xs_signal, bkg1=xs_bkg1, bkg2=xs_bkg2, bkg3=xs_bkg3)
+    counts = dict(
+        signal=n_signal,
+        bkg1=n_bkg1,
+        bkg2=n_bkg2,
+        bkg3=n_bkg3,
+        bkg4=n_bkg4,
+        bkg5=n_bkg5,
+    )
+    xs = dict(
+        signal=xs_signal,
+        bkg1=xs_bkg1,
+        bkg2=xs_bkg2,
+        bkg3=xs_bkg3,
+        bkg4=xs_bkg4,
+        bkg5=xs_bkg5,
+    )
 
     X = {
         "signal": _sample("signal1", n_signal, seed),
         "bkg1":   _sample("bkg1",    n_bkg1,  seed + 1 if seed else None),
         "bkg2":   _sample("bkg2",    n_bkg2,  seed + 2 if seed else None),
         "bkg3":   _sample("bkg3",    n_bkg3,  seed + 3 if seed else None),
+        "bkg4":   _sample("bkg4",    n_bkg4,  seed + 4 if seed else None),
+        "bkg5":   _sample("bkg5",    n_bkg5,  seed + 5 if seed else None),
     }
 
     pdf_sig = multivariate_normal(MEANS["signal1"], COV)
     pdf_bkg = {
-        p: multivariate_normal(MEANS[p], COV) for p in ("bkg1", "bkg2", "bkg3")
+        p: multivariate_normal(MEANS[p], COV)
+        for p in ("bkg1", "bkg2", "bkg3", "bkg4", "bkg5")
     }
-    total_bkg_xs = xs_bkg1 + xs_bkg2 + xs_bkg3
+    total_bkg_xs = xs_bkg1 + xs_bkg2 + xs_bkg3 + xs_bkg4 + xs_bkg5
 
     def _pb(x):
         return sum((xs[p] / total_bkg_xs) * pdf_bkg[p].pdf(x) for p in pdf_bkg)
 
     data = {}
-    for proc in ("signal", "bkg1", "bkg2", "bkg3"):
+    for proc in ("signal", "bkg1", "bkg2", "bkg3", "bkg4", "bkg5"):
         Xp = X[proc]
         ps = pdf_sig.pdf(Xp)
         pb = _pb(Xp)