diff --git a/.gitignore b/.gitignore index 0f366f91..5ecd5d89 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ nobackup/ fitting/results/ fitting/templates/ plots/ +*.txt *.ipynb_checkpoints/ diff --git a/fitting/check_plot.py b/fitting/check_plot.py new file mode 100644 index 00000000..3bbcef9c --- /dev/null +++ b/fitting/check_plot.py @@ -0,0 +1,170 @@ +from __future__ import annotations + +import matplotlib.pyplot as plt +import mplhep as hep +import numpy as np +import uproot + +# --- CONFIGURATION --- +# Make sure this points to the STABLE tag you decided on +FILENAME = "parT_results/25Nov19_stable_v14_private/2022/testsignalregion.root" +CATEGORY = "pass_bb" +LUMI = 34.7 + +hep.style.use("CMS") + +# 1. VISUAL CONFIGURATION +# Matches image_1dc440.jpg style +color_map = { + r"$\gamma$+jets": "#1f4e5f", # Dark Teal (Main background) + r"Other": "#5e6c7a", # Slate Grey + r"$t\bar{t}$": "#7f964f", # Olive Green + r"W+$\gamma$": "#d16a4c", # Salmon/Red + r"Z+$\gamma$": "#6a3d6a", # Purple + "Data": "black", +} + +# Stack order (Bottom -> Top) +stack_order = [ + r"$\gamma$+jets", + r"Other", + r"$t\bar{t}$", + r"W+$\gamma$", + r"Z+$\gamma$", +] + + +def main(): + print(f"Opening {FILENAME}...") + try: + f = uproot.open(FILENAME) + except FileNotFoundError: + print(f"Error: Could not find {FILENAME}. Check your path/tag.") + return + + hists = {} + data_hist = None + + # Pythonic iteration (fixes SIM118) + keys = [k for k in f if CATEGORY in k] + print(f"Found {len(keys)} histograms for category '{CATEGORY}'") + + if not keys: + print("Error: No keys found. Check if the category name is correct.") + return + + for key in keys: + h = f[key].to_hist() + name = key.split(";")[0] # remove ;1 suffix + + # 2. ROBUST PROCESS GROUPING + # Logic to map file names to the labels in our legend + if "Run2022" in name or "Data" in name or "EGamma" in name: + proc = "Data" + elif "Zgamma" in name: + proc = r"Z+$\gamma$" + elif "TTGamma" in name or "TTbar" in name or "TTTo" in name: + proc = r"$t\bar{t}$" + elif "WGamma" in name: + proc = r"W+$\gamma$" + # This catches Gamma+Jets (often named GJ_PTG or GJet) + elif "GJet" in name or "GJ_" in name or "QCD" in name: + proc = r"$\gamma$+jets" + elif "ZJets" in name or "DYJets" in name: + proc = r"Other" # Merging Z+jets into Other based on plot size + elif "WJets" in name: + proc = r"Other" # Merging W+jets into Other + else: + proc = r"Other" + + # Accumulate + if proc == "Data": + if data_hist is None: + data_hist = h + else: + data_hist += h + else: + if proc not in hists: + hists[proc] = h + else: + hists[proc] += h + + # 3. BUILD STACK + # Filter out empty processes and enforce order + final_stack_names = [p for p in stack_order if p in hists] + stack_hists = [hists[p] for p in final_stack_names] + stack_colors = [color_map.get(p, "grey") for p in final_stack_names] + + # 4. 
PLOTTING + fig, (ax, rax) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": [3, 1]}, sharex=True + ) + + # Main Stack + if stack_hists: + hep.histplot( + stack_hists, + label=final_stack_names, + stack=True, + histtype="fill", + color=stack_colors, + ax=ax, + ) + + # Data + if data_hist: + hep.histplot( + data_hist, + label="Data", + histtype="errorbar", + color="black", + ax=ax, + markersize=10, + yerr=True, + ) + + # Ratio Calculation + total_mc = sum(stack_hists) + r_num = data_hist.values() + r_den = total_mc.values() + # Safe division + ratio = np.divide(r_num, r_den, out=np.ones_like(r_num), where=r_den != 0) + + # Ratio Plot + centers = data_hist.axes[0].centers + rax.errorbar(centers, ratio, yerr=0, fmt="ko", markersize=4) + + # 5. STYLING + ax.set_ylabel("Events / GeV") + ax.set_xlim(0, 200) # Match reference X-axis + + # Legend: Reverse handles so top of stack = top of legend + handles, labels = ax.get_legend_handles_labels() + ax.legend(handles[::-1], labels[::-1], title=None, loc="upper right", ncol=2, fontsize=14) + + # Text Overlay (The "TXbb > 0.95" text) + ax.text( + 0.95, + 0.92, + r"TXbb $> 0.95$ Region, $200 < p_T < 1200$ GeV", + transform=ax.transAxes, + ha="right", + fontsize=16, + ) + + # Ratio Styling + rax.set_ylabel("Data / Bkg") + rax.set_ylim(0, 2.5) + rax.axhline(1, color="gray", linestyle="--") + rax.set_xlabel(r"Jet 0 $m_{sd}$ [GeV]", fontsize=18) + + # CMS Label + hep.cms.label("Private Work", data=True, lumi=LUMI, year="2022", ax=ax, loc=0) + + out_name = "check_zgamma_plot_styled.png" + plt.savefig(out_name) + print(f"\nPlot saved to {out_name}") + + +if __name__ == "__main__": + main() diff --git a/fitting/make_hists.py b/fitting/make_hists.py old mode 100644 new mode 100755 index 6d0011e7..02fe1f6d --- a/fitting/make_hists.py +++ b/fitting/make_hists.py @@ -2,11 +2,11 @@ from __future__ import annotations import argparse +import json +import os from pathlib import Path import hist -import json -import os import uproot from hbb import utils @@ -14,7 +14,15 @@ def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): - h = hist.Hist(hist.axis.Regular(obs_cfg["nbins"], obs_cfg["min"], obs_cfg["max"], name=obs_cfg["name"], label=obs_cfg["name"])) + h = hist.Hist( + hist.axis.Regular( + obs_cfg["nbins"], + obs_cfg["min"], + obs_cfg["max"], + name=obs_cfg["name"], + label=obs_cfg["name"], + ) + ) bins_list = reg_cfg["bins"] bin_pname = reg_cfg["bin_pname"] @@ -22,7 +30,7 @@ def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): for _process_name, data in events.items(): - #TODO add in systematics functionality + # TODO add in systematics functionality weight_val = data["finalWeight"].astype(float) s = "nominal" @@ -37,17 +45,17 @@ def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): pre_selection = (obs_br > obs_cfg["min"]) & (obs_br < obs_cfg["max"]) selection_dict = { - "pass_bb": pre_selection & (Txbbxcc > 0.95) & (Txbb > Txcc), - "pass_cc": pre_selection & (Txbbxcc > 0.95) & (Txcc > Txbb), + "pass_bb": pre_selection & (Txbbxcc > 0.95) & (Txbb > Txcc), + "pass_cc": pre_selection & (Txbbxcc > 0.95) & (Txcc > Txbb), "fail": pre_selection & (Txbbxcc <= 0.95), - "pass": pre_selection & (Txbbxcc > 0.95) + "pass": pre_selection & (Txbbxcc > 0.95), } - cut_bb = (genf == 3) + cut_bb = genf == 3 cut_qq = (genf > 0) & (genf < 3) for i in range(len(bins_list) - 1): - bin_cut = (bin_br > bins_list[i]) & (bin_br < bins_list[i+1]) & pre_selection + bin_cut = (bin_br > bins_list[i]) & (bin_br < bins_list[i + 1]) & 
pre_selection for category, selection in selection_dict.items(): if qq_true: @@ -57,7 +65,7 @@ def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): obs_br[selection & bin_cut & cut_qq], weight=weight_val[selection & bin_cut & cut_qq], ) - if not name in outdict: + if name not in outdict: outdict[name] = h.copy() else: outdict[name] += h.copy() @@ -68,7 +76,7 @@ def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): obs_br[selection & bin_cut & cut_bb], weight=weight_val[selection & bin_cut & cut_bb], ) - if not name in outdict: + if name not in outdict: outdict[name] = h.copy() else: outdict[name] += h.copy() @@ -80,20 +88,22 @@ def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): obs_br[selection & bin_cut], weight=weight_val[selection & bin_cut], ) - if not name in outdict: + if name not in outdict: outdict[name] = h.copy() else: outdict[name] += h.copy() return outdict + def main(args): year = args.year tag = args.tag - path_to_dir = f"/eos/uscms/store/group/lpchbbrun3/skims/{tag}" - - samples_qq = ['Wjets','Zjets','EWKW','EWKZ','EWKV'] - + # path_to_dir = f"/eos/uscms/store/group/lpchbbrun3/skims/{tag}" + path_to_dir = f"/eos/uscms/store/group/lpchbbrun3/gmachado/{tag}" + + samples_qq = ["Wjets", "Zjets", "EWKW", "EWKZ", "EWKV"] + columns = [ "weight", "FatJet0_pt", @@ -110,31 +120,31 @@ def main(args): out_path = f"results/{tag}/{year}" output_file = f"{out_path}/signalregion.root" - if not os.path.exists(out_path): - os.makedirs(out_path) + if not Path(out_path).exists(): + Path(out_path).mkdir(parents=True) - if os.path.isfile(output_file): - os.remove(output_file) + if Path(output_file).is_file(): + Path(output_file).unlink() fout = uproot.create(output_file) # So I can remember the settings I used for each set of results produced - os.popen(f'cp setup.json {out_path}') - with open('setup.json') as f: + os.popen(f"cp setup.json {out_path}") + with Path("setup.json").open() as f: setup = json.load(f) cats = setup["categories"] obs_cfg = setup["observable"] - with open('pmap_run3.json') as f: + with Path("pmap_run3.json").open() as f: pmap = json.load(f) - + filters = [ - ("FatJet0_pt", ">", 450), - ("FatJet0_pt", "<", 1200), - ("VBFPair_mjj", ">", -2), - ("VBFPair_mjj", "<", 13000), + ("FatJet0_pt", ">", 450), + ("FatJet0_pt", "<", 1200), + ("VBFPair_mjj", ">", -2), + ("VBFPair_mjj", "<", 13000), ] - if not obs_cfg["branch_name"] in columns: + if obs_cfg["branch_name"] not in columns: columns.append(obs_cfg["branch_name"]) out_hists = {} @@ -148,7 +158,7 @@ def main(args): {process: [dataset]}, columns=columns, region=cfg["name"], - filters=filters + filters=filters, ) if not events: @@ -161,6 +171,7 @@ def main(args): print(f"Histograms saved to {output_file}") + if __name__ == "__main__": parser = argparse.ArgumentParser(description="Make histograms for a given year.") parser.add_argument( @@ -170,12 +181,7 @@ def main(args): required=True, choices=["2022", "2022EE", "2023", "2023BPix"], ) - parser.add_argument( - "--tag", - help="tag", - type=str, - required=True - ) + parser.add_argument("--tag", help="tag", type=str, required=True) args = parser.parse_args() main(args) diff --git a/fitting/make_zg_cards.py b/fitting/make_zg_cards.py new file mode 100644 index 00000000..ed9e4495 --- /dev/null +++ b/fitting/make_zg_cards.py @@ -0,0 +1,575 @@ +from __future__ import annotations + +import argparse +import json +import pickle # You ARE using this for dumping the model +import warnings +from pathlib import Path + +import numpy as np +import 
pandas as pd
+import rhalphalib as rl
+import ROOT
+
+from hbb.common_vars import LUMI
+
+# Suppress annoying ROOT warnings
+ROOT.gROOT.SetBatch(True)
+warnings.filterwarnings("ignore")
+
+rl.util.install_roofit_helpers()
+
+eps = 0.001
+
+### DIFF: Disabled the muon CR for the initial Z-Gamma validation fit;
+### systematics stay enabled. Original had both set to True.
+do_systematics = True
+do_muon_CR = False
+
+lumi_err = {"2022": 1.01, "2023": 1.02}
+
+
+def plot_mctf(tf_MCtempl, msdbins, name, year, tag):
+    """
+    Plot the MC pass / fail TF as function of (pt,rho) and (pt,msd)
+    """
+    import matplotlib.pyplot as plt
+
+    # Create directory using Pathlib
+    outdir = Path(f"parT_results/{tag}/{year}/plots/MCTF/")
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    # --- ADAPTATION: Z-Gamma Low pT Range (200 - 1200) ---
+    pts = np.linspace(200, 1200, 15)
+    ptpts, msdpts = np.meshgrid(
+        pts[:-1] + 0.5 * np.diff(pts), msdbins[:-1] + 0.5 * np.diff(msdbins), indexing="ij"
+    )
+
+    # pT scaling must match the fit logic (200-1200 GeV); the rho window
+    # below is just the plotting range
+    ptpts_scaled = (ptpts - 200.0) / (1200.0 - 200.0)
+    rhopts = 2 * np.log(msdpts / ptpts)
+
+    rhopts_scaled = (rhopts - (-6)) / ((-2.1) - (-6))
+
+    # Valid Region Mask
+    validbins = (rhopts_scaled >= 0) & (rhopts_scaled <= 1)
+
+    ptpts = ptpts[validbins].copy()
+    msdpts = msdpts[validbins].copy()
+    ptpts_scaled = ptpts_scaled[validbins].copy()
+    rhopts_scaled = rhopts_scaled[validbins].copy()
+
+    # Evaluate the Polynomial
+    tf_MCtempl_vals = tf_MCtempl(ptpts_scaled, rhopts_scaled, nominal=True)
+
+    df = pd.DataFrame([])  # noqa: PD901
+    df["msd"] = msdpts.reshape(-1)
+    df["pt"] = ptpts.reshape(-1)
+    df["MCTF"] = tf_MCtempl_vals.reshape(-1)
+
+    # Plot 1: pT vs Mass
+    fig, ax = plt.subplots()
+    h = ax.hist2d(x=df["msd"], y=df["pt"], weights=df["MCTF"], bins=(msdbins, pts))
+    plt.xlabel("$m_{sd}$ [GeV]")
+    plt.ylabel("$p_{T}$ [GeV]")
+    cb = fig.colorbar(h[3], ax=ax)
+    cb.set_label("Ratio (Pass/Fail)")
+
+    # Save Plot 1
+    outname_msd = outdir / f"MCTF_msdpt_{name}.png"
+    fig.savefig(outname_msd, bbox_inches="tight")
+    plt.close()  # Always close to save memory!
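+
+    # Worked example of the scaling above (a sketch; numbers use this
+    # function's 200-1200 GeV pT window and (-6, -2.1) rho window):
+    # a jet with pt = 400 GeV and msd = 80 GeV gives
+    #   rho        = 2 * ln(80 / 400)            ~ -3.22
+    #   pt_scaled  = (400 - 200) / (1200 - 200)  =  0.20
+    #   rho_scaled = (-3.22 + 6) / (-2.1 + 6)    ~  0.71
+    # i.e. it lands inside the unit square on which the Bernstein basis is defined.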
+ + # --- Plot 2: pT vs Rho --- + rhos = np.linspace(-6, -2.1, 23) + ptpts, rhopts = np.meshgrid( + pts[:-1] + 0.5 * np.diff(pts), rhos[:-1] + 0.5 * np.diff(rhos), indexing="ij" + ) + + ptpts_scaled = (ptpts - 200.0) / (1200.0 - 200.0) + rhopts_scaled = (rhopts - (-6)) / ((-2.1) - (-6)) + validbins = (rhopts_scaled >= 0) & (rhopts_scaled <= 1) + + ptpts = ptpts[validbins].copy() + rhopts = rhopts[validbins].copy() + ptpts_scaled = ptpts_scaled[validbins].copy() + rhopts_scaled = rhopts_scaled[validbins].copy() + + tf_MCtempl_vals = tf_MCtempl(ptpts_scaled, rhopts_scaled, nominal=True) + + df = pd.DataFrame([]) # noqa: PD901 + df["rho"] = rhopts.reshape(-1) + df["pt"] = ptpts.reshape(-1) + df["MCTF"] = tf_MCtempl_vals.reshape(-1) + + fig, ax = plt.subplots() + h = ax.hist2d(x=df["rho"], y=df["pt"], weights=df["MCTF"], bins=(rhos, pts)) + plt.xlabel(r"$\rho = 2\ln(m_{SD}/p_T)$") + plt.ylabel("$p_{T}$ [GeV]") + cb = fig.colorbar(h[3], ax=ax) + cb.set_label("Ratio (Pass/Fail)") + + # Save Plot 2 + outname_rho = outdir / f"MCTF_rhopt_{name}.png" + fig.savefig(outname_rho, bbox_inches="tight") + plt.close() + + print(f"Saved MCTF debug plots to {outdir}") + + +def badtemp_ma(hvalues, mask=None): + # Need minimum size & more than 1 non-zero bins + tot = np.sum(hvalues[mask]) + count_nonzeros = np.sum(hvalues[mask] > 0) + return bool(tot < eps or count_nonzeros < 2) + + +def get_template(year, tag, sName, region, ptbin, cat, obs, syst): + """ + Read msd template from root file + """ + ### DIFF: Updated naming convention to match 'make_zg_hists.py' output. + ### Original script had complex logic for 'ggf_', 'vbf_', etc. + ### We now use a standardized format: {cat}_{region}_pt{bin}_{process}_{syst} + + f = ROOT.TFile.Open(f"parT_results/{tag}/{year}/signalregion.root") + + reg_clean = region.rstrip("_") + name = f"{cat}_{reg_clean}_pt{ptbin}_{sName}_{syst}" + + h = f.Get(name) + + ### DIFF: Added safety check. + ### Original script crashes if histogram is missing. This returns a dummy empty hist. + if not h: + return ( + np.zeros(len(obs.binning) - 1), + obs.binning, + obs.name, + np.zeros(len(obs.binning) - 1), + ) + + sumw = [] + sumw2 = [] + + for i in range(1, h.GetNbinsX() + 1): + if h.GetBinContent(i) < 0: + sumw += [0] + sumw2 += [0] + else: + sumw += [h.GetBinContent(i)] + sumw2 += [h.GetBinError(i) * h.GetBinError(i)] + + return (np.array(sumw), obs.binning, obs.name, np.array(sumw2)) + + +def zgamma_rhalphabet(args): + """ + Create the data cards for Z-Gamma! + """ + + year = args.year + tag = args.tag + + print("Running Z-Gamma Card Maker for " + year) + + datacard_dir = Path(f"results/{tag}/{year}/datacards/") + initvals_dir = Path(f"results/{tag}/{year}/initial_vals/") + + if not datacard_dir.exists(): + datacard_dir.mkdir(parents=True, exist_ok=True) + + if not initvals_dir.exists(): + initvals_dir.mkdir(parents=True, exist_ok=True) + + ### DIFF: Loading 'setup_zgamma.json' instead of 'setup.json' + ### This ensures we use the low-pT binning (200 GeV) instead of Higgs binning. + with Path("setup_zgamma.json").open() as f: + setup = json.load(f) + cats_cfg = setup["categories"] + + total_model_bins = [] + + sys_lumi_uncor = rl.NuisanceParameter(f"CMS_lumi_13TeV_{year[:4]}", "lnN") + + validbins = {} + + msd_cfg = setup["observable"] + msdbins = np.linspace(msd_cfg["min"], msd_cfg["max"], msd_cfg["nbins"] + 1) + msd = rl.Observable(msd_cfg["name"], msdbins) + + ### DIFF: Only running the Z-Gamma Control Region category ('zgcr') + ### Original ran 'ggf', 'vh', 'vbf'. 
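+    ### Note: the templates read back via get_template() follow the naming
+    ### scheme written by make_zg_hists.py, {cat}_{region}_pt{bin}_{process}_{syst},
+    ### e.g. "zgcr_pass_bb_pt1_Zgammabb_nominal" (illustrative key).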
+    cats = ["zgcr"]
+
+    # --- QCD ESTIMATION (TF) ---
+    tf_params = {}
+    for cat in cats:
+
+        ptbins = np.array(cats_cfg[cat]["bins"])
+        npt = len(ptbins) - 1
+
+        # Derive 2D array
+        ptpts, msdpts = np.meshgrid(
+            ptbins[:-1] + 0.3 * np.diff(ptbins),
+            msdbins[:-1] + 0.5 * np.diff(msdbins),
+            indexing="ij",
+        )
+        rhopts = 2 * np.log(msdpts / ptpts)
+
+        ### DIFF: Changed pT scaling geometry.
+        ### Original scaled from 450-1200 (Higgs). We scale 200-1200 (Z-Boson).
+        ptscaled = (ptpts - 200.0) / (1200.0 - 200.0)
+        rhoscaled = (rhopts - (-6.0)) / ((-1.0) - (-6.0))
+
+        validbins[cat] = (rhoscaled >= 0.0) & (rhoscaled <= 1.0)
+        rhoscaled[~validbins[cat]] = 1
+
+        tf_params[cat] = {}
+        fitfailed_qcd = {}
+
+        # We model pass_bb vs fail (the pass_cc region is disabled for now)
+        for reg in ["bb"]:
+            fitfailed_qcd[reg] = 0
+
+            # Simple retry loop
+            while fitfailed_qcd[reg] < 5:
+
+                qcdmodel = rl.Model(f"qcdmodel_{cat}_{reg}")
+                qcdpass, qcdfail = 0.0, 0.0
+
+                for ptbin in range(npt):
+                    failCh = rl.Channel(f"ptbin{ptbin}{cat}fail{year}{reg}")
+                    passCh = rl.Channel(f"ptbin{ptbin}{cat}pass{year}{reg}")
+                    qcdmodel.addChannel(failCh)
+                    qcdmodel.addChannel(passCh)
+
+                    # Using GJets (photon+jets) MC for the Transfer Factor
+                    failTempl = get_template(
+                        year,
+                        tag,
+                        "GJets",
+                        "fail_",
+                        ptbin + 1,
+                        cat,
+                        obs=msd,
+                        syst="nominal",
+                    )
+                    passTempl = get_template(
+                        year,
+                        tag,
+                        "GJets",
+                        f"pass_{reg}_",
+                        ptbin + 1,
+                        cat,
+                        obs=msd,
+                        syst="nominal",
+                    )
+
+                    failCh.setObservation(failTempl, read_sumw2=True)
+                    passCh.setObservation(passTempl, read_sumw2=True)
+
+                    qcdfail += failCh.getObservation()[0].sum()
+                    qcdpass += passCh.getObservation()[0].sum()
+
+                qcdeff = qcdpass / qcdfail
+                print(f"Inclusive P/F ({reg}) from Monte Carlo = {qcdeff}")
+
+                # INITIAL VALUES
+                initF = (
+                    Path(f"results/{tag}/{year}/initial_vals") / f"initial_vals_{cat}_{reg}.json"
+                )
+
+                ### DIFF: Added fallback for missing initial values.
+                ### Original assumes the file exists. We default to an order-(0,2)
+                ### polynomial so the fit can bootstrap itself.
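+                ### The file is expected to hold the Bernstein coefficients as a
+                ### (pT order + 1) x (rho order + 1) array, matching what the fit
+                ### writes back below, e.g. {"initial_vals": [[1.0, 0.1, 0.1]]}.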
+ if initF.exists(): + with initF.open() as f: + initial_vals = np.array(json.load(f)["initial_vals"]) + else: + print(f"No initial vals found for {reg}, using default Order 2 poly.") + initial_vals = np.array([[1.0, 0.1, 0.1]]) + + print( + "TFpf order " + + str(initial_vals.shape[0] - 1) + + " in pT, " + + str(initial_vals.shape[1] - 1) + + " in rho" + ) + print(initial_vals) + + tf_MCtempl = rl.BasisPoly( + "tf_MCtempl_" + cat + reg + year, + (initial_vals.shape[0] - 1, initial_vals.shape[1] - 1), + ["pt", "rho"], + basis="Bernstein", + init_params=initial_vals, + limits=(0, 10), + coefficient_transform=None, + ) + + tf_MCtempl_params = qcdeff * tf_MCtempl(ptscaled, rhoscaled) + + for ptbin in range(npt): + failCh = qcdmodel[f"ptbin{ptbin}{cat}fail{year}{reg}"] + passCh = qcdmodel[f"ptbin{ptbin}{cat}pass{year}{reg}"] + failObs = failCh.getObservation()[0] + + qcdparams = np.array( + [ + rl.IndependentParameter(f"qcdparam_ptbin{ptbin}{cat}{year}{reg}_{i}", 0) + for i in range(msd.nbins) + ] + ) + sigmascale = 10.0 + scaledparams = ( + failObs * (1 + sigmascale / np.maximum(1.0, np.sqrt(failObs))) ** qcdparams + ) + + fail_qcd = rl.ParametericSample( + f"ptbin{ptbin}{cat}fail{year}{reg}_qcd", + rl.Sample.BACKGROUND, + msd, + scaledparams, + ) + failCh.addSample(fail_qcd) + pass_qcd = rl.TransferFactorSample( + f"ptbin{ptbin}{cat}pass{year}{reg}_qcd", + rl.Sample.BACKGROUND, + tf_MCtempl_params[ptbin, :], + fail_qcd, + ) + passCh.addSample(pass_qcd) + + failCh.mask = validbins[cat][ptbin] + passCh.mask = validbins[cat][ptbin] + + # Run the Fit + qcdfit_ws = ROOT.RooWorkspace("w") + simpdf, obs = qcdmodel.renderRoofit(qcdfit_ws) + qcdfit = simpdf.fitTo( + obs, + ROOT.RooFit.Extended(True), + ROOT.RooFit.SumW2Error(True), + ROOT.RooFit.Strategy(2), + ROOT.RooFit.Save(), + ROOT.RooFit.Minimizer("Minuit2", "migrad"), + ROOT.RooFit.PrintLevel(-1), + ) + qcdfit_ws.add(qcdfit) + qcdfit_ws.writeToFile( + str(datacard_dir / f"testModel_qcdfit_{cat}_{reg}_{year}.root") + ) + + # Check status + if qcdfit.status() != 0: + fitfailed_qcd[reg] += 1 + print(f"Fit failed for {reg}, retrying...") + else: + # Save results if successful + print(f"HERE Fit successful for {reg}") + allparams = dict(zip(qcdfit.nameArray(), qcdfit.valueArray())) + pvalues = [] + for _, p in enumerate(tf_MCtempl.parameters.reshape(-1)): + p.value = allparams[p.name] + pvalues += [p.value] + new_values = np.array(pvalues).reshape(tf_MCtempl.parameters.shape) + with initF.open("w") as outfile: + json.dump({"initial_vals": new_values.tolist()}, outfile) + break + + print("Fitted GJets for category " + cat + " region " + reg) + + # --- NEW: Call Plotting Function --- + plot_mctf(tf_MCtempl, msdbins, f"{cat}_{reg}", year, tag) + # ----------------------------------- + # Decorrelate Parameters + param_names = [p.name for p in tf_MCtempl.parameters.reshape(-1)] + decoVector = rl.DecorrelatedNuisanceVector.fromRooFitResult( + tf_MCtempl.name + "_deco", qcdfit, param_names + ) + tf_MCtempl.parameters = decoVector.correlated_params.reshape( + tf_MCtempl.parameters.shape + ) + + # Data Residual (Blinded for now / identity) + tf_dataResidual = rl.BasisPoly( + "tf_dataResidual_" + year + cat + reg, + (0, 0), + ["pt", "rho"], + basis="Bernstein", + init_params=np.array( + [[1.0]] + ), # tried changing this and it doesn't change the outcome + limits=(0, 20), + coefficient_transform=None, + ) + + tf_params[cat][reg] = ( + qcdeff * tf_MCtempl(ptscaled, rhoscaled) * tf_dataResidual(ptscaled, rhoscaled) + ) + + # --- BUILD ACTUAL MODEL --- + model = 
rl.Model("zgammaModel_" + year)
+
+    ### DIFF: Changed Samples and Signal Definition.
+    ### Original used ggF, VBF, WH, ZH.
+    ### We use Zgammabb as Signal, and Zgamma/GJets/TTGamma as Backgrounds.
+    samps = ["Zgammabb", "Zgamma", "Wjets", "Zjets", "GJets", "TTGamma"]
+    sigs = ["Zgammabb"]
+
+    for cat in cats:
+        ptbins = np.array(cats_cfg[cat]["bins"])
+        npt = len(ptbins) - 1
+
+        for ptbin in range(npt):
+            for region in ["pass_bb_", "fail_"]:
+
+                ch_name = f"ptbin{ptbin}{cat}{region.replace('_', '')}{year}"
+                total_model_bins.append(ch_name)
+
+                ch = rl.Channel(ch_name)
+                model.addChannel(ch)
+
+                # Add MC Samples
+                for sName in samps:
+
+                    # Skip GJets in the main loop (this QCD-like background
+                    # is added data-driven below)
+                    if sName == "GJets":
+                        continue
+
+                    templ = get_template(
+                        year,
+                        tag,
+                        sName,
+                        region,
+                        ptbin + 1,
+                        cat,
+                        obs=msd,
+                        syst="nominal",
+                    )
+                    nominal = templ[0]
+
+                    # Skip empty samples
+                    if badtemp_ma(nominal):
+                        # print("Sample {} is too small, skipping".format(ch.name + '_' + sName))
+                        continue
+
+                    stype = rl.Sample.SIGNAL if sName in sigs else rl.Sample.BACKGROUND
+
+                    sample = rl.TemplateSample(ch.name + "_" + sName, stype, templ)
+
+                    # Simple Lumi Uncertainty
+                    sample.setParamEffect(
+                        sys_lumi_uncor,
+                        lumi_err[year[:4]] ** (LUMI[year[:4]] / LUMI["2022-2023"]),
+                    )
+
+                    if do_systematics:
+                        sample.autoMCStats(lnN=True)
+
+                    ch.addSample(sample)
+
+                ### DIFF: Using 'EGammadata' for the observation, since the
+                ### photon triggers select the EGamma datasets. This replaces
+                ### the 'Jetdata' observation used in the original script.
+                data_obs = get_template(
+                    year, tag, "EGammadata", region, ptbin + 1, cat, obs=msd, syst="nominal"
+                )
+                ch.setObservation(data_obs[0:3])
+
+    # --- ADD DATA-DRIVEN QCD ---
+    for cat in cats:
+        ptbins = np.array(cats_cfg[cat]["bins"])
+        npt = len(ptbins) - 1
+
+        for ptbin in range(npt):
+
+            failCh = model[f"ptbin{ptbin}{cat}fail{year}"]
+            passChbb = model[f"ptbin{ptbin}{cat}passbb{year}"]
+            # passChcc = model[f"ptbin{ptbin}{cat}passcc{year}"]
+
+            qcdparams = np.array(
+                [
+                    rl.IndependentParameter(f"qcdparam_ptbin{ptbin}{cat}{year}_{i}", 0)
+                    for i in range(msd.nbins)
+                ]
+            )
+            initial_qcd = failCh.getObservation().astype(float)
+
+            # Subtract other backgrounds from Data in Fail Region
+            for sample in failCh:
+                initial_qcd -= sample.getExpectation(nominal=True)
+
+            if np.any(initial_qcd < 0.0):
+                initial_qcd[np.where(initial_qcd < 0)] = 0
+                print("Warning: negative QCD estimate in some bins")
+
+            sigmascale = 20
+            scaledparams = (
+                initial_qcd * (1 + sigmascale / np.maximum(1.0, np.sqrt(initial_qcd))) ** qcdparams
+            )
+
+            fail_qcd = rl.ParametericSample(
+                name=f"ptbin{ptbin}{cat}fail{year}_qcd",
+                sampletype=rl.Sample.BACKGROUND,
+                observable=msd,
+                params=scaledparams,
+            )
+            failCh.addSample(fail_qcd)
+
+            pass_qcdbb = rl.TransferFactorSample(
+                name=f"ptbin{ptbin}{cat}passbb{year}_qcd",
+                sampletype=rl.Sample.BACKGROUND,
+                transferfactor=tf_params[cat]["bb"][ptbin, :],
+                dependentsample=fail_qcd,
+                observable=msd,
+            )
+            passChbb.addSample(pass_qcdbb)
+
+            # pass_qcdcc = rl.TransferFactorSample(
+            #     name=f"ptbin{ptbin}{cat}passcc{year}_qcd",
+            #     sampletype=rl.Sample.BACKGROUND,
+            #     transferfactor=tf_params[cat]["cc"][ptbin, :],
+            #     dependentsample=fail_qcd,
+            #     observable=msd,
+            # )
+            # passChcc.addSample(pass_qcdcc)
+
+            mask = validbins[cat][ptbin]
+            failCh.mask = mask
+            # passChcc.mask = mask
+            passChbb.mask = mask
+
+    # --- SAVE OUTPUT ---
+    with (datacard_dir / f"zgammaModel_{year}.pkl").open("wb") as fout:
+        pickle.dump(model, fout)
+
+    modeldir = datacard_dir / f"zgammaModel_{year}"
+    
model.renderCombine(modeldir) + + out_cards = "" + for card in total_model_bins: + out_cards += f"{card}={card}.txt " + + with Path(f"{modeldir}/{card}.txt").open("a") as f: + f.write("\nqcd_norm rateParam * qcd 1.0 [0,20]\n") + + ### DIFF: Physics Model Configuration + ### We map the 'Zgammabb' sample to the signal strength 'r'. + t2w_cfg = "-P HiggsAnalysis.CombinedLimit.PhysicsModel:multiSignalModel --PO verbose --PO 'map=.*/Zgammabb:r[1,-100,100]'" + + build_sh = modeldir / "build.sh" + with build_sh.open("w") as f: + f.write(f"combineCards.py {out_cards} > model_combined.txt\n") + f.write(f"text2workspace.py {t2w_cfg} model_combined.txt -o workspace.root") + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("--year", help="year", type=str, required=True) + parser.add_argument("--tag", help="tag", type=str, required=True) + + args = parser.parse_args() + + zgamma_rhalphabet(args) diff --git a/fitting/make_zg_hists.py b/fitting/make_zg_hists.py new file mode 100755 index 00000000..4be34610 --- /dev/null +++ b/fitting/make_zg_hists.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path + +import hist +import numpy as np +import uproot + +from hbb import utils + + +def fill_hists(outdict, events, region, reg_cfg, obs_cfg, qq_true): + h = hist.Hist( + hist.axis.Regular( + obs_cfg["nbins"], + obs_cfg["min"], + obs_cfg["max"], + name=obs_cfg["name"], + label=obs_cfg["name"], + ) + ) + + bins_list = reg_cfg["bins"] + bin_pname = reg_cfg["bin_pname"] + str_bin_br = reg_cfg["branch_name"] + + for _process_name, data in events.items(): + + weight_val = data["finalWeight"].astype(float) + s = "nominal" + + bin_br = data[str_bin_br] + obs_br = data[obs_cfg["branch_name"]] + + # Extracting variables + Txcc = data["FatJet0_ParTPXccVsQCD"] + Txbb = data["FatJet0_ParTPXbbVsQCD"] + + msd = data["FatJet0_msd"] + pt = data["FatJet0_pt"] + photon_pt = data["Photon0_pt"] + + # --- FIX: Calculate dPhi manually (handling wrapping) --- + dphi_raw = np.abs(data["Photon0_phi"] - data["FatJet0_phi"]) + dphi = np.where(dphi_raw > np.pi, 2 * np.pi - dphi_raw, dphi_raw) + + # --- FIX: Extract MET (Robust) --- + # MET is loaded as a dictionary (e.g. {'pt':.., 'phi':..}), so we extract 'pt' + if data["MET"].dtype == "object": + met_pt = data["MET"].apply(lambda x: x["pt"]) + else: + met_pt = data["MET"] + + # --- 1. TRIGGER LOGIC --- + trigger_mask = True + is_data = "GenFlavor" not in data.columns + has_trigger_cols = ("Photon200" in data.columns) and ( + "Photon110EB_TightID_TightIso" in data.columns + ) + + if is_data and has_trigger_cols: + trigger_mask = data["Photon200"] | data["Photon110EB_TightID_TightIso"] + elif is_data and not has_trigger_cols: + print(f"WARNING: Trigger missing for {_process_name}!") + + # --- 2. DEFINE SELECTION --- + # Note: Your reference script cuts pt > 250, even though plot title said 200. + # This matches the reference script logic. + basic_cuts = ( + (photon_pt > 120) & (msd > 20) & (msd < 200) & (pt > 250) & (pt < 1200) & trigger_mask + ) + + # --- FIX: Uncomment and apply topological cuts --- + topo_cuts = (dphi > 2.2) & (met_pt < 50) + + # Combined Pre-Selection + pre_selection = basic_cuts & topo_cuts + + # --- 3. 
CATEGORIZATION --- + genf = data["GenFlavor"] if "GenFlavor" in data.columns else 0 + + # --- FIX: Change WP to 0.95 to match reference --- + WP = 0.95 + + # --- FIX: Removed (Txbb > Txcc) to match strict reference reproduction --- + selection_dict = { + "pass_bb": pre_selection & (Txbb > WP), + "pass_cc": pre_selection + & (Txcc > WP), # usually orthogonalized but reference didn't imply it + "fail": pre_selection & (Txbb <= WP), + } + + cut_bb = genf == 3 + cut_qq = genf != 3 + + for i in range(len(bins_list) - 1): + bin_cut = (bin_br > bins_list[i]) & (bin_br < bins_list[i + 1]) + + for category, selection in selection_dict.items(): + if qq_true: + # Logic for splitting backgrounds into light/bb + name = f"{region}_{category}_{bin_pname}{i+1}_{_process_name}_{s}" + h.view()[:] = 0 + h.fill( + obs_br[selection & bin_cut & cut_qq], + weight=weight_val[selection & bin_cut & cut_qq], + ) + if name not in outdict: + outdict[name] = h.copy() + else: + outdict[name] += h.copy() + + name = f"{region}_{category}_{bin_pname}{i+1}_{_process_name}bb_{s}" + h.view()[:] = 0 + h.fill( + obs_br[selection & bin_cut & cut_bb], + weight=weight_val[selection & bin_cut & cut_bb], + ) + if name not in outdict: + outdict[name] = h.copy() + else: + outdict[name] += h.copy() + else: + # Logic for Data/Signal + name = f"{region}_{category}_{bin_pname}{i+1}_{_process_name}_{s}" + h.view()[:] = 0 + h.fill(obs_br[selection & bin_cut], weight=weight_val[selection & bin_cut]) + if name not in outdict: + outdict[name] = h.copy() + else: + outdict[name] += h.copy() + return outdict + + +def main(args): + year = args.year + tag = args.tag + + # --- FIX 2: Hardcoded path to YOUR directory --- + path_to_dir = f"/eos/uscms/store/group/lpchbbrun3/gmachado/{tag}" + + # --- FIX 3: Hardcoded Z-Gamma Sample List --- + # We include TTGamma and Zgamma here so they get split into bb/light + samples_qq = ["Wjets", "Zjets", "Zgamma", "TTGamma"] + + columns = [ + "weight", + "FatJet0_pt", # Kinematic cut + "FatJet0_msd", # Observable/Kinematic cut + "FatJet0_ParTPXbbVsQCD", # ParticleNet b-tagger + "FatJet0_ParTPXccVsQCD", # ParticleNet c-tagger + "Photon0_pt", # New Kinematic cut + "FatJet0_phi", + "Photon0_phi", + "MET", + "GenFlavor", # MC Truth matching + "Photon200", # Trigger OR logic + "Photon110EB_TightID_TightIso", # Trigger OR logic + ] + + data_dirs = [Path(path_to_dir) / year] + + out_path = f"parT_results/{tag}/{year}" + output_file = f"{out_path}/testsignalregion.root" + + if not Path(out_path).exists(): + Path(out_path).mkdir(parents=True) + + if Path(output_file).is_file(): + Path(output_file).unlink() + fout = uproot.create(output_file) + + # --- FIX 4: Hardcoded to load setup_zgamma.json --- + os.popen(f"cp setup_zgamma.json {out_path}") + with Path("setup_zgamma.json").open() as f: + setup = json.load(f) + cats = setup["categories"] + obs_cfg = setup["observable"] + + with Path("pmap_run3.json").open() as f: + pmap = json.load(f) + + # --- FIX 5: Hardcoded Filters for Low pT --- + # This ensures we don't cut out the Z-Gamma events + filters = [ + ("FatJet0_pt", ">", 200), + ("FatJet0_pt", "<", 2000), + ] + + if obs_cfg["branch_name"] not in columns: + columns.append(obs_cfg["branch_name"]) + + out_hists = {} + for process, datasets in pmap.items(): + for dataset in datasets: + for reg, cfg in cats.items(): + for data_dir in data_dirs: + + events = utils.load_samples( + data_dir, + {process: [dataset]}, + columns=columns, + region=cfg["name"], + filters=filters, + ) + + if not events: + continue + + 
fill_hists(out_hists, events, reg, cfg, obs_cfg, (process in samples_qq)) + + for name, h in out_hists.items(): + fout[name] = h + + print(f"Histograms saved to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Make histograms for a given year.") + parser.add_argument( + "--year", + help="year", + type=str, + required=True, + choices=["2022", "2022EE", "2023", "2023BPix"], + ) + parser.add_argument("--tag", help="tag", type=str, required=True) + args = parser.parse_args() + + main(args) diff --git a/fitting/pmap_run3.json b/fitting/pmap_run3.json index 9233f0ee..46dd5c4f 100644 --- a/fitting/pmap_run3.json +++ b/fitting/pmap_run3.json @@ -37,6 +37,13 @@ "EGamma_Run2023Cv4", "EGamma_Run2023D" ], + "GJets": [ + "GJ_PTG-100to200", + "GJ_PTG-200to400", + "GJ_PTG-400to600", + "GJ_PTG-600", + "GJ_PTG-20to100" + ], "ggF": [ "GluGluHto2B_M-125", "GluGluHto2B_PT-200_M-125", @@ -202,4 +209,4 @@ "ParkingVBF_Run2023Cv3", "ParkingVBF_Run2023Cv4" ] -} \ No newline at end of file +} diff --git a/fitting/run_cards_all.sh b/fitting/run_cards_all.sh new file mode 100644 index 00000000..d4ace26b --- /dev/null +++ b/fitting/run_cards_all.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +TAG="25Nov11_v14_private" + +for YEAR in 2022 2022EE 2023 2023BPix +do + echo "Making Card for $YEAR..." + python3 make_zg_cards.py --year $YEAR --tag $TAG +done diff --git a/fitting/run_hists_all.sh b/fitting/run_hists_all.sh new file mode 100644 index 00000000..749d8a37 --- /dev/null +++ b/fitting/run_hists_all.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +TAG="25Nov11_v14_private" + +# Loop over the 4 eras +for YEAR in 2022 2022EE 2023 2023BPix +do + echo "Processing Histograms for $YEAR..." + python3 make_zg_hists.py --year $YEAR --tag $TAG +done diff --git a/fitting/setup.json b/fitting/setup.json index 9ad87194..e7189a54 100644 --- a/fitting/setup.json +++ b/fitting/setup.json @@ -1,7 +1,7 @@ { "observable" : { "name" : "msd", - "min" : 40, + "min" : 40, "max" : 201, "nbins" : 23, "title" : "m_{sd} [GeV]", @@ -40,6 +40,14 @@ "bin_pname" : "pt", "branch_name" : "FatJet0_pt", "bin_title" : "p_{T}" + }, + "zgcr": { + "name" : "control-zgamma", + "bins" : [200,1200], + "bin_name" : "pt1", + "bin_pname" : "pt", + "branch_name" : "FatJet0_pt", + "bin_title" : "p_{T}" } } -} \ No newline at end of file +}
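
A sketch of the intended per-era workflow for the new scripts, using the TAG
from run_hists_all.sh / run_cards_all.sh (note: make_zg_cards.py reads
parT_results/<tag>/<year>/signalregion.root, while make_zg_hists.py writes
testsignalregion.root, so the file may need renaming in between):

    python3 make_zg_hists.py --year 2022 --tag 25Nov11_v14_private
    python3 make_zg_cards.py --year 2022 --tag 25Nov11_v14_private
    cd results/25Nov11_v14_private/2022/datacards/zgammaModel_2022
    bash build.sh   # combineCards.py + text2workspace.py -> workspace.root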