Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions tagger/data/tau_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import os, gc, json, glob, shutil

# Third party
import numpy as np
import awkward as ak
import uproot, yaml

from sklearn.utils import shuffle
from .tools import _save_chunk_metadata

gc.set_threshold(0)

tau_inputs = [
"pt",
"deta",
"dphi",
"isPhoton",
"isElectronPlus",
"isMuonPlus",
"isNeutralHadron",
"isChargedHadronPlus"
]

def _save_tau_dataset_metadata(outdir, class_labels):

dataset_metadata_file = os.path.join(outdir, 'variables.json')

metadata = {"outputs": class_labels,
"inputs": tau_inputs,
"extras": []}

with open(dataset_metadata_file, "w") as f: json.dump(metadata, f, indent=4)

return


def _process_tau_chunk(filtered_data, chunk, outdir):
    """
    Persist one processed chunk of the training split to a ROOT file and
    record its bookkeeping metadata.

    Parameters:
        filtered_data (dict): Per-jet arrays for this chunk; must contain
            a 'class_label' entry (used for the event count).
        chunk (int): Chunk index, used to name the output file.
        outdir (str): Directory receiving both the ROOT file and metadata.json.
    """

    # Write all arrays into a single "data" tree
    chunk_path = os.path.join(outdir, f'data_chunk_{chunk}.root')
    with uproot.recreate(chunk_path) as rootfile:
        rootfile["data"] = filtered_data
    print(f"Saved chunk {chunk} to {chunk_path}")

    # Append (chunk id, entry count, file path) to the shared metadata log
    n_events = len(filtered_data['class_label'])
    _save_chunk_metadata(os.path.join(outdir, "metadata.json"), chunk, n_events, chunk_path)

    # Force a collection pass to keep memory bounded between chunks
    gc.collect()

    return

def _process_tau(data):

dR_match = 0.2
gen_pt_cut = 5.0

n_parts = 10
n_feats = 8

class_labels = {
"light": 0,
"taus" : 1,
}

out = {}
# Initialize the new array in data for numeric labels with default -1 for unmatched entries
#data['class_label'] = ak.full_like(data['gendr1'], 0)
out['class_label'] = np.zeros(len(data['gendr1']), dtype=np.int)
out['jet_pt_phys'] = np.asarray(data['pt'])

#tau_match = (np.abs(data['gendr1']) < dR_match) & (data['genpt1'] > gen_pt_cut)
tau_match = (np.abs(np.asarray(data['gendr1'])) < dR_match) & (np.asarray(data['genpt1']) > gen_pt_cut)

# Assign class label
out['class_label'][tau_match] = 1

#Set pt regression target
gen_pt = np.nan_to_num(np.asarray(data["genpt1"]),nan=0,posinf=0,neginf=0)
tau_pt_ratio = np.nan_to_num(gen_pt/np.asarray(data["pt"]), nan=0, posinf=0, neginf=0)
tau_pt_ratio = np.clip(tau_pt_ratio, 0.3, 2)

out['target_pt'] = np.ones(len(out['class_label']))
out['target_pt_phys'] = np.asarray(ak.copy(data['pt']))

out['target_pt'][tau_match] = tau_pt_ratio[tau_match]
out['target_pt_phys'][tau_match] = gen_pt[tau_match]

out['nn_inputs'] = np.asarray(data['m_inputs']).reshape(-1, n_parts, n_feats)

#shuffle order of training inputs
out['class_label'], out['target_pt'], out['target_pt_phys'], out['nn_inputs'] = shuffle(
out['class_label'], out['target_pt'], out['target_pt_phys'], out['nn_inputs'], random_state = 42)

# Sanity check for data consistency
# TODO

return out, class_labels

def make_tau_data(infile='/eos/cms/store/cmst3/group/l1tr/sewuchte/l1teg/fp_ntuples_v131Xv9/baselineTRK_4param_221124/All.root',
                  outdir='training_data/',
                  n_parts=10,
                  ratio=1.0,
                  step_size="100MB",
                  tree="ntuplePupSingle/tree",
                  **kwargs):
    """
    Process the tau-tagger dataset in chunks from the input ntuple file,
    writing one ROOT file per chunk plus dataset/chunk metadata to outdir.

    Parameters:
        infile (str): The input ROOT ntuple file path.
        outdir (str): The output directory (recreated after user confirmation
            if it already exists).
        n_parts (int): Number of constituent particles per jet.
            NOTE(review): currently unused here — _process_tau hardcodes 10;
            confirm intent.
        ratio (float): Fraction (0-1] of input entries to process before
            stopping.
        step_size (str): Step size for uproot iteration (e.g. "100MB").
        tree (str): Name of the TTree inside the input file.
        **kwargs: Ignored; accepted for call compatibility with make_data.
    """

    #Check if output dir already exists; ask before deleting it (interactive)
    if os.path.exists(outdir):
        confirm = input(f"The directory '{outdir}' already exists. Do you want to delete it and continue? [y/n]: ")
        if confirm.lower() == 'y':
            shutil.rmtree(outdir)
            print(f"Deleted existing directory: {outdir}")
        else:
            print("Exiting without making changes.")
            return

    #Create output training dataset directory
    os.makedirs(outdir, exist_ok=True)
    print("Output directory:", outdir)

    #Total entries, used for progress reporting and the ratio cutoff below
    num_entries = uproot.open(infile)[tree].num_entries
    num_entries_done = 0
    chunk = 0

    # Jet acceptance cuts applied to every chunk before processing
    pt_cut = 15    # minimum jet pT
    eta_cut = 2.4  # maximum |eta|

    for data in uproot.iterate(infile+":"+tree, how="zip", step_size=step_size, max_workers=8):
        jet_cut = (data['pt'] > pt_cut) & (np.abs(data['eta']) < eta_cut)
        data = data[jet_cut]

        #Add additional response variables
        # _add_response_vars(data)
        #Build labelled/shuffled training arrays for this chunk
        data_split, class_labels = _process_tau(data)

        #If first chunk then save metadata of the dataset
        if chunk == 0: _save_tau_dataset_metadata(outdir, class_labels)

        #Write this chunk's ROOT file and log its metadata
        _process_tau_chunk(data_split, chunk=chunk, outdir=outdir)

        #Advance chunk index used for output file naming
        chunk += 1
        num_entries_done += len(data)
        print(f"Processed {num_entries_done}/{num_entries} entries | {np.round(num_entries_done / num_entries * 100, 1)}%")
        if num_entries_done / num_entries >= ratio: break
48 changes: 31 additions & 17 deletions tagger/plot/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def get_response(truth_pt, reco_pt, pt_ratio):

return uncorrected_response, regressed_response, uncorrected_errors, regressed_errors

def response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir):
def response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training=False):
save_dir = os.path.join(plot_dir, 'response')
os.makedirs(save_dir, exist_ok=True)

Expand Down Expand Up @@ -326,11 +326,18 @@ def plot_response(uncorrected_response, regressed_response, uncorrected_errors,
plot_response(uncorrected_response, regressed_response, uncorrected_errors, regressed_errors, flavor=flavor, plot_name=f"{flavor}_response")

#Taus, jets, leptons rms
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
if(not tau_training):
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
else:
rms_selection = {
'taus': [class_labels['taus']],
'jets': [class_labels['light']],
}


for key in rms_selection.keys():
selection = sum(y_test[:, idx] for idx in rms_selection[key]) > 0
Expand Down Expand Up @@ -385,7 +392,7 @@ def get_rms(truth_pt, reco_pt, pt_ratio):

return rms_uncorr, rms_reg, rms_uncorr_err, rms_reg_err

def rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir):
def rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training):

save_dir = os.path.join(plot_dir, 'residual_rms')
os.makedirs(save_dir, exist_ok=True)
Expand Down Expand Up @@ -425,11 +432,17 @@ def plot_rms(uncorrected_rms, regressed_rms, uncorrected_rms_err, regressed_rms_
plot_rms(uncorrected_rms, regressed_rms, uncorrected_rms_err, regressed_rms_err, flavor=flavor, plot_name=f"{flavor}_rms")

#Taus, jets, leptons rms
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
if(not tau_training):
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
else:
rms_selection = {
'taus': [class_labels['taus']],
'jets': [class_labels['light']],
}

for key in rms_selection.keys():
selection = sum(y_test[:, idx] for idx in rms_selection[key]) > 0
Expand Down Expand Up @@ -501,7 +514,7 @@ def plot_shaply(model, X_test, class_labels, input_vars, plot_dir):
plt.savefig(plot_dir+"/shap_summary_reg.png",bbox_inches='tight')

# <<<<<<<<<<<<<<<<< end of plotting functions, call basic to plot all of them
def basic(model_dir):
def basic(model_dir, tau_training):
"""
Plot the basic ROCs for different classes. Does not reflect L1 rate
Returns a dictionary of ROCs for each class
Expand Down Expand Up @@ -539,8 +552,9 @@ def basic(model_dir):
class_pair = (i,j)
ROC_binary(y_pred, y_test, class_labels, plot_dir, class_pair)

#ROC for taus versus jets and taus versus leptons
ROC_taus(y_pred, y_test, class_labels, plot_dir)
if(not tau_training):
#ROC for taus versus jets and taus versus leptons
ROC_taus(y_pred, y_test, class_labels, plot_dir)

# Confusion matrix
confusion(y_pred, y_test, class_labels, plot_dir)
Expand All @@ -552,10 +566,10 @@ def basic(model_dir):
plot_input_vars(X_test, input_vars, plot_dir)

#Plot inclusive response and individual flavor
response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir)
response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training=tau_training)

#Plot the rms of the residuals vs pt
rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir)
rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training=tau_training)

#Plot the shaply feature importance
plot_shaply(model, X_test, class_labels, input_vars, plot_dir)
Expand Down
11 changes: 9 additions & 2 deletions tagger/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#Import from other modules
from tagger.data.tools import make_data, load_data, to_ML
from tagger.data.tau_tools import make_tau_data
from tagger.plot.basic import loss_history, basic
import models

Expand Down Expand Up @@ -202,6 +203,8 @@ def train(out_dir, percent, model_name):

parser = ArgumentParser()

parser.add_argument('--tau', action='store_true', help='Tau tagger version')

#Making input arguments
parser.add_argument('--make-data', action='store_true', help='Prepare the data if set.')
parser.add_argument('-i','--input', default='/eos/cms/store/cmst3/group/l1tr/sewuchte/l1teg/fp_ntuples_v131Xv9/extendedTRK_5param_221124/All200.root' , help = 'Path to input training data')
Expand All @@ -225,7 +228,11 @@ def train(out_dir, percent, model_name):

#Either make data or start the training
if args.make_data:
make_data(infile=args.input, step_size=args.step, extras=args.extras, ratio=args.ratio, tree=args.tree) #Write to training_data/, can be specified using outdir, but keeping it simple here for now
if args.tau:
make_tau_data(infile=args.input, step_size=args.step, ratio=args.ratio, tree=args.tree) #Write to training_data/, can be specified using outdir, but keeping it simple here for now

else:
make_data(infile=args.input, step_size=args.step, extras=args.extras, ratio=args.ratio, tree=args.tree) #Write to training_data/, can be specified using outdir, but keeping it simple here for now
elif args.plot_basic:
model_dir = args.output
f = open("mlflow_run_id.txt", "r")
Expand All @@ -237,7 +244,7 @@ def train(out_dir, percent, model_name):
):

#All the basic plots!
results = basic(model_dir)
results = basic(model_dir, args.tau)
for class_label in results.keys():
mlflow.log_metric(class_label + ' ROC AUC',results[class_label])

Expand Down