Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions tagger/data/tau_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import os, gc, json, glob, shutil

# Third party
import numpy as np
import awkward as ak
import uproot, yaml

from sklearn.utils import shuffle
from .tools import _save_chunk_metadata

gc.set_threshold(0)

tau_inputs = [
"pt",
"deta",
"dphi",
"isPhoton",
"isElectronPlus",
"isMuonPlus",
"isNeutralHadron",
"isChargedHadronPlus"
]

def _save_tau_dataset_metadata(outdir, class_labels):

dataset_metadata_file = os.path.join(outdir, 'variables.json')

metadata = {"outputs": class_labels,
"inputs": tau_inputs,
"extras": []}

with open(dataset_metadata_file, "w") as f: json.dump(metadata, f, indent=4)

return


def _process_tau_chunk(filtered_data, chunk, outdir):
    """
    Persist one processed chunk of the training split to a ROOT file and
    record its bookkeeping metadata.

    Parameters:
        filtered_data (dict): Per-jet arrays for this chunk; must contain
            a 'class_label' entry (used for the event count).
        chunk (int): Chunk index, used to name the output file.
        outdir (str): Directory receiving both the ROOT file and metadata.json.
    """

    # Write all arrays into a single "data" tree
    chunk_path = os.path.join(outdir, f'data_chunk_{chunk}.root')
    with uproot.recreate(chunk_path) as rootfile:
        rootfile["data"] = filtered_data
    print(f"Saved chunk {chunk} to {chunk_path}")

    # Append (chunk id, entry count, file path) to the shared metadata log
    n_events = len(filtered_data['class_label'])
    _save_chunk_metadata(os.path.join(outdir, "metadata.json"), chunk, n_events, chunk_path)

    # Force a collection pass to keep memory bounded between chunks
    gc.collect()

    return

def _process_tau(data):

dR_match = 0.2
gen_pt_cut = 5.0

n_parts = 10
n_feats = 8

class_labels = {
"light": 0,
"taus" : 1,
}

out = {}
# Initialize the new array in data for numeric labels with default -1 for unmatched entries
#data['class_label'] = ak.full_like(data['gendr1'], 0)
out['class_label'] = np.zeros(len(data['gendr1']), dtype=np.int)
out['jet_pt_phys'] = np.asarray(data['pt'])

#tau_match = (np.abs(data['gendr1']) < dR_match) & (data['genpt1'] > gen_pt_cut)
tau_match = (np.abs(np.asarray(data['gendr1'])) < dR_match) & (np.asarray(data['genpt1']) > gen_pt_cut)

# Assign class label
out['class_label'][tau_match] = 1

#Set pt regression target
gen_pt = np.nan_to_num(np.asarray(data["genpt1"]),nan=0,posinf=0,neginf=0)
tau_pt_ratio = np.nan_to_num(gen_pt/np.asarray(data["pt"]), nan=0, posinf=0, neginf=0)
tau_pt_ratio = np.clip(tau_pt_ratio, 0.3, 2)

out['target_pt'] = np.ones(len(out['class_label']))
out['target_pt_phys'] = np.asarray(ak.copy(data['pt']))

out['target_pt'][tau_match] = tau_pt_ratio[tau_match]
out['target_pt_phys'][tau_match] = gen_pt[tau_match]

out['nn_inputs'] = np.asarray(data['m_inputs']).reshape(-1, n_parts, n_feats)

#shuffle order of training inputs
out['class_label'], out['target_pt'], out['target_pt_phys'], out['nn_inputs'] = shuffle(
out['class_label'], out['target_pt'], out['target_pt_phys'], out['nn_inputs'], random_state = 42)

# Sanity check for data consistency
# TODO

return out, class_labels

def make_tau_data(infile='/eos/cms/store/cmst3/group/l1tr/sewuchte/l1teg/fp_ntuples_v131Xv9/baselineTRK_4param_221124/All.root',
                  outdir='training_data/',
                  n_parts=10,
                  ratio=1.0,
                  step_size="100MB",
                  tree="ntuplePupSingle/tree",
                  **kwargs):
    """
    Process the tau-tagger dataset in chunks from the input ntuple file,
    writing one ROOT file per chunk plus dataset/chunk metadata to outdir.

    Parameters:
        infile (str): The input ROOT ntuple file path.
        outdir (str): The output directory (recreated after user confirmation
            if it already exists).
        n_parts (int): Number of constituent particles per jet.
            NOTE(review): currently unused here — _process_tau hardcodes 10;
            confirm intent.
        ratio (float): Fraction (0-1] of input entries to process before
            stopping.
        step_size (str): Step size for uproot iteration (e.g. "100MB").
        tree (str): Name of the TTree inside the input file.
        **kwargs: Ignored; accepted for call compatibility with make_data.
    """

    #Check if output dir already exists; ask before deleting it (interactive)
    if os.path.exists(outdir):
        confirm = input(f"The directory '{outdir}' already exists. Do you want to delete it and continue? [y/n]: ")
        if confirm.lower() == 'y':
            shutil.rmtree(outdir)
            print(f"Deleted existing directory: {outdir}")
        else:
            print("Exiting without making changes.")
            return

    #Create output training dataset directory
    os.makedirs(outdir, exist_ok=True)
    print("Output directory:", outdir)

    #Total entries, used for progress reporting and the ratio cutoff below
    num_entries = uproot.open(infile)[tree].num_entries
    num_entries_done = 0
    chunk = 0

    # Jet acceptance cuts applied to every chunk before processing
    pt_cut = 15    # minimum jet pT
    eta_cut = 2.4  # maximum |eta|

    for data in uproot.iterate(infile+":"+tree, how="zip", step_size=step_size, max_workers=8):
        jet_cut = (data['pt'] > pt_cut) & (np.abs(data['eta']) < eta_cut)
        data = data[jet_cut]

        #Add additional response variables
        # _add_response_vars(data)
        #Build labelled/shuffled training arrays for this chunk
        data_split, class_labels = _process_tau(data)

        #If first chunk then save metadata of the dataset
        if chunk == 0: _save_tau_dataset_metadata(outdir, class_labels)

        #Write this chunk's ROOT file and log its metadata
        _process_tau_chunk(data_split, chunk=chunk, outdir=outdir)

        #Advance chunk index used for output file naming
        chunk += 1
        num_entries_done += len(data)
        print(f"Processed {num_entries_done}/{num_entries} entries | {np.round(num_entries_done / num_entries * 100, 1)}%")
        if num_entries_done / num_entries >= ratio: break
48 changes: 31 additions & 17 deletions tagger/plot/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def get_response(truth_pt, reco_pt, pt_ratio):

return uncorrected_response, regressed_response, uncorrected_errors, regressed_errors

def response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir):
def response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training=False):
save_dir = os.path.join(plot_dir, 'response')
os.makedirs(save_dir, exist_ok=True)

Expand Down Expand Up @@ -326,11 +326,18 @@ def plot_response(uncorrected_response, regressed_response, uncorrected_errors,
plot_response(uncorrected_response, regressed_response, uncorrected_errors, regressed_errors, flavor=flavor, plot_name=f"{flavor}_response")

#Taus, jets, leptons rms
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
if(not tau_training):
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
else:
rms_selection = {
'taus': [class_labels['taus']],
'jets': [class_labels['light']],
}


for key in rms_selection.keys():
selection = sum(y_test[:, idx] for idx in rms_selection[key]) > 0
Expand Down Expand Up @@ -385,7 +392,7 @@ def get_rms(truth_pt, reco_pt, pt_ratio):

return rms_uncorr, rms_reg, rms_uncorr_err, rms_reg_err

def rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir):
def rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training):

save_dir = os.path.join(plot_dir, 'residual_rms')
os.makedirs(save_dir, exist_ok=True)
Expand Down Expand Up @@ -425,11 +432,17 @@ def plot_rms(uncorrected_rms, regressed_rms, uncorrected_rms_err, regressed_rms_
plot_rms(uncorrected_rms, regressed_rms, uncorrected_rms_err, regressed_rms_err, flavor=flavor, plot_name=f"{flavor}_rms")

#Taus, jets, leptons rms
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
if(not tau_training):
rms_selection = {
'taus': [class_labels['taup'], class_labels['taum']],
'jets': [class_labels[key] for key in ['b', 'charm', 'light', 'gluon']],
'leptons': [class_labels[key] for key in ['muon', 'electron']]
}
else:
rms_selection = {
'taus': [class_labels['taus']],
'jets': [class_labels['light']],
}

for key in rms_selection.keys():
selection = sum(y_test[:, idx] for idx in rms_selection[key]) > 0
Expand Down Expand Up @@ -501,7 +514,7 @@ def plot_shaply(model, X_test, class_labels, input_vars, plot_dir):
plt.savefig(plot_dir+"/shap_summary_reg.png",bbox_inches='tight')

# <<<<<<<<<<<<<<<<< end of plotting functions, call basic to plot all of them
def basic(model_dir):
def basic(model_dir, tau_training):
"""
Plot the basic ROCs for different classes. Does not reflect L1 rate
Returns a dictionary of ROCs for each class
Expand Down Expand Up @@ -539,8 +552,9 @@ def basic(model_dir):
class_pair = (i,j)
ROC_binary(y_pred, y_test, class_labels, plot_dir, class_pair)

#ROC for taus versus jets and taus versus leptons
ROC_taus(y_pred, y_test, class_labels, plot_dir)
if(not tau_training):
#ROC for taus versus jets and taus versus leptons
ROC_taus(y_pred, y_test, class_labels, plot_dir)

# Confusion matrix
confusion(y_pred, y_test, class_labels, plot_dir)
Expand All @@ -552,10 +566,10 @@ def basic(model_dir):
plot_input_vars(X_test, input_vars, plot_dir)

#Plot inclusive response and individual flavor
response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir)
response(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training=tau_training)

#Plot the rms of the residuals vs pt
rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir)
rms(class_labels, y_test, truth_pt_test, reco_pt_test, pt_ratio, plot_dir, tau_training=tau_training)

#Plot the shaply feature importance
plot_shaply(model, X_test, class_labels, input_vars, plot_dir)
Expand Down
11 changes: 9 additions & 2 deletions tagger/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#Import from other modules
from tagger.data.tools import make_data, load_data, to_ML
from tagger.data.tau_tools import make_tau_data
from tagger.plot.basic import loss_history, basic
import models

Expand Down Expand Up @@ -202,6 +203,8 @@ def train(out_dir, percent, model_name):

parser = ArgumentParser()

parser.add_argument('--tau', action='store_true', help='Tau tagger version')

#Making input arguments
parser.add_argument('--make-data', action='store_true', help='Prepare the data if set.')
parser.add_argument('-i','--input', default='/eos/cms/store/cmst3/group/l1tr/sewuchte/l1teg/fp_ntuples_v131Xv9/extendedTRK_5param_221124/All200.root' , help = 'Path to input training data')
Expand All @@ -225,7 +228,11 @@ def train(out_dir, percent, model_name):

#Either make data or start the training
if args.make_data:
make_data(infile=args.input, step_size=args.step, extras=args.extras, ratio=args.ratio, tree=args.tree) #Write to training_data/, can be specified using outdir, but keeping it simple here for now
if args.tau:
make_tau_data(infile=args.input, step_size=args.step, ratio=args.ratio, tree=args.tree) #Write to training_data/, can be specified using outdir, but keeping it simple here for now

else:
make_data(infile=args.input, step_size=args.step, extras=args.extras, ratio=args.ratio, tree=args.tree) #Write to training_data/, can be specified using outdir, but keeping it simple here for now
elif args.plot_basic:
model_dir = args.output
f = open("mlflow_run_id.txt", "r")
Expand All @@ -237,7 +244,7 @@ def train(out_dir, percent, model_name):
):

#All the basic plots!
results = basic(model_dir)
results = basic(model_dir, args.tau)
for class_label in results.keys():
mlflow.log_metric(class_label + ' ROC AUC',results[class_label])

Expand Down