From a17bce83f9a2d99678dc50124c2399ad5ff0d4ea Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Fri, 6 Oct 2023 15:12:15 +0200 Subject: [PATCH 01/12] copy over submission related scripts --- .../submit_scripts/condor_cluster_size.sh | 11 + .../production/submit_scripts/job_submit.py | 226 ++++++++++++++++++ .../cluster_size/condor/cluster_opt.py | 138 +++++++++++ .../cluster_size/condor/run_cluster.py | 65 +++++ .../scripts/cluster_size/run_combine.py | 137 +++++++++++ .../scripts/cluster_size/run_init_tasks.py | 81 +++++++ config.yaml | 22 +- 7 files changed, 679 insertions(+), 1 deletion(-) create mode 100644 bye_splits/production/submit_scripts/condor_cluster_size.sh create mode 100644 bye_splits/production/submit_scripts/job_submit.py create mode 100644 bye_splits/scripts/cluster_size/condor/cluster_opt.py create mode 100644 bye_splits/scripts/cluster_size/condor/run_cluster.py create mode 100644 bye_splits/scripts/cluster_size/run_combine.py create mode 100644 bye_splits/scripts/cluster_size/run_init_tasks.py diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh new file mode 100644 index 00000000..73b9146d --- /dev/null +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +cd /home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ + +# Coefficients (radii) stored in .txt file, run cluster step on each radius +coef_file=$1 +particles=$2 +pileup=$3 +while read -r line; do + python run_cluster.py --coef "$line" --particles "$particles" --pileup "$pileup" +done <$coef_file \ No newline at end of file diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py new file mode 100644 index 00000000..c82e2e13 --- /dev/null +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python + +import os +import sys +from datetime import datetime + +parent_dir = os.path.abspath(__file__ + 5 * "../") +sys.path.insert(0, parent_dir) + +from bye_splits.utils import params, common + +import subprocess +import yaml + +# Read particle specific variables from the YAML file +particle_var = lambda part, var: config["job"][part][var] + +class JobBatches: + def __init__(self, particle, config): + self.particle = particle + self.config = config + + def setup_batches(self): + + my_batches = lambda files, files_per_batch: [files[i: i + files_per_batch] for i in range(0, len(files), files_per_batch)] + + read_dir = self.config["job"]["read_dir"] + files = particle_var(self.particle, "files") + files_per_batch = particle_var(self.particle, "files_per_batch") + + if not read_dir: + with open(files, "r") as file: + paths = file.read().splitlines() + else: + part_submit_dir = particle_var(self.particle, "submit_dir") + "ntuples/" + paths = [ + "{}{}".format(part_submit_dir, file) for file in os.listdir(part_submit_dir) if file.startswith("ntuple") + ] + + batches = my_batches(paths, files_per_batch) + + return batches + +class CondJobBase(JobBatches): + def __init__(self, particle, config): + super().__init__(particle, config) + self.particle_dir = particle_var(self.particle, "submit_dir") + self.script = config["job"]["script"] + self.args = config["job"]["arguments"] + self.queue = config["job"]["queue"] + self.proxy = config["job"]["proxy"] + self.local = config["job"]["local"] + self.user = config["job"]["user"] + + + def write_batch_files(self): + batch_dir = 
"{}batches/".format(self.particle_dir) + if not os.path.exists(batch_dir): + os.makedirs(batch_dir) + batch_script_dir = "{}{}/".format(batch_dir, os.path.splitext(os.path.basename(self.script))[0]) + if not os.path.exists(batch_script_dir): + os.makedirs(batch_script_dir) + + batches = self.setup_batches() + global current_batch_versions + current_batch_versions = [] + for i, batch in enumerate(batches): + out_name = "{}batch_{}.txt".format(batch_script_dir, i) + written_version = common.grab_most_recent(out_name, return_all=True) + batch_lines = ["{}\n".format(b) for b in batch] + current_version = common.conditional_write(written_version, out_name, batch_lines) + current_batch_versions.append(current_version) + + + def prepare_batch_submission(self): + sub_dir = "{}subs/".format(self.particle_dir) + + if not os.path.exists(sub_dir): + os.makedirs(sub_dir) + script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") + + submit_file_name_template = "{}{}_submit.sh".format(sub_dir, script_basename) + submit_file_versions = common.grab_most_recent(submit_file_name_template, return_all=True) + + current_version = [] + current_version.append("#!/usr/bin/env bash\n") + current_version.append("workdir={}/bye_splits/production/submit_scripts\n".format(parent_dir)) + current_version.append("cd $workdir\n") + current_version.append("export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch\n") + current_version.append("export SITECONFIG_PATH=$VO_CMS_SW_DIR/SITECONF/T2_FR_GRIF_LLR/GRIF-LLR/\n") + current_version.append("source $VO_CMS_SW_DIR/cmsset_default.sh\n") + if len(self.args) > 0: + args = ["bash {}".format(self.script)] + for i in range(len(self.args)): + args.append(f"${i+1}") + args = " ".join(args) + current_version.append(args) + else: + current_version.append("bash {}".format(self.script)) + + # Write the file only if an identical file doesn't already exist + global sub_file + sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) + + def prepare_multi_job_condor(self): + log_dir = "{}logs/".format(self.particle_dir) + + batch_files = current_batch_versions + + arg_dict = {} + for arg in self.args: + if arg=="filename": + arg_dict[arg] = batch_files + elif arg=="particles": + arg_dict[arg] = self.particle + elif arg=="pileup": + arg_dict[arg] = "PU0" if "PU0" in batch_files[0] else "PU200" + else: + print(f"{arg} is not currently supported.") + quit() + + script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") + + job_file_name_template = "{}jobs/{}.sub".format(self.particle_dir, script_basename) + + job_file_versions = common.grab_most_recent(job_file_name_template, return_all=True) + + current_version = [] + current_version.append("executable = {}\n".format(sub_file)) + current_version.append("Universe = vanilla\n") + if len(self.args) > 0: + current_version.append("Arguments =") + for arg in self.args[:-1]: + current_version.append(" $({}) ".format(arg)) + current_version.append("$({})\n".format(self.args[-1])) + current_version.append("output = {}{}_C$(Cluster)P$(Process).out\n".format(log_dir, script_basename)) + current_version.append("error = {}{}_C$(Cluster)P$(Process).err\n".format(log_dir, script_basename)) + current_version.append("log = {}{}_C$(Cluster)P$(Process).log\n".format(log_dir, script_basename)) + current_version.append("getenv = true\n") + current_version.append("T3Queue = {}\n".format(self.queue)) + current_version.append("WNTag = el7\n") + 
current_version.append('+SingularityCmd = ""\n') + current_version.append("include: /opt/exp_soft/cms/t3/t3queue |\n") + if len(arg_dict.keys()) > 0: + arg_keys = [key for key in arg_dict.keys()] + arg_keys = ", ".join(arg_keys) + arg_keys = "queue " + arg_keys + " from (\n" + current_version.append(arg_keys) + for file in arg_dict["filename"]: + sub_args = list(arg_dict.keys())[1:] + arg_vals = [file]+[arg_dict[key] for key in sub_args] + arg_vals = ", ".join(arg_vals) + "\n" + current_version.append(arg_vals) + current_version.append(")") + + # Write the file only if an identical file doesn't already exist + global submission_file # Save to launch later + submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) + +class CondJob(CondJobBase): + def __init__(self, particle, config): + super().__init__(particle, config) + + + def prepare_jobs(self): + self.write_batch_files() + + configs = lambda dir: dir + "configs" + jobs = lambda dir: dir + "jobs" + logs = lambda dir: dir + "logs" + + config_dir = configs(self.particle_dir) + job_dir = jobs(self.particle_dir) + log_dir = logs(self.particle_dir) + + if not os.path.exists(config_dir): + os.makedirs(config_dir) + if not os.path.exists(job_dir): + os.makedirs(job_dir) + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + self.prepare_batch_submission() + self.prepare_multi_job_condor() + + + def launch_jobs(self): + + if self.local == True: + machine = "local" + else: + machine = "llrt3.in2p3.fr" + + sub_comm = ["condor_submit"] + + if not self.local: + print( + "\nSending {} jobs on {}".format(self.particle, self.queue + "@{}".format(machine)) + ) + print("===============") + print("\n") + + sub_args = [] + + sub_args.append(submission_file) + + if self.local: + comm = sub_args + else: + comm = sub_comm + sub_args + + print(str(datetime.now()), " ".join(comm)) + status = subprocess.run(comm) + +if __name__ == "__main__": + with open(params.CfgPath, "r") as afile: + config = yaml.safe_load(afile) + + job = CondJob("pions", config) + job.prepare_jobs() + job.launch_jobs() + + '''for particle in ("photons", "electrons", "pions"): + job = CondJob(particle, config) + job.prepare_jobs() + job.launch_jobs()''' \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/condor/cluster_opt.py b/bye_splits/scripts/cluster_size/condor/cluster_opt.py new file mode 100644 index 00000000..0c36daad --- /dev/null +++ b/bye_splits/scripts/cluster_size/condor/cluster_opt.py @@ -0,0 +1,138 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys + +parent_dir = os.path.abspath(__file__ + 4 * "/..") +sys.path.insert(0, parent_dir) + +import tasks +from utils import params, common, parsing, cl_helpers + +from data_handle.data_process import get_data_reco_chain_start + +import argparse +import random + +random.seed(10) +import numpy as np +import pandas as pd +from scipy.optimize import minimize, Bounds + +import yaml + +pt_norm_loss = lambda mean: np.sqrt(1-mean**2)/2 + +def get_gen(pars, cfg): + particles = pars["particles"] + eta = pars["eta"] + + reprocess = cfg["clusterStudies"]["reprocess"] + nevents = cfg["clusterStudies"]["nevents"] + tag = cfg["clusterStudies"]["parquetTag"] + + df_gen, _ , _= get_data_reco_chain_start( + particles=particles, nevents=nevents, reprocess=reprocess, tag=tag + ) + + df = df_gen[ df_gen.gen_eta > 0 ] if eta=="pos" else df_gen[ df_gen.gen_eta < 0 ] + if eta=="neg": df_gen["gen_eta"] = abs(df_gen.gen_eta) + + return df + +def cluster_coef(pars, cfg, radii): + 
particles = pars["particles"] + cluster_d = params.read_task_params("cluster") + + cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cfg["clusterStudies"]["clusterSizeBaseName"], cfg["clusterStudies"]["clusterSizeBaseName"]+"_valid" + cluster_d["CoeffA"] = radii + + for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): + name = cluster_d[key] + + cluster_d[key] = "{}_PU0_{}_posEta".format(particles, name) + + cluster_d["returnDF"] = True + _, df = tasks.cluster.cluster_default(pars, **cluster_d) + + return df + +def normalize_df(cl_df, gen_df, dRThresh=0.05): + cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) + combined_df = cl_df.join( + gen_df.set_index("event"), on="event", how="inner" + ) + + if "dR" not in combined_df.keys(): + combined_df["dR"] = np.sqrt((abs(combined_df["eta"])-abs(combined_df["gen_eta"]))**2+(combined_df["phi"]-combined_df["gen_phi"])**2) + if "matches" not in combined_df.keys(): + combined_df["matches"] = combined_df["dR"] <= dRThresh + + combined_df["pt"] = combined_df["en"] / np.cosh(combined_df["eta"]) + combined_df["gen_pt"] = combined_df["gen_en"] / np.cosh(combined_df["gen_eta"]) + + combined_df["pt_norm"] = combined_df["pt"] / combined_df["gen_pt"] + combined_df["en_norm"] = combined_df["en"] / combined_df["gen_en"] + + return combined_df + +def filter_df(df): + df = df[ df.matches == True ] + df = df.groupby("event").apply(lambda x: x.loc[x.pt.idxmax()]) + + return df + +class clusterRad: + def __init__(self, pars, cfg): + self.pars = pars + self.cfg = cfg + self.df_gen = get_gen(pars, cfg) + + def cluster_check(self, radii): + df_cluster = cluster_coef(self.pars, self.cfg, radii) + df = normalize_df(df_cluster, self.df_gen) + return filter_df(df) + + def cluster_loss(self, radii): + df_cluster = cluster_coef(self.pars, self.cfg, radii) + df = normalize_df(df_cluster, self.df_gen) + df_filt = filter_df(df) + pt_norm_mean = df_filt.pt_norm.mean() + + return pt_norm_loss(pt_norm_mean) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) + parser.add_argument("--eta", help="Eta region", choices=("pos", "neg"), default="pos") + parsing.add_parameters(parser) + + FLAGS = parser.parse_args() + pars = common.dot_dict(vars(FLAGS)) + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + if pars.particles == "photons": + #init_radii = np.asarray([0.01]*50) + init_radii = np.asarray([0.042, 0.042, 0.014, 0.014, 0.017, 0.17, 0.011, 0.011, 0.014, 0.014, 0.011, 0.011, 0.013, 0.013, 0.012, 0.012, 0.026, 0.026, 0.021, 0.021, 0.021, 0.021, 0.016, 0.016, 0.029, 0.029, 0.042, 0.042]) + else: + init_radii = np.asarray([0.015]*50) if pars.particles == "electrons" else np.asarray([0.02]*50) + + cluster = clusterRad(pars, cfg) + test_df = cluster.cluster_check(init_radii) + breakpoint() + + + '''lower_bounds, upper_bounds = 0.5*init_radii, 1.5*init_radii + bounds = Bounds(lower_bounds, upper_bounds) + + + min_options = {"maxiter": 2} + + res = minimize(cluster.cluster_loss, init_radii, bounds=bounds, options=min_options)''' + + + diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py new file mode 100644 index 00000000..dca4112e --- /dev/null +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -0,0 +1,65 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys + +parent_dir = 
os.path.abspath(__file__ + 4 * "/..") +sys.path.insert(0, parent_dir) + +import tasks +from utils import params, common, parsing, cl_helpers + +import argparse +import random + +random.seed(10) +import numpy as np +import pandas as pd + +import yaml + +def cluster_coef(pars, cfg): + cluster_d = params.read_task_params("cluster") + + particles = pars["particles"] + pileup = pars["pileup"] + coef = pars["coef"] + + cl_size_coef = "{}_coef_{}".format( + cfg["clusterStudies"]["clusterSizeBaseName"], + str(round(coef, 3)).replace(".", "p"), + ) + cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_coef, cl_size_coef+"_valid" + cluster_d["CoeffA"] = [coef] * 50 + #cluster_d["weights"] = cfg["weights"] + + for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): + name = cluster_d[key] + + cluster_d[key] = "{}_{}_{}_posEta".format(particles, pileup, name) + + nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--coef", help="Coefficient to use as the max cluster radius", required=True, type=float) + parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) + parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) + parsing.add_parameters(parser) + + FLAGS = parser.parse_args() + pars = common.dot_dict(vars(FLAGS)) + + radius = round(pars.coef, 3) + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + '''weight_dir = "{}/PU0/".format(params.LocalStorage) + weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) + weights = weights_by_particle[pars.particles][radius] + cfg["weights"] = weights''' + + cluster_coef(pars, cfg) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_combine.py b/bye_splits/scripts/cluster_size/run_combine.py new file mode 100644 index 00000000..731fd65e --- /dev/null +++ b/bye_splits/scripts/cluster_size/run_combine.py @@ -0,0 +1,137 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys +import argparse + +parent_dir = os.path.abspath(__file__ + 3 * "/..") +sys.path.insert(0, parent_dir) + +from utils import params, common + +from data_handle.data_process import get_data_reco_chain_start + +import random +import re + +random.seed(10) +import numpy as np +import pandas as pd +import sys + +import yaml +from tqdm import tqdm + +def split_dfs(cl_df): + weighted_cols = [col for col in cl_df.keys() if "weighted" in col] + weighted_cols += [col for col in cl_df.keys() if "layer" in col] + original_cols = [col.replace("weighted_","") for col in weighted_cols] + weighted_cols += ["event"] + original_cols += ["event"] + + original_df = cl_df[original_cols] + weighted_df = cl_df[weighted_cols].rename(dict(zip(weighted_cols, original_cols)), axis=1) + + return original_df, weighted_df + +def normalize_df(cl_df, gen_df, dRThresh): + cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) + combined_df = cl_df.join( + gen_df.set_index("event"), on="event", how="inner" + ) + + if "dR" not in combined_df.keys(): + combined_df["dR"] = np.sqrt((abs(combined_df["eta"])-abs(combined_df["gen_eta"]))**2+(combined_df["phi"]-combined_df["gen_phi"])**2) + if "matches" not in combined_df.keys(): + combined_df["matches"] = combined_df["dR"] <= dRThresh + + combined_df["pt"] = combined_df["en"] / np.cosh(combined_df["eta"]) + combined_df["gen_pt"] = combined_df["gen_en"] / 
np.cosh(combined_df["gen_eta"]) + + combined_df["pt_norm"] = combined_df["pt"] / combined_df["gen_pt"] + combined_df["en_norm"] = combined_df["en"] / combined_df["gen_en"] + + return combined_df + +def combine_files_by_coef(in_dir, file_pattern): + files = [ + file for file in os.listdir(in_dir) if re.search(file_pattern, file) != None and "valid" not in file + ] + coef_pattern = r"coef_0p(\d+)" + out_path = common.fill_path(file_pattern, data_dir=in_dir) + with pd.HDFStore(out_path, "w") as clusterSizeOut: + print("\nCombining Files:\n") + for file in tqdm(files): + key = re.search(coef_pattern, file).group() + with pd.HDFStore(in_dir + "/" + file, "r") as clSizeCoef: + clusterSizeOut[key] = clSizeCoef["/data"] + +def split_and_norm(df_cl, df_gen, dRthresh): + original_df, weighted_df = split_dfs(df_cl) + normed_df, normed_weighted_df = normalize_df(original_df, df_gen, dRthresh), normalize_df(weighted_df, df_gen, dRthresh) + df_dict = {"original": normed_df, + "weighted": normed_weighted_df} + df_cl = pd.Series(df_dict) + return df_cl + +def combine_cluster(cfg, **pars): + """Originally designed to combine the files returned by cluster for each radii, + and to normalize each by the gen_particle information. Now accepts an optional --file + parameter to normalize this, skipping the combinination step.""" + + input_file_path = pars["file"] if "file" in pars.keys() else None + unweighted = pars["unweighted"] if "unweighted" in pars.keys() else False + + particles = cfg["particles"] + nevents = cfg["clusterStudies"]["nevents"] + + if input_file_path == None: + pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" + + basename = cfg["clusterStudies"]["combination"][pileup][particles]["basename"] + sub_dir = cfg["clusterStudies"]["combination"][pileup][particles]["sub_dir"] + + dir = "{}/{}/{}".format(params.LocalStorage, pileup, sub_dir) + + combine_files_by_coef(dir, basename) + + cl_size_out = common.fill_path(basename, data_dir=dir) + + else: + cl_size_out = input_file_path + + with pd.HDFStore(cl_size_out, mode="a") as clSizeOut: + df_gen, _, _ = get_data_reco_chain_start( + particles=particles, nevents=nevents, reprocess=False, tag = cfg["clusterStudies"]["parquetTag"] + ) + #if "negEta" in sub_dir: + if "negEta" in cl_size_out: + df_gen = df_gen[ df_gen.gen_eta < 0 ] + df_gen["gen_eta"] = abs(df_gen.gen_eta) + else: + df_gen = df_gen[ df_gen.gen_eta > 0 ] + dRthresh = cfg["selection"]["deltarThreshold"] + if input_file_path != None: + clSizeOut["data"] = split_and_norm(clSizeOut["data"], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut["data"], df_gen, dRthresh) + else: + coef_keys = clSizeOut.keys() + print("\nNormalizing Files:\n") + for coef in tqdm(coef_keys): + clSizeOut[coef] = split_and_norm(clSizeOut[coef], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut[coef], df_gen, dRthresh) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--file", type=str) + parser.add_argument("--unweighted", action="store_true") + + FLAGS = parser.parse_args() + pars = common.dot_dict(vars(FLAGS)) + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + for particles in ("photons", "electrons", "pions"): + cfg.update({"particles": particles}) + combine_cluster(cfg) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py new file mode 100644 index 00000000..4271faf1 --- /dev/null +++ 
b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -0,0 +1,81 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys + +parent_dir = os.path.abspath(__file__ + 3 * "/..") +sys.path.insert(0, parent_dir) + +import tasks +from utils import params, common, parsing + +from data_handle.data_process import get_data_reco_chain_start + +import argparse +import random + +random.seed(10) +import sys + +import yaml + +def start_chain(pars, cfg): + particles = cfg["selection"]["particles"] + pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" + reprocess = cfg["clusterStudies"]["reprocess"] + nevents = cfg["clusterStudies"]["nevents"] + tag = cfg["clusterStudies"]["parquetTag"] + + df_gen, df_cl, df_tc = get_data_reco_chain_start( + particles=particles, nevents=nevents, reprocess=reprocess, tag=tag + ) + + df_gen_pos, df_gen_neg = df_gen[ df_gen.gen_eta > 0 ], df_gen[ df_gen.gen_eta < 0] + df_cl_pos, df_cl_neg = df_cl[ df_cl.cl3d_eta > 0 ], df_cl[ df_cl.cl3d_eta < 0] + df_tc_pos, df_tc_neg = df_tc[ df_tc.tc_eta > 0 ], df_tc[ df_tc.tc_eta < 0] + + for i in range(2): + if i==1: + df_gen, df_cl, df_tc = df_gen_neg, df_cl_neg, df_tc_neg + eta_tag = "negEta" + df_gen["gen_eta"], df_cl["cl3d_eta"], df_tc["tc_eta"], df_tc["tc_z"] = abs(df_gen.gen_eta), abs(df_cl.cl3d_eta), abs(df_tc.tc_eta), abs(df_tc.tc_z) + else: + df_gen, df_cl, df_tc = df_gen_pos, df_cl_pos, df_tc_pos + eta_tag = "posEta" + + print(f"{particles}: {eta_tag}") + + fill_d = params.read_task_params("fill") + for key in ("FillOut", "FillOutComp", "FillOutPlot"): + name = fill_d[key] + fill_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d) + + smooth_d = params.read_task_params("smooth") + for key in ("SmoothIn", "SmoothOut"): + name = smooth_d[key] + smooth_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + tasks.smooth.smooth(pars, **smooth_d) + + seed_d = params.read_task_params("seed") + for key in ("SeedIn", "SeedOut"): + name = seed_d[key] + seed_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + tasks.seed.seed(pars, **seed_d) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parsing.add_parameters(parser) + + FLAGS = parser.parse_args() + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + start_chain(common.dot_dict(vars(FLAGS)), cfg) + + for particles in ("photons", "electrons", "pions"): + cfg["selection"]["particles"] = particles + start_chain(common.dot_dict(vars(FLAGS)), cfg) \ No newline at end of file diff --git a/config.yaml b/config.yaml index e8b02b00..61f3f4c8 100644 --- a/config.yaml +++ b/config.yaml @@ -135,6 +135,27 @@ varGeometry: reprocess: False coefs: [0.010, 0.015, 0.020, 0.025] +job: + user: iehle + proxy: ~/.t3/proxy.cert + queue: short + local: False + script: /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh + arguments: [filename, particles, pileup] + read_dir: False # {True: read arguments as paths in /ntuples, False: read arguments stored on the .txt } + photons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ + files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt + files_per_batch: 1 + electrons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ + files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt + files_per_batch: 1 + pions: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ + files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt + 
files_per_batch: 1 + clusterStudies: localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ ehleDir: /eos/user/i/iehle/ @@ -145,7 +166,6 @@ clusterStudies: nevents: 100 pileUp: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple - #coeffs: [0.0, 0.05, 50] fileBaseName: energy_out local: False From d86dd718ee6edead89fe466a0eb24cdc55eaebbc Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 9 Oct 2023 12:08:31 +0200 Subject: [PATCH 02/12] mostly variable / key name updates, addition of some needed functions --- .../production/submit_scripts/job_submit.py | 8 +- .../cluster_size/condor/run_cluster.py | 18 +++-- .../scripts/cluster_size/run_init_tasks.py | 14 ++-- bye_splits/utils/cl_helpers.py | 36 +++++++-- bye_splits/utils/common.py | 73 +++++++++++++++++++ config.yaml | 11 ++- 6 files changed, 131 insertions(+), 29 deletions(-) diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index c82e2e13..0513176f 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -216,11 +216,7 @@ def launch_jobs(self): with open(params.CfgPath, "r") as afile: config = yaml.safe_load(afile) - job = CondJob("pions", config) - job.prepare_jobs() - job.launch_jobs() - - '''for particle in ("photons", "electrons", "pions"): + for particle in ("photons", "electrons", "pions"): job = CondJob(particle, config) job.prepare_jobs() - job.launch_jobs()''' \ No newline at end of file + job.launch_jobs() \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index dca4112e..9f3828bd 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -33,12 +33,14 @@ def cluster_coef(pars, cfg): ) cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_coef, cl_size_coef+"_valid" cluster_d["CoeffA"] = [coef] * 50 - #cluster_d["weights"] = cfg["weights"] + + if "weights" in cfg: + cluster_d["weights"] = cfg["weights"] for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): name = cluster_d[key] - cluster_d[key] = "{}_{}_{}_posEta".format(particles, pileup, name) + cluster_d[key] = "{}_{}_{}_posEta_9oct".format(particles, pileup, name) nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) @@ -47,6 +49,7 @@ def cluster_coef(pars, cfg): parser.add_argument("--coef", help="Coefficient to use as the max cluster radius", required=True, type=float) parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) + parser.add_argument("--weighted", help="Apply pre-calculated layer weights", default=False) parsing.add_parameters(parser) FLAGS = parser.parse_args() @@ -57,9 +60,10 @@ def cluster_coef(pars, cfg): with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) - '''weight_dir = "{}/PU0/".format(params.LocalStorage) - weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) - weights = weights_by_particle[pars.particles][radius] - cfg["weights"] = weights''' - + if pars.weighted: + weight_dir = "{}/PU0/".format(params.LocalStorage) + weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) + weights = weights_by_particle[pars.particles][radius] + cfg["weights"] 
= weights + cluster_coef(pars, cfg) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py index 4271faf1..e7c425da 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -48,21 +48,22 @@ def start_chain(pars, cfg): print(f"{particles}: {eta_tag}") fill_d = params.read_task_params("fill") - for key in ("FillOut", "FillOutComp", "FillOutPlot"): + #for key in ("FillOut", "FillOutComp", "FillOutPlot"): + for key in ("FillOut", "FillOutGenCl", "FillOutTcAll"): name = fill_d[key] - fill_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + fill_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d) smooth_d = params.read_task_params("smooth") for key in ("SmoothIn", "SmoothOut"): name = smooth_d[key] - smooth_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + smooth_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) tasks.smooth.smooth(pars, **smooth_d) seed_d = params.read_task_params("seed") for key in ("SeedIn", "SeedOut"): name = seed_d[key] - seed_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + seed_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) tasks.seed.seed(pars, **seed_d) if __name__ == "__main__": @@ -74,8 +75,9 @@ def start_chain(pars, cfg): with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) + cfg["selection"]["particles"] = "pions" start_chain(common.dot_dict(vars(FLAGS)), cfg) - for particles in ("photons", "electrons", "pions"): + '''for particles in ("photons", "electrons", "pions"): cfg["selection"]["particles"] = particles - start_chain(common.dot_dict(vars(FLAGS)), cfg) \ No newline at end of file + start_chain(common.dot_dict(vars(FLAGS)), cfg)''' \ No newline at end of file diff --git a/bye_splits/utils/cl_helpers.py b/bye_splits/utils/cl_helpers.py index 27a24eb9..0d376483 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -9,11 +9,12 @@ import numpy as np import pandas as pd -from bye_splits.utils import common, parsing, params +#from bye_splits.utils import common, parsing, params +from bye_splits.utils import common, params -parser = argparse.ArgumentParser(description="Seeding standalone step.") +'''parser = argparse.ArgumentParser(description="Seeding standalone step.") parsing.add_parameters(parser) -FLAGS = parser.parse_args() +FLAGS = parser.parse_args()''' def get_last_version(name): @@ -154,8 +155,31 @@ def get_input_files(base_path, pile_up=False): return input_files - -def get_output_files(cfg): +def read_weights(dir, cfg, version="final", mode="weights"): + weights_by_particle = {} + #for particle in ("photons", "electrons", "pions"): + for particle in ("photons", "pions"): + basename = "optimization_selectOneEffRms_maxSeed_bc_stc" if particle == "pions" else "optimization_selectOneStd_adjustMaxWeight_maxSeed" + + version_dir = "{}/".format(version) + particle_dir = "{}{}/optimization/official/{}".format(dir, particle, version_dir) + + files = [f for f in os.listdir(particle_dir) if basename in f] + weights_by_radius = {} + for file in files: + radius = float(file.replace(".hdf5","").replace(f"{basename}_","").replace("r0","0").replace("p",".")) + infile = particle_dir+file + with pd.HDFStore(infile, "r") as optWeights: + weights_by_radius[radius] = optWeights[mode] + + 
weights_by_particle[particle] = weights_by_radius + + weights_by_particle["electrons"] = weights_by_particle["photons"] + + return weights_by_particle + + +'''def get_output_files(cfg): """Accepts a configuration file containing the base directory, a file basename, local (Bool) and pileUp (Bool). Finds the full paths of the files created by cluster_size.py, and returns a dictionary corresponding to particles:[file_paths].""" @@ -194,4 +218,4 @@ def get_output_files(cfg): for key in output_files.keys(): output_files[key] = list(set(output_files[key])) - return output_files + return output_files''' diff --git a/bye_splits/utils/common.py b/bye_splits/utils/common.py index eef36c71..fbaa2653 100644 --- a/bye_splits/utils/common.py +++ b/bye_splits/utils/common.py @@ -16,6 +16,8 @@ import numpy as np import pandas as pd +import re + def binConv(vals, dist, amin): """ @@ -150,3 +152,74 @@ def seed_extra_name(cfg): s = '_hexdist' if cfg['seed_cs']['hexDist'] else '' s += '_' + cfg['seed_cs']['InputName'] return s + +# Accepts a template for a full path to a file and increments the version +def increment_version(file_path): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + i = 0 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + while os.path.exists(file_path): + i += 1 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + return file_path + + +# Grab the most recent version of the file corresponding to the template file_path (or return all matches) +def grab_most_recent(file_path, return_all=False): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + files = os.listdir(dir) + version_pattern = re.compile("{}_v(\\d+)\\{}".format(base, ext)) + matches = [version_pattern.search(file) for file in files] + matches = [match for match in matches if not match is None] + if len(matches) > 0: + matches = [int(match.group(1)) for match in matches] + most_recent = max(matches) + if not return_all: + file_path = dir + "/" + base + "_v" + str(most_recent) + ext + else: + file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] + return file_path + +def compare_file_contents(file_path, buffer_list): + """ + Compares the content in with , + which should be a list of strings that you wish to write + to a new file. + """ + with open(file_path, "r") as file: + contents = file.readlines() + return contents==buffer_list + +def write_file_version(template, version): + file_name = increment_version(template) + with open(file_name, "w") as job_file: + job_file.writelines(version) + st = os.stat(file_name) + os.chmod(file_name, st.st_mode | 0o744) + return file_name + +def conditional_write(file_versions, file_template, current_version): + """ + Loops through the files in , comparing their contents + to the current version. If an identical version is found, the function + breaks and does nothing. Otherwise, it will write the contents in + to an updated version number whose basename corresponds to + . 
+ """ + if file_versions != None: + identical_version = False + for file in file_versions: + if not compare_file_contents(file, current_version): + continue + else: + identical_version = True + file_path = file + break + if not identical_version: + file_path = write_file_version(file_template, current_version) + + else: + file_path = write_file_version(file_template, current_version) + return file_path diff --git a/config.yaml b/config.yaml index 61f3f4c8..ea69d2b6 100644 --- a/config.yaml +++ b/config.yaml @@ -72,7 +72,7 @@ varEvents: geta: 'good_genpart_exeta' gphi: 'good_genpart_exphi' gen: 'good_genpart_energy' - gpt: 'good_genpart_pt' +# gpt: 'good_genpart_pt' tc: event: 'event' wu: 'good_tc_waferu' @@ -161,10 +161,13 @@ clusterStudies: ehleDir: /eos/user/i/iehle/ dataFolder: data/ reinit: True #False - clusterSizeBaseName: cluster_size + reprocess: False # Reproduce .parquet file + parquetTag: jobTest + clusterSizeBaseName: cluster_size_jobTest_yesWeight coeffs: [0.0, 0.5, 50] - nevents: 100 - pileUp: False + #nevents: 100 + nevents: -1 + pileup: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple fileBaseName: energy_out local: False From fe04b0d75831ea202a90739e87618c3ac3cd65d4 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 9 Oct 2023 12:13:52 +0200 Subject: [PATCH 03/12] remove depricated file --- .../cluster_size/condor/cluster_opt.py | 138 ------------------ 1 file changed, 138 deletions(-) delete mode 100644 bye_splits/scripts/cluster_size/condor/cluster_opt.py diff --git a/bye_splits/scripts/cluster_size/condor/cluster_opt.py b/bye_splits/scripts/cluster_size/condor/cluster_opt.py deleted file mode 100644 index 0c36daad..00000000 --- a/bye_splits/scripts/cluster_size/condor/cluster_opt.py +++ /dev/null @@ -1,138 +0,0 @@ -# coding: utf-8 - -_all_ = [] - -import os -import sys - -parent_dir = os.path.abspath(__file__ + 4 * "/..") -sys.path.insert(0, parent_dir) - -import tasks -from utils import params, common, parsing, cl_helpers - -from data_handle.data_process import get_data_reco_chain_start - -import argparse -import random - -random.seed(10) -import numpy as np -import pandas as pd -from scipy.optimize import minimize, Bounds - -import yaml - -pt_norm_loss = lambda mean: np.sqrt(1-mean**2)/2 - -def get_gen(pars, cfg): - particles = pars["particles"] - eta = pars["eta"] - - reprocess = cfg["clusterStudies"]["reprocess"] - nevents = cfg["clusterStudies"]["nevents"] - tag = cfg["clusterStudies"]["parquetTag"] - - df_gen, _ , _= get_data_reco_chain_start( - particles=particles, nevents=nevents, reprocess=reprocess, tag=tag - ) - - df = df_gen[ df_gen.gen_eta > 0 ] if eta=="pos" else df_gen[ df_gen.gen_eta < 0 ] - if eta=="neg": df_gen["gen_eta"] = abs(df_gen.gen_eta) - - return df - -def cluster_coef(pars, cfg, radii): - particles = pars["particles"] - cluster_d = params.read_task_params("cluster") - - cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cfg["clusterStudies"]["clusterSizeBaseName"], cfg["clusterStudies"]["clusterSizeBaseName"]+"_valid" - cluster_d["CoeffA"] = radii - - for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): - name = cluster_d[key] - - cluster_d[key] = "{}_PU0_{}_posEta".format(particles, name) - - cluster_d["returnDF"] = True - _, df = tasks.cluster.cluster_default(pars, **cluster_d) - - return df - -def normalize_df(cl_df, gen_df, dRThresh=0.05): - cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) - combined_df = cl_df.join( - 
gen_df.set_index("event"), on="event", how="inner" - ) - - if "dR" not in combined_df.keys(): - combined_df["dR"] = np.sqrt((abs(combined_df["eta"])-abs(combined_df["gen_eta"]))**2+(combined_df["phi"]-combined_df["gen_phi"])**2) - if "matches" not in combined_df.keys(): - combined_df["matches"] = combined_df["dR"] <= dRThresh - - combined_df["pt"] = combined_df["en"] / np.cosh(combined_df["eta"]) - combined_df["gen_pt"] = combined_df["gen_en"] / np.cosh(combined_df["gen_eta"]) - - combined_df["pt_norm"] = combined_df["pt"] / combined_df["gen_pt"] - combined_df["en_norm"] = combined_df["en"] / combined_df["gen_en"] - - return combined_df - -def filter_df(df): - df = df[ df.matches == True ] - df = df.groupby("event").apply(lambda x: x.loc[x.pt.idxmax()]) - - return df - -class clusterRad: - def __init__(self, pars, cfg): - self.pars = pars - self.cfg = cfg - self.df_gen = get_gen(pars, cfg) - - def cluster_check(self, radii): - df_cluster = cluster_coef(self.pars, self.cfg, radii) - df = normalize_df(df_cluster, self.df_gen) - return filter_df(df) - - def cluster_loss(self, radii): - df_cluster = cluster_coef(self.pars, self.cfg, radii) - df = normalize_df(df_cluster, self.df_gen) - df_filt = filter_df(df) - pt_norm_mean = df_filt.pt_norm.mean() - - return pt_norm_loss(pt_norm_mean) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="") - parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) - parser.add_argument("--eta", help="Eta region", choices=("pos", "neg"), default="pos") - parsing.add_parameters(parser) - - FLAGS = parser.parse_args() - pars = common.dot_dict(vars(FLAGS)) - - with open(params.CfgPath, "r") as afile: - cfg = yaml.safe_load(afile) - - if pars.particles == "photons": - #init_radii = np.asarray([0.01]*50) - init_radii = np.asarray([0.042, 0.042, 0.014, 0.014, 0.017, 0.17, 0.011, 0.011, 0.014, 0.014, 0.011, 0.011, 0.013, 0.013, 0.012, 0.012, 0.026, 0.026, 0.021, 0.021, 0.021, 0.021, 0.016, 0.016, 0.029, 0.029, 0.042, 0.042]) - else: - init_radii = np.asarray([0.015]*50) if pars.particles == "electrons" else np.asarray([0.02]*50) - - cluster = clusterRad(pars, cfg) - test_df = cluster.cluster_check(init_radii) - breakpoint() - - - '''lower_bounds, upper_bounds = 0.5*init_radii, 1.5*init_radii - bounds = Bounds(lower_bounds, upper_bounds) - - - min_options = {"maxiter": 2} - - res = minimize(cluster.cluster_loss, init_radii, bounds=bounds, options=min_options)''' - - - From cc683e996d57b17dc823553a05e5a49dcffe3a98 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 9 Oct 2023 14:09:57 +0200 Subject: [PATCH 04/12] updated README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 562f1778..f3acfffd 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ 2. [Data production](#dataprod) 1. [Skimming](#skim) 2. [Data sources](#sources) + 3. [Job Submission](#job-submission) 3. [Reconstruction Chain](#org0bc224d) 1. [Cluster Size Studies](#orgc33e2a6) 4. [Event Visualization](#org44a4071) @@ -99,6 +100,11 @@ This framework relies on photon-, electron- and pion-gun samples produced via CR The `PU0` files above were merged and are stored under `/data_CMS/cms/alves/L1HGCAL/`, accessible to LLR users and under `/eos/user/b/bfontana/FPGAs/new_algos/`, accessible to all lxplus and LLR users. The latter is used since it is well interfaced with CERN services. The `PU200` files were merged and stored under `/eos/user/i/iehle/data/PU200//`. 
+ +## Job Submission + +Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor and a list of `arguments` that the script accepts. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, `files` which should be a `.txt` file containing the values you would like to iterate over, and `files_per_batch` which can be any number between 1 and the total number of values you would like to run. + # Reconstruction Chain From 85308fa708494aac0cb8414fb3a33c4a6633c435 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Wed, 11 Oct 2023 13:53:52 +0200 Subject: [PATCH 05/12] better argument handling and variable naming --- .../submit_scripts/condor_cluster_size.sh | 34 ++++- .../production/submit_scripts/job_submit.py | 125 ++++++------------ .../cluster_size/condor/run_cluster.py | 20 +-- config.yaml | 15 ++- 4 files changed, 91 insertions(+), 103 deletions(-) diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh index 73b9146d..8f0e26a7 100644 --- a/bye_splits/production/submit_scripts/condor_cluster_size.sh +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -2,10 +2,30 @@ cd /home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ -# Coefficients (radii) stored in .txt file, run cluster step on each radius -coef_file=$1 -particles=$2 -pileup=$3 -while read -r line; do - python run_cluster.py --coef "$line" --particles "$particles" --pileup "$pileup" -done <$coef_file \ No newline at end of file +radius=() +particles="" +pileup="" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --radius) + IFS=";" read -ra radius <<< "${2:1:-1}" + shift 2 + ;; + --particles) + particles="$2" + shift 2 + ;; + --pileup) + pileup="$2" + shift 2 + ;; + *) + echo "Unrecognized argument $1" + exit 1;; + esac +done + +for rad in ${radius[@]}; do + python run_cluster.py --radius "$rad" --particles "$particles" --pileup "$pileup" +done \ No newline at end of file diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index 0513176f..fb7ed6b7 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -19,58 +19,29 @@ class JobBatches: def __init__(self, particle, config): self.particle = particle self.config = config + self.iterOver = config["job"]["iterOver"] def setup_batches(self): + total = self.config["job"]["arguments"][self.iterOver] - my_batches = lambda files, files_per_batch: [files[i: i + files_per_batch] for i in range(0, len(files), files_per_batch)] + vals_per_batch = particle_var(self.particle, "files_per_batch") - read_dir = self.config["job"]["read_dir"] - files = particle_var(self.particle, "files") - files_per_batch = particle_var(self.particle, "files_per_batch") + batches = [total[i: i + vals_per_batch] for i in range(0, len(total), vals_per_batch)] - if not read_dir: - with open(files, "r") as file: - paths = file.read().splitlines() - else: - part_submit_dir = particle_var(self.particle, "submit_dir") + "ntuples/" - paths = [ - "{}{}".format(part_submit_dir, file) for file in 
os.listdir(part_submit_dir) if file.startswith("ntuple") - ] - - batches = my_batches(paths, files_per_batch) - return batches -class CondJobBase(JobBatches): +class CondJobBase: def __init__(self, particle, config): - super().__init__(particle, config) - self.particle_dir = particle_var(self.particle, "submit_dir") + self.particle = particle self.script = config["job"]["script"] + self.iterOver = config["job"]["iterOver"] self.args = config["job"]["arguments"] self.queue = config["job"]["queue"] self.proxy = config["job"]["proxy"] self.local = config["job"]["local"] self.user = config["job"]["user"] - - - def write_batch_files(self): - batch_dir = "{}batches/".format(self.particle_dir) - if not os.path.exists(batch_dir): - os.makedirs(batch_dir) - batch_script_dir = "{}{}/".format(batch_dir, os.path.splitext(os.path.basename(self.script))[0]) - if not os.path.exists(batch_script_dir): - os.makedirs(batch_script_dir) - - batches = self.setup_batches() - global current_batch_versions - current_batch_versions = [] - for i, batch in enumerate(batches): - out_name = "{}batch_{}.txt".format(batch_script_dir, i) - written_version = common.grab_most_recent(out_name, return_all=True) - batch_lines = ["{}\n".format(b) for b in batch] - current_version = common.conditional_write(written_version, out_name, batch_lines) - current_batch_versions.append(current_version) - + self.particle_dir = particle_var(self.particle, "submit_dir") + self.batches = JobBatches(particle, config).setup_batches() def prepare_batch_submission(self): sub_dir = "{}subs/".format(self.particle_dir) @@ -89,35 +60,21 @@ def prepare_batch_submission(self): current_version.append("export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch\n") current_version.append("export SITECONFIG_PATH=$VO_CMS_SW_DIR/SITECONF/T2_FR_GRIF_LLR/GRIF-LLR/\n") current_version.append("source $VO_CMS_SW_DIR/cmsset_default.sh\n") - if len(self.args) > 0: + + if len(self.args.keys()) > 0: args = ["bash {}".format(self.script)] - for i in range(len(self.args)): - args.append(f"${i+1}") + for i, key in enumerate(self.args.keys()): + args.append("--{} ${}".format(key, i+1)) args = " ".join(args) current_version.append(args) else: current_version.append("bash {}".format(self.script)) # Write the file only if an identical file doesn't already exist - global sub_file - sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) + self.sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) def prepare_multi_job_condor(self): log_dir = "{}logs/".format(self.particle_dir) - - batch_files = current_batch_versions - - arg_dict = {} - for arg in self.args: - if arg=="filename": - arg_dict[arg] = batch_files - elif arg=="particles": - arg_dict[arg] = self.particle - elif arg=="pileup": - arg_dict[arg] = "PU0" if "PU0" in batch_files[0] else "PU200" - else: - print(f"{arg} is not currently supported.") - quit() script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") @@ -126,13 +83,15 @@ def prepare_multi_job_condor(self): job_file_versions = common.grab_most_recent(job_file_name_template, return_all=True) current_version = [] - current_version.append("executable = {}\n".format(sub_file)) + current_version.append("executable = {}\n".format(self.sub_file)) current_version.append("Universe = vanilla\n") if len(self.args) > 0: current_version.append("Arguments =") - for arg in self.args[:-1]: + + for arg in self.args.keys(): current_version.append(" $({}) ".format(arg)) - 
current_version.append("$({})\n".format(self.args[-1])) + current_version.append("\n") + current_version.append("output = {}{}_C$(Cluster)P$(Process).out\n".format(log_dir, script_basename)) current_version.append("error = {}{}_C$(Cluster)P$(Process).err\n".format(log_dir, script_basename)) current_version.append("log = {}{}_C$(Cluster)P$(Process).log\n".format(log_dir, script_basename)) @@ -141,37 +100,37 @@ def prepare_multi_job_condor(self): current_version.append("WNTag = el7\n") current_version.append('+SingularityCmd = ""\n') current_version.append("include: /opt/exp_soft/cms/t3/t3queue |\n") - if len(arg_dict.keys()) > 0: - arg_keys = [key for key in arg_dict.keys()] - arg_keys = ", ".join(arg_keys) + + if len(self.args.keys()) > 0: + arg_keys = ", ".join(self.args.keys()) arg_keys = "queue " + arg_keys + " from (\n" current_version.append(arg_keys) - for file in arg_dict["filename"]: - sub_args = list(arg_dict.keys())[1:] - arg_vals = [file]+[arg_dict[key] for key in sub_args] - arg_vals = ", ".join(arg_vals) + "\n" - current_version.append(arg_vals) + for batch in self.batches: + sub_args = list(self.args.keys())[1:] + arg_vals = [self.args[key] for key in sub_args] + all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals + all_vals = ", ".join(all_vals) + "\n" + current_version.append(all_vals) + current_version.append(")") # Write the file only if an identical file doesn't already exist - global submission_file # Save to launch later - submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) + self.submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) # Save to launch later -class CondJob(CondJobBase): +class CondJob: def __init__(self, particle, config): - super().__init__(particle, config) + self.base = CondJobBase(particle=particle, config=config) def prepare_jobs(self): - self.write_batch_files() configs = lambda dir: dir + "configs" jobs = lambda dir: dir + "jobs" logs = lambda dir: dir + "logs" - config_dir = configs(self.particle_dir) - job_dir = jobs(self.particle_dir) - log_dir = logs(self.particle_dir) + config_dir = configs(self.base.particle_dir) + job_dir = jobs(self.base.particle_dir) + log_dir = logs(self.base.particle_dir) if not os.path.exists(config_dir): os.makedirs(config_dir) @@ -180,31 +139,31 @@ def prepare_jobs(self): if not os.path.exists(log_dir): os.makedirs(log_dir) - self.prepare_batch_submission() - self.prepare_multi_job_condor() + self.base.prepare_batch_submission() + self.base.prepare_multi_job_condor() def launch_jobs(self): - if self.local == True: + if self.base.local == True: machine = "local" else: machine = "llrt3.in2p3.fr" sub_comm = ["condor_submit"] - if not self.local: + if not self.base.local: print( - "\nSending {} jobs on {}".format(self.particle, self.queue + "@{}".format(machine)) + "\nSending {} jobs on {}".format(self.base.particle, self.base.queue + "@{}".format(machine)) ) print("===============") print("\n") sub_args = [] - sub_args.append(submission_file) + sub_args.append(self.base.submission_file) - if self.local: + if self.base.local: comm = sub_args else: comm = sub_comm + sub_args diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index 9f3828bd..52d9df0e 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -20,19 +20,19 @@ import yaml -def cluster_coef(pars, cfg): +def 
cluster_radius(pars, cfg): cluster_d = params.read_task_params("cluster") particles = pars["particles"] pileup = pars["pileup"] - coef = pars["coef"] + radius = pars["radius"] - cl_size_coef = "{}_coef_{}".format( + cl_size_radius = "{}_radius_{}".format( cfg["clusterStudies"]["clusterSizeBaseName"], - str(round(coef, 3)).replace(".", "p"), + str(round(radius, 3)).replace(".", "p"), ) - cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_coef, cl_size_coef+"_valid" - cluster_d["CoeffA"] = [coef] * 50 + cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_radius, cl_size_radius+"_valid" + cluster_d["CoeffA"] = [radius] * 50 if "weights" in cfg: cluster_d["weights"] = cfg["weights"] @@ -46,7 +46,7 @@ def cluster_coef(pars, cfg): if __name__ == "__main__": parser = argparse.ArgumentParser(description="") - parser.add_argument("--coef", help="Coefficient to use as the max cluster radius", required=True, type=float) + parser.add_argument("--radius", help="Coefficient to use as the max cluster radius", required=True, type=float) parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) parser.add_argument("--weighted", help="Apply pre-calculated layer weights", default=False) @@ -55,7 +55,7 @@ def cluster_coef(pars, cfg): FLAGS = parser.parse_args() pars = common.dot_dict(vars(FLAGS)) - radius = round(pars.coef, 3) + radius_str = round(pars.radius, 3) with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) @@ -63,7 +63,7 @@ def cluster_coef(pars, cfg): if pars.weighted: weight_dir = "{}/PU0/".format(params.LocalStorage) weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) - weights = weights_by_particle[pars.particles][radius] + weights = weights_by_particle[pars.particles][radius_str] cfg["weights"] = weights - cluster_coef(pars, cfg) \ No newline at end of file + cluster_radius(pars, cfg) \ No newline at end of file diff --git a/config.yaml b/config.yaml index ea69d2b6..81b8e0fe 100644 --- a/config.yaml +++ b/config.yaml @@ -141,12 +141,21 @@ job: queue: short local: False script: /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh - arguments: [filename, particles, pileup] + iterOver: radius + arguments: + radius: [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, + 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, + 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, + 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, + 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, + 0.046, 0.047, 0.048, 0.049, 0.05 ] + particles: pions + pileup: PU0 read_dir: False # {True: read arguments as paths in /ntuples, False: read arguments stored on the .txt } photons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 1 + files_per_batch: 10 electrons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt @@ -154,7 +163,7 @@ job: pions: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 1 + files_per_batch: 10 clusterStudies: localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ From 0073ffe5c14093e5c05a39c0f46bc1cbc5cfa18d Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Wed, 11 Oct 2023 
17:56:41 +0200 Subject: [PATCH 06/12] general clean up and documentation --- README.md | 2 +- bye_splits/production/produce.cc | 94 ----------- .../production/submit_scripts/job_submit.py | 149 +++++++++++------- .../cluster_size/condor/run_cluster.py | 20 ++- .../scripts/cluster_size/run_combine.py | 38 +++-- .../scripts/cluster_size/run_init_tasks.py | 49 +++--- bye_splits/utils/cl_helpers.py | 57 +------ bye_splits/utils/common.py | 73 +-------- bye_splits/utils/job_helpers.py | 81 ++++++++++ config.yaml | 17 +- 10 files changed, 247 insertions(+), 333 deletions(-) delete mode 100644 bye_splits/production/produce.cc create mode 100644 bye_splits/utils/job_helpers.py diff --git a/README.md b/README.md index f3acfffd..a66bc6cd 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ The `PU0` files above were merged and are stored under `/data_CMS/cms/alves/L1HG ## Job Submission -Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor and a list of `arguments` that the script accepts. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, `files` which should be a `.txt` file containing the values you would like to iterate over, and `files_per_batch` which can be any number between 1 and the total number of values you would like to run. +Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor. The `arguments` sub-section should contain `key/value` pairs matching the expected arguments that `script` accepts. The variable that you would like to iterate over should be set in `iterOver` and its value should correspond to a `key` in the `arguments` sub-section whose value is a list containing the values the script should iterate over. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, and `args_per_batch` which can be any number between 1 and `len(arguments[])`. 
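
A minimal sketch of such a `job` block, following the structure added to `config.yaml` in this patch set (the paths and placeholder values below are illustrative only):

```yaml
job:
  user: <your_llr_username>          # placeholder
  proxy: ~/.t3/proxy.cert
  queue: short
  local: False
  script: /path/to/bye_splits/production/submit_scripts/condor_cluster_size.sh  # placeholder path
  iterOver: radius                   # must name a key of `arguments` whose value is a list
  arguments:
    radius: [0.01, 0.02, 0.03, 0.04, 0.05]
    particles: pions
    pileup: PU0
  pions:
    submit_dir: /path/to/submission/area/pions/   # placeholder path
    args_per_batch: 2
```

With the five radii and `args_per_batch: 2` shown here, the batching step would split the list into three condor jobs of at most two radii each.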
diff --git a/bye_splits/production/produce.cc b/bye_splits/production/produce.cc deleted file mode 100644 index cef61f1f..00000000 --- a/bye_splits/production/produce.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include -#include "include/skim.h" - -#include // for printf() -#include // for strtol() -#include // for errno -#include // for INT_MIN and INT_MAX -#include // for strlen - -int convert_to_int(char** argv, int idx) { - char* p; - errno = 0; // not 'int errno', because the '#include' already defined it - long arg = strtol(argv[idx], &p, 10); - if (*p != '\0' || errno != 0) { - return 1; // In main(), returning non-zero means failure - } - - if (arg < INT_MIN || arg > INT_MAX) { - return 1; - } - int arg_int = arg; - - // Everything went well, print it as a regular number plus a newline - return arg_int; -} - -void show_help(const po::options_description&, const std::string&); -po::variables_map process_program_options(int argc, char **argv); - -void validate(po::variables_map args) { - std::string particles = args["particles"].as(); - if(!(particles=="photons" || particles=="electrons" || particles=="pions")) { - throw po::validation_error(po::validation_error::invalid_option_value, "particles"); - } -} - -void show_help(const po::options_description& desc, - const std::string& topic = "") { - std::cout << desc << '\n'; - if (topic != "") { - std::cout << "You asked for help on: " << topic << '\n'; - } -} - -po::variables_map process_program_options(int argc, char **argv) -{ - po::options_description desc("Usage"); - desc.add_options() - ("help,h", - po::value()->implicit_value("") - ->notifier([&desc](const std::string &topic) {show_help(desc, topic);}), - "Show help. If given, show help on the specified topic.") - ("nevents", po::value()->default_value(-1), - "number of entries to consider, useful for debugging (-1 means all)") - ("particles", po::value()->required(), - "type of particle"); - - if (argc <= 1) { - show_help(desc); // does not return - exit( EXIT_SUCCESS ); - } - - po::variables_map args; - try { - po::store(po::parse_command_line(argc, argv, desc), args); - } - catch (po::error const& e) { - std::cerr << e.what() << '\n'; - exit( EXIT_FAILURE ); - } - po::notify(args); - validate(args); - return args; -} - -//Run with ./produce.exe photons -int main(int argc, char **argv) { - std::string dir = "/eos/user/b/bfontana/FPGAs/new_algos/"; - std::string tree_name = "FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple"; - - po::variables_map args = process_program_options(argc, argv); - if (args.count("help")) { - return 1; - } - - string particles = args["particles"].as(); - int nevents = args["nevents"].as(); - - std::string infile = particles + "_0PU_bc_stc_hadd.root"; - std::string events_str = nevents > 0 ? 
std::to_string(nevents) + "events_" : ""; - std::string outfile = "skim_" + events_str + infile; - skim(tree_name, dir + infile, dir + outfile, particles, nevents); - return 0; -} diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index fb7ed6b7..43334214 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -7,24 +7,29 @@ parent_dir = os.path.abspath(__file__ + 5 * "../") sys.path.insert(0, parent_dir) -from bye_splits.utils import params, common +from bye_splits.utils import params, common, job_helpers import subprocess import yaml -# Read particle specific variables from the YAML file -particle_var = lambda part, var: config["job"][part][var] - class JobBatches: + """Class for setting up job batches and setting configuration + variables. The function setup_batches() will take the list in + config[arguments[]] and return a list of lists containing + values in each sublist. Example for five total values + with = 2: + [0.01, 0.02, 0.03, 0.04, 0.05] --> [[0.01, 0.02], [0.03, 0.04], [0.05]]""" + def __init__(self, particle, config): self.particle = particle - self.config = config + self.config = config self.iterOver = config["job"]["iterOver"] + self.particle_var = lambda part, var: config["job"][part][var] def setup_batches(self): total = self.config["job"]["arguments"][self.iterOver] - vals_per_batch = particle_var(self.particle, "files_per_batch") + vals_per_batch = self.particle_var(self.particle, "args_per_batch") batches = [total[i: i + vals_per_batch] for i in range(0, len(total), vals_per_batch)] @@ -32,26 +37,75 @@ def setup_batches(self): class CondJobBase: def __init__(self, particle, config): - self.particle = particle - self.script = config["job"]["script"] - self.iterOver = config["job"]["iterOver"] - self.args = config["job"]["arguments"] - self.queue = config["job"]["queue"] - self.proxy = config["job"]["proxy"] - self.local = config["job"]["local"] - self.user = config["job"]["user"] - self.particle_dir = particle_var(self.particle, "submit_dir") - self.batches = JobBatches(particle, config).setup_batches() + self.particle = particle + self.script = config["job"]["script"] + self.iterOver = config["job"]["iterOver"] + self.args = config["job"]["arguments"] + self.queue = config["job"]["queue"] + self.proxy = config["job"]["proxy"] + self.local = config["job"]["local"] + self.user = config["job"]["user"] + self.batch = JobBatches(particle, config) + self.particle_dir = self.batch.particle_var(self.particle, "submit_dir") + self.batches = self.batch.setup_batches() + + def _write_arg_keys(self, current_version): + """Writes the line containing the argument + names to the buffer list.""" + + arg_keys = ["Arguments ="] + for arg in self.args.keys(): + arg_keys.append("$({})".format(arg)) + arg_keys = " ".join(arg_keys) + "\n" + + current_version.append(arg_keys) + + def _write_arg_values(self, current_version): + """Adds the argument values, where the batch lists are converted + to strings as [val_1, val_2, ...] --> "[val_1;val_2]". + The choice of a semicolon as the delimiter is arbitrary but it + cannot be a comma because this is the delimeter condor itself uses. 
+ + Example: + + queue radius, particle from ( + [0.01, 0.02], photon + ) + incorrectly assigns radius="[0.01", particle="0.02]" + + queue radius, particle from ( + [0.01;0.02], photon + ) + correctly assigns radius="[0.01, 0.02]", particle="photon" + """ + arg_keys = ", ".join(self.args.keys()) + arg_keys = "queue " + arg_keys + " from (\n" + current_version.append(arg_keys) + for batch in self.batches: + sub_args = list(self.args.keys())[1:] + arg_vals = [self.args[key] for key in sub_args] + all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals + all_vals = ", ".join(all_vals) + "\n" + current_version.append(all_vals) + + current_version.append(")") def prepare_batch_submission(self): + """Writes the .sh script that constitutes the executable + in the .sub script. The basename will be the same as the fundamental + script, followed by a version number. Stores the contents in a list that's + used as a buffer and checked against the content in previous + versions, only writing the file if an identical file doesn't + already exist. The version number will be incrimented in this case.""" + sub_dir = "{}subs/".format(self.particle_dir) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) + common.create_dir(sub_dir) + script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") submit_file_name_template = "{}{}_submit.sh".format(sub_dir, script_basename) - submit_file_versions = common.grab_most_recent(submit_file_name_template, return_all=True) + submit_file_versions = job_helpers.grab_most_recent(submit_file_name_template, return_all=True) current_version = [] current_version.append("#!/usr/bin/env bash\n") @@ -71,26 +125,27 @@ def prepare_batch_submission(self): current_version.append("bash {}".format(self.script)) # Write the file only if an identical file doesn't already exist - self.sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) + self.sub_file = job_helpers.conditional_write(submit_file_versions, submit_file_name_template, current_version) def prepare_multi_job_condor(self): + """Writes the .sub script that is submitted to HT Condor. 
+ Follows the same naming convention and conditional_write() + procedure as the previous function.""" + log_dir = "{}logs/".format(self.particle_dir) script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") job_file_name_template = "{}jobs/{}.sub".format(self.particle_dir, script_basename) - job_file_versions = common.grab_most_recent(job_file_name_template, return_all=True) + job_file_versions = job_helpers.grab_most_recent(job_file_name_template, return_all=True) current_version = [] current_version.append("executable = {}\n".format(self.sub_file)) current_version.append("Universe = vanilla\n") - if len(self.args) > 0: - current_version.append("Arguments =") - for arg in self.args.keys(): - current_version.append(" $({}) ".format(arg)) - current_version.append("\n") + if len(self.args) > 0: + self._write_arg_keys(current_version) current_version.append("output = {}{}_C$(Cluster)P$(Process).out\n".format(log_dir, script_basename)) current_version.append("error = {}{}_C$(Cluster)P$(Process).err\n".format(log_dir, script_basename)) @@ -100,52 +155,36 @@ def prepare_multi_job_condor(self): current_version.append("WNTag = el7\n") current_version.append('+SingularityCmd = ""\n') current_version.append("include: /opt/exp_soft/cms/t3/t3queue |\n") - + if len(self.args.keys()) > 0: - arg_keys = ", ".join(self.args.keys()) - arg_keys = "queue " + arg_keys + " from (\n" - current_version.append(arg_keys) - for batch in self.batches: - sub_args = list(self.args.keys())[1:] - arg_vals = [self.args[key] for key in sub_args] - all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals - all_vals = ", ".join(all_vals) + "\n" - current_version.append(all_vals) - - current_version.append(")") + self._write_arg_values(current_version) # Write the file only if an identical file doesn't already exist - self.submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) # Save to launch later + self.submission_file = job_helpers.conditional_write(job_file_versions, job_file_name_template, current_version) # Save to launch later class CondJob: + """Creates the job directories and files + with prepare_jobs() and runs the jobs with + launch_jobs().""" + def __init__(self, particle, config): self.base = CondJobBase(particle=particle, config=config) - def prepare_jobs(self): - configs = lambda dir: dir + "configs" - jobs = lambda dir: dir + "jobs" - logs = lambda dir: dir + "logs" - - config_dir = configs(self.base.particle_dir) - job_dir = jobs(self.base.particle_dir) - log_dir = logs(self.base.particle_dir) + config_dir = self.base.particle_dir + "configs" + job_dir = self.base.particle_dir + "jobs" + log_dir = self.base.particle_dir + "logs" - if not os.path.exists(config_dir): - os.makedirs(config_dir) - if not os.path.exists(job_dir): - os.makedirs(job_dir) - if not os.path.exists(log_dir): - os.makedirs(log_dir) + for d in (config_dir, job_dir, log_dir): + common.create_dir(d) self.base.prepare_batch_submission() self.base.prepare_multi_job_condor() - def launch_jobs(self): - if self.base.local == True: + if self.base.local: machine = "local" else: machine = "llrt3.in2p3.fr" diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index 52d9df0e..ee186f52 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -1,6 +1,6 @@ # coding: utf-8 -_all_ = [] +_all_ = ['cluster_radius'] import os import sys @@ 
-21,6 +21,11 @@ import yaml def cluster_radius(pars, cfg): + """Runs the default clustering algorithm using the + specified radius. Runs on both negative and positive + eta files, and adds layer weights to the cluster + kwargs if specified.""" + cluster_d = params.read_task_params("cluster") particles = pars["particles"] @@ -37,19 +42,20 @@ def cluster_radius(pars, cfg): if "weights" in cfg: cluster_d["weights"] = cfg["weights"] - for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): - name = cluster_d[key] + for eta_tag in ("negEta", "posEta"): + for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): + name = cluster_d[key] - cluster_d[key] = "{}_{}_{}_posEta_9oct".format(particles, pileup, name) - - nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) + cluster_d[key] = "{}_{}_{}_{}".format(particles, pileup, name, eta_tag) + + nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--radius", help="Coefficient to use as the max cluster radius", required=True, type=float) parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) - parser.add_argument("--weighted", help="Apply pre-calculated layer weights", default=False) + parser.add_argument("--weighted", help="Apply pre-caluclated layer weights", action="store_true") parsing.add_parameters(parser) FLAGS = parser.parse_args() diff --git a/bye_splits/scripts/cluster_size/run_combine.py b/bye_splits/scripts/cluster_size/run_combine.py index 731fd65e..ca44870c 100644 --- a/bye_splits/scripts/cluster_size/run_combine.py +++ b/bye_splits/scripts/cluster_size/run_combine.py @@ -1,6 +1,6 @@ # coding: utf-8 -_all_ = [] +_all_ = ['split_dfs', 'combine_normalize', 'combine_files_by_coef', 'split_and_norm', 'combine_cluster'] import os import sys @@ -9,14 +9,13 @@ parent_dir = os.path.abspath(__file__ + 3 * "/..") sys.path.insert(0, parent_dir) -from utils import params, common +from utils import params, common, parsing from data_handle.data_process import get_data_reco_chain_start import random import re -random.seed(10) import numpy as np import pandas as pd import sys @@ -25,6 +24,9 @@ from tqdm import tqdm def split_dfs(cl_df): + """Splits the dataframe created by cluster() into an + unweighted dataframe and a weighted dataframe.""" + weighted_cols = [col for col in cl_df.keys() if "weighted" in col] weighted_cols += [col for col in cl_df.keys() if "layer" in col] original_cols = [col.replace("weighted_","") for col in weighted_cols] @@ -36,7 +38,12 @@ def split_dfs(cl_df): return original_df, weighted_df -def normalize_df(cl_df, gen_df, dRThresh): +def combine_normalize(cl_df, gen_df, dRThresh): + """Combines a cluster dataframe with an associated + gen-level dataframe. 
Useful variables that may or may not + be present are added to the combined dataframe before + normalized energy and pt columns are added.""" + cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) combined_df = cl_df.join( gen_df.set_index("event"), on="event", how="inner" @@ -56,6 +63,10 @@ def normalize_df(cl_df, gen_df, dRThresh): return combined_df def combine_files_by_coef(in_dir, file_pattern): + """Combines .hdf5 cluster files of individual + radii into one .hdf5 file containing dataframes + for all radii.""" + files = [ file for file in os.listdir(in_dir) if re.search(file_pattern, file) != None and "valid" not in file ] @@ -70,11 +81,10 @@ def combine_files_by_coef(in_dir, file_pattern): def split_and_norm(df_cl, df_gen, dRthresh): original_df, weighted_df = split_dfs(df_cl) - normed_df, normed_weighted_df = normalize_df(original_df, df_gen, dRthresh), normalize_df(weighted_df, df_gen, dRthresh) + normed_df, normed_weighted_df = combine_normalize(original_df, df_gen, dRthresh), combine_normalize(weighted_df, df_gen, dRthresh) df_dict = {"original": normed_df, "weighted": normed_weighted_df} - df_cl = pd.Series(df_dict) - return df_cl + return pd.Series(df_dict) def combine_cluster(cfg, **pars): """Originally designed to combine the files returned by cluster for each radii, @@ -85,9 +95,9 @@ def combine_cluster(cfg, **pars): unweighted = pars["unweighted"] if "unweighted" in pars.keys() else False particles = cfg["particles"] - nevents = cfg["clusterStudies"]["nevents"] + nevents = pars.nevents - if input_file_path == None: + if input_file_path is None: pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" basename = cfg["clusterStudies"]["combination"][pileup][particles]["basename"] @@ -106,25 +116,25 @@ def combine_cluster(cfg, **pars): df_gen, _, _ = get_data_reco_chain_start( particles=particles, nevents=nevents, reprocess=False, tag = cfg["clusterStudies"]["parquetTag"] ) - #if "negEta" in sub_dir: if "negEta" in cl_size_out: df_gen = df_gen[ df_gen.gen_eta < 0 ] df_gen["gen_eta"] = abs(df_gen.gen_eta) else: df_gen = df_gen[ df_gen.gen_eta > 0 ] dRthresh = cfg["selection"]["deltarThreshold"] - if input_file_path != None: - clSizeOut["data"] = split_and_norm(clSizeOut["data"], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut["data"], df_gen, dRthresh) + if input_file_path is not None: + clSizeOut["data"] = split_and_norm(clSizeOut["data"], df_gen, dRthresh) if not unweighted else combine_normalize(clSizeOut["data"], df_gen, dRthresh) else: coef_keys = clSizeOut.keys() print("\nNormalizing Files:\n") for coef in tqdm(coef_keys): - clSizeOut[coef] = split_and_norm(clSizeOut[coef], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut[coef], df_gen, dRthresh) + clSizeOut[coef] = split_and_norm(clSizeOut[coef], df_gen, dRthresh) if not unweighted else combine_normalize(clSizeOut[coef], df_gen, dRthresh) if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--file", type=str) parser.add_argument("--unweighted", action="store_true") + parsing.add_parameters(parser) FLAGS = parser.parse_args() pars = common.dot_dict(vars(FLAGS)) @@ -134,4 +144,4 @@ def combine_cluster(cfg, **pars): for particles in ("photons", "electrons", "pions"): cfg.update({"particles": particles}) - combine_cluster(cfg) \ No newline at end of file + combine_cluster(cfg, pars) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py index 
e7c425da..ddba90bf 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -16,54 +16,60 @@ import argparse import random -random.seed(10) import sys import yaml def start_chain(pars, cfg): + """Runs the first three steps of the TPG on + negative and positive eta samples.""" + particles = cfg["selection"]["particles"] - pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" + pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" reprocess = cfg["clusterStudies"]["reprocess"] - nevents = cfg["clusterStudies"]["nevents"] - tag = cfg["clusterStudies"]["parquetTag"] + tag = cfg["clusterStudies"]["parquetTag"] + nevents = pars.nevents df_gen, df_cl, df_tc = get_data_reco_chain_start( particles=particles, nevents=nevents, reprocess=reprocess, tag=tag ) - df_gen_pos, df_gen_neg = df_gen[ df_gen.gen_eta > 0 ], df_gen[ df_gen.gen_eta < 0] - df_cl_pos, df_cl_neg = df_cl[ df_cl.cl3d_eta > 0 ], df_cl[ df_cl.cl3d_eta < 0] - df_tc_pos, df_tc_neg = df_tc[ df_tc.tc_eta > 0 ], df_tc[ df_tc.tc_eta < 0] + df_gen_pos, df_gen_neg = df_gen[ df_gen.gen_eta > 0 ], df_gen[ df_gen.gen_eta < 0 ] + df_cl_pos, df_cl_neg = df_cl[ df_cl.cl3d_eta > 0 ], df_cl[ df_cl.cl3d_eta < 0 ] + df_tc_pos, df_tc_neg = df_tc[ df_tc.tc_eta > 0 ], df_tc[ df_tc.tc_eta < 0 ] + + eta_dict = {"negEta": {"df_gen": df_gen_neg, + "df_cl": df_cl_neg, + "df_tc": df_tc_neg}, + "posEta": {"df_gen": df_gen_pos, + "df_cl": df_cl_pos, + "df_tc": df_tc_pos} + } - for i in range(2): - if i==1: - df_gen, df_cl, df_tc = df_gen_neg, df_cl_neg, df_tc_neg - eta_tag = "negEta" + for eta_tag, dfs in eta_dict.items(): + df_gen, df_cl, df_tc = dfs.values() + + if eta_tag=="negEta": df_gen["gen_eta"], df_cl["cl3d_eta"], df_tc["tc_eta"], df_tc["tc_z"] = abs(df_gen.gen_eta), abs(df_cl.cl3d_eta), abs(df_tc.tc_eta), abs(df_tc.tc_z) - else: - df_gen, df_cl, df_tc = df_gen_pos, df_cl_pos, df_tc_pos - eta_tag = "posEta" print(f"{particles}: {eta_tag}") fill_d = params.read_task_params("fill") - #for key in ("FillOut", "FillOutComp", "FillOutPlot"): for key in ("FillOut", "FillOutGenCl", "FillOutTcAll"): name = fill_d[key] - fill_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) + fill_d[key] = "{}_{}_{}_{}_pt_test".format(particles, pileup, name, eta_tag) tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d) smooth_d = params.read_task_params("smooth") for key in ("SmoothIn", "SmoothOut"): name = smooth_d[key] - smooth_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) + smooth_d[key] = "{}_{}_{}_{}_pt_test".format(particles, pileup, name, eta_tag) tasks.smooth.smooth(pars, **smooth_d) seed_d = params.read_task_params("seed") for key in ("SeedIn", "SeedOut"): name = seed_d[key] - seed_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) + seed_d[key] = "{}_{}_{}_{}_pt_test".format(particles, pileup, name, eta_tag) tasks.seed.seed(pars, **seed_d) if __name__ == "__main__": @@ -75,9 +81,6 @@ def start_chain(pars, cfg): with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) - cfg["selection"]["particles"] = "pions" - start_chain(common.dot_dict(vars(FLAGS)), cfg) - - '''for particles in ("photons", "electrons", "pions"): + for particles in ("photons", "electrons", "pions"): cfg["selection"]["particles"] = particles - start_chain(common.dot_dict(vars(FLAGS)), cfg)''' \ No newline at end of file + start_chain(common.dot_dict(vars(FLAGS)), cfg) \ No newline at end of file diff --git a/bye_splits/utils/cl_helpers.py 
b/bye_splits/utils/cl_helpers.py index 0d376483..cb3c7d2c 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -9,14 +9,8 @@ import numpy as np import pandas as pd -#from bye_splits.utils import common, parsing, params from bye_splits.utils import common, params -'''parser = argparse.ArgumentParser(description="Seeding standalone step.") -parsing.add_parameters(parser) -FLAGS = parser.parse_args()''' - - def get_last_version(name): """Takes a template path, such as '/full/path/to/my_file.ext' and returns the path to the latest version corresponding to '/full/path/to/my_file_vN.ext' where N is the latest version number in the directory. @@ -25,7 +19,6 @@ def get_last_version(name): base, ext = os.path.splitext(base) dir = os.path.dirname(name) if os.path.exists: - # pattern = rf"{base}_v(\d{ext})" pattern = r"{}_v(\d{})".format(base, ext) matches = [re.match(pattern, file) for file in os.listdir(dir)] version = max( @@ -37,17 +30,12 @@ def get_last_version(name): ) return version - def update_version_name(name): """Takes the same template path as get_last_version(), and uses it to update the version number.""" base, ext = os.path.splitext(name) version = 0 if not os.path.exists(name) else get_last_version(name) return f"{base}_v{str(version+1)}{ext}" - -# def_k = 0.0 - - def closest(list, k=0.0): """Find the element of a list containing strings ['coef_{float_1}', 'coef_{float_2}', ...] which is closest to some float_i""" try: @@ -157,7 +145,6 @@ def get_input_files(base_path, pile_up=False): def read_weights(dir, cfg, version="final", mode="weights"): weights_by_particle = {} - #for particle in ("photons", "electrons", "pions"): for particle in ("photons", "pions"): basename = "optimization_selectOneEffRms_maxSeed_bc_stc" if particle == "pions" else "optimization_selectOneStd_adjustMaxWeight_maxSeed" @@ -176,46 +163,4 @@ def read_weights(dir, cfg, version="final", mode="weights"): weights_by_particle["electrons"] = weights_by_particle["photons"] - return weights_by_particle - - -'''def get_output_files(cfg): - """Accepts a configuration file containing the base directory, a file basename, local (Bool) and pileUp (Bool). 
- Finds the full paths of the files created by cluster_size.py, and returns - a dictionary corresponding to particles:[file_paths].""" - - output_files = {"photons": [], "pions": [], "electrons": []} - template = os.path.basename( - common.fill_path(cfg["clusterStudies"]["fileBaseName"], **vars(FLAGS)) - ) - template = re.split("_", template) - if cfg["clusterStudies"]["local"]: - base_path = cfg["clusterStudies"]["localDir"] - else: - base_path = ( - params.EOSStorage(FLAGS.user, "data/PU0/") - if not cfg["clusterStudies"]["pileUp"] - else params.EOSStorage(FLAGS.user, "data/PU200/") - ) - for particles in output_files.keys(): - particle_dir = ( - base_path + particles + "/" if cfg["clusterStudies"]["local"] else base_path - ) - files = [re.split("_", file) for file in os.listdir(particle_dir)] - for filename in files: - if set(template).issubset(set(filename)): - path = os.path.join(f"{particle_dir}{'_'.join(filename)}") - with pd.HDFStore(path, "r") as File: - if len(File.keys()) > 0: - if ("photon" in filename) or ("photons" in filename): - output_files["photons"].append(path) - elif ("electron" in filename) or ("electrons" in filename): - output_files["electrons"].append(path) - else: - output_files["pions"].append(path) - - # Get rid of duplicates that the dictionary filling produced - for key in output_files.keys(): - output_files[key] = list(set(output_files[key])) - - return output_files''' + return weights_by_particle \ No newline at end of file diff --git a/bye_splits/utils/common.py b/bye_splits/utils/common.py index fbaa2653..b2138620 100644 --- a/bye_splits/utils/common.py +++ b/bye_splits/utils/common.py @@ -151,75 +151,4 @@ def std_eff(values, c=0.68): def seed_extra_name(cfg): s = '_hexdist' if cfg['seed_cs']['hexDist'] else '' s += '_' + cfg['seed_cs']['InputName'] - return s - -# Accepts a template for a full path to a file and increments the version -def increment_version(file_path): - dir, file = os.path.split(file_path) - base, ext = os.path.splitext(file) - i = 0 - file_path = "{}/{}_v{}{}".format(dir, base, i, ext) - while os.path.exists(file_path): - i += 1 - file_path = "{}/{}_v{}{}".format(dir, base, i, ext) - return file_path - - -# Grab the most recent version of the file corresponding to the template file_path (or return all matches) -def grab_most_recent(file_path, return_all=False): - dir, file = os.path.split(file_path) - base, ext = os.path.splitext(file) - files = os.listdir(dir) - version_pattern = re.compile("{}_v(\\d+)\\{}".format(base, ext)) - matches = [version_pattern.search(file) for file in files] - matches = [match for match in matches if not match is None] - if len(matches) > 0: - matches = [int(match.group(1)) for match in matches] - most_recent = max(matches) - if not return_all: - file_path = dir + "/" + base + "_v" + str(most_recent) + ext - else: - file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] - return file_path - -def compare_file_contents(file_path, buffer_list): - """ - Compares the content in with , - which should be a list of strings that you wish to write - to a new file. 
- """ - with open(file_path, "r") as file: - contents = file.readlines() - return contents==buffer_list - -def write_file_version(template, version): - file_name = increment_version(template) - with open(file_name, "w") as job_file: - job_file.writelines(version) - st = os.stat(file_name) - os.chmod(file_name, st.st_mode | 0o744) - return file_name - -def conditional_write(file_versions, file_template, current_version): - """ - Loops through the files in , comparing their contents - to the current version. If an identical version is found, the function - breaks and does nothing. Otherwise, it will write the contents in - to an updated version number whose basename corresponds to - . - """ - if file_versions != None: - identical_version = False - for file in file_versions: - if not compare_file_contents(file, current_version): - continue - else: - identical_version = True - file_path = file - break - if not identical_version: - file_path = write_file_version(file_template, current_version) - - else: - file_path = write_file_version(file_template, current_version) - return file_path + return s \ No newline at end of file diff --git a/bye_splits/utils/job_helpers.py b/bye_splits/utils/job_helpers.py new file mode 100644 index 00000000..49e84165 --- /dev/null +++ b/bye_splits/utils/job_helpers.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import re + +import os +import sys + +parent_dir = os.path.abspath(__file__ + 5 * "../") +sys.path.insert(0, parent_dir) + +_all_ = ['increment_version', 'grab_most_recent', 'compare_file_contents', 'write_file_version', 'conditional_write'] + +# Accepts a template for a full path to a file and increments the version +def increment_version(file_path): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + i = 0 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + while os.path.exists(file_path): + i += 1 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + return file_path + +# Grab the most recent version of the file corresponding to the template file_path (or return all matches) +def grab_most_recent(file_path, return_all=False): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + files = os.listdir(dir) + version_pattern = re.compile("{}_v(\\d+)\\{}".format(base, ext)) + matches = [version_pattern.search(file) for file in files] + matches = [match for match in matches if not match is None] + if len(matches) > 0: + matches = [int(match.group(1)) for match in matches] + most_recent = max(matches) + if not return_all: + file_path = dir + "/" + base + "_v" + str(most_recent) + ext + else: + file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] + return file_path + +def compare_file_contents(file_path, buffer_list): + """ + Compares the content in with , + which should be a list of strings that you wish to write + to a new file. + """ + with open(file_path, "r") as file: + contents = file.readlines() + return contents==buffer_list + +def write_file_version(template, version): + file_name = increment_version(template) + with open(file_name, "w") as job_file: + job_file.writelines(version) + st = os.stat(file_name) + os.chmod(file_name, st.st_mode | 0o744) + return file_name + +def conditional_write(file_versions, file_template, current_version): + """ + Loops through the files in , comparing their contents + to the current version. If an identical version is found, the function + breaks and does nothing. 
Otherwise, it will write the contents in + to an updated version number whose basename corresponds to + . + """ + if file_versions != None: + identical_version = False + for file in file_versions: + if not compare_file_contents(file, current_version): + continue + else: + identical_version = True + file_path = file + break + if not identical_version: + file_path = write_file_version(file_template, current_version) + + else: + file_path = write_file_version(file_template, current_version) + return file_path \ No newline at end of file diff --git a/config.yaml b/config.yaml index 81b8e0fe..d61020a4 100644 --- a/config.yaml +++ b/config.yaml @@ -72,7 +72,7 @@ varEvents: geta: 'good_genpart_exeta' gphi: 'good_genpart_exphi' gen: 'good_genpart_energy' -# gpt: 'good_genpart_pt' + gpt: 'good_genpart_pt' tc: event: 'event' wu: 'good_tc_waferu' @@ -151,30 +151,25 @@ job: 0.046, 0.047, 0.048, 0.049, 0.05 ] particles: pions pileup: PU0 - read_dir: False # {True: read arguments as paths in /ntuples, False: read arguments stored on the .txt } photons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ - files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 10 + args_per_batch: 10 electrons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ - files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 1 + args_per_batch: 10 pions: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ - files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 10 + args_per_batch: 10 clusterStudies: localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ ehleDir: /eos/user/i/iehle/ dataFolder: data/ reinit: True #False - reprocess: False # Reproduce .parquet file - parquetTag: jobTest + reprocess: True # Reproduce .parquet file + parquetTag: pt_test clusterSizeBaseName: cluster_size_jobTest_yesWeight coeffs: [0.0, 0.5, 50] - #nevents: 100 nevents: -1 pileup: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple From 8161d31e69a3942b89df3758216c73f916c18baa Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Thu, 12 Oct 2023 17:04:34 +0200 Subject: [PATCH 07/12] implementation of final comments --- .../submit_scripts/condor_cluster_size.sh | 2 +- .../production/submit_scripts/job_submit.py | 18 ++++++------ .../cluster_size/condor/run_cluster.py | 6 +--- .../scripts/cluster_size/run_combine.py | 15 ++++------ .../scripts/cluster_size/run_init_tasks.py | 7 ++--- bye_splits/utils/cl_helpers.py | 29 +++++++++---------- bye_splits/utils/common.py | 3 -- bye_splits/utils/job_helpers.py | 8 +++-- config.yaml | 6 ++-- 9 files changed, 40 insertions(+), 54 deletions(-) diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh index 8f0e26a7..a2830f2c 100644 --- a/bye_splits/production/submit_scripts/condor_cluster_size.sh +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -cd /home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ +cd /home/llr/cms/${USER}/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ radius=() particles="" diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index 43334214..ed184f9e 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -2,13 +2,13 @@ import os import sys -from datetime import datetime parent_dir = 
os.path.abspath(__file__ + 5 * "../") sys.path.insert(0, parent_dir) from bye_splits.utils import params, common, job_helpers +from datetime import datetime import subprocess import yaml @@ -53,10 +53,10 @@ def _write_arg_keys(self, current_version): """Writes the line containing the argument names to the buffer list.""" - arg_keys = ["Arguments ="] + arg_keys = "Arguments =" for arg in self.args.keys(): - arg_keys.append("$({})".format(arg)) - arg_keys = " ".join(arg_keys) + "\n" + arg_keys += " $({})".format(arg) + arg_keys += "\n" current_version.append(arg_keys) @@ -92,11 +92,11 @@ def _write_arg_values(self, current_version): def prepare_batch_submission(self): """Writes the .sh script that constitutes the executable - in the .sub script. The basename will be the same as the fundamental - script, followed by a version number. Stores the contents in a list that's - used as a buffer and checked against the content in previous - versions, only writing the file if an identical file doesn't - already exist. The version number will be incrimented in this case.""" + in the .sub script. The basename will be the same as the running script, i.e. + the script set in the configuration file. This is then followed by a version number. + Stores the contents in a list that's used as a buffer and checked against the content + in previous versions, only writing the file if an identical file doesn't already exist. + The version number will be incrimented in this case.""" sub_dir = "{}subs/".format(self.particle_dir) diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index ee186f52..c12289c6 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -12,12 +12,8 @@ from utils import params, common, parsing, cl_helpers import argparse -import random - -random.seed(10) import numpy as np import pandas as pd - import yaml def cluster_radius(pars, cfg): @@ -67,7 +63,7 @@ def cluster_radius(pars, cfg): cfg = yaml.safe_load(afile) if pars.weighted: - weight_dir = "{}/PU0/".format(params.LocalStorage) + weight_dir = os.path.join(params.LocalStorage, "PU0/") weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) weights = weights_by_particle[pars.particles][radius_str] cfg["weights"] = weights diff --git a/bye_splits/scripts/cluster_size/run_combine.py b/bye_splits/scripts/cluster_size/run_combine.py index ca44870c..33bfba39 100644 --- a/bye_splits/scripts/cluster_size/run_combine.py +++ b/bye_splits/scripts/cluster_size/run_combine.py @@ -4,22 +4,17 @@ import os import sys -import argparse parent_dir = os.path.abspath(__file__ + 3 * "/..") sys.path.insert(0, parent_dir) from utils import params, common, parsing - from data_handle.data_process import get_data_reco_chain_start -import random +import argparse import re - import numpy as np import pandas as pd -import sys - import yaml from tqdm import tqdm @@ -68,7 +63,7 @@ def combine_files_by_coef(in_dir, file_pattern): for all radii.""" files = [ - file for file in os.listdir(in_dir) if re.search(file_pattern, file) != None and "valid" not in file + file for file in os.listdir(in_dir) if re.search(file_pattern, file) is not None and "valid" not in file ] coef_pattern = r"coef_0p(\d+)" out_path = common.fill_path(file_pattern, data_dir=in_dir) @@ -80,10 +75,10 @@ def combine_files_by_coef(in_dir, file_pattern): clusterSizeOut[key] = clSizeCoef["/data"] def split_and_norm(df_cl, df_gen, dRthresh): - original_df, 
weighted_df = split_dfs(df_cl) - normed_df, normed_weighted_df = combine_normalize(original_df, df_gen, dRthresh), combine_normalize(weighted_df, df_gen, dRthresh) + o_df, w_df = split_dfs(df_cl) + normed_df, normed_w_df = combine_normalize(o_df, df_gen, dRthresh), combine_normalize(w_df, df_gen, dRthresh) df_dict = {"original": normed_df, - "weighted": normed_weighted_df} + "weighted": normed_w_df} return pd.Series(df_dict) def combine_cluster(cfg, **pars): diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py index ddba90bf..ff64c8df 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -10,14 +10,10 @@ import tasks from utils import params, common, parsing - from data_handle.data_process import get_data_reco_chain_start import argparse import random - -import sys - import yaml def start_chain(pars, cfg): @@ -25,10 +21,10 @@ def start_chain(pars, cfg): negative and positive eta samples.""" particles = cfg["selection"]["particles"] - pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" reprocess = cfg["clusterStudies"]["reprocess"] tag = cfg["clusterStudies"]["parquetTag"] nevents = pars.nevents + pileup = pars.pileup df_gen, df_cl, df_tc = get_data_reco_chain_start( particles=particles, nevents=nevents, reprocess=reprocess, tag=tag @@ -74,6 +70,7 @@ def start_chain(pars, cfg): if __name__ == "__main__": parser = argparse.ArgumentParser(description="") + parser.add_argument("--pileup", help="tag for pileup choice", default="PU0") parsing.add_parameters(parser) FLAGS = parser.parse_args() diff --git a/bye_splits/utils/cl_helpers.py b/bye_splits/utils/cl_helpers.py index cb3c7d2c..b778afd4 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -1,15 +1,15 @@ import os import sys -import re -import argparse parent_dir = os.path.abspath(__file__ + 3 * "/..") sys.path.insert(0, parent_dir) +from bye_splits.utils import common, params + import numpy as np import pandas as pd - -from bye_splits.utils import common, params +import re +import argparse def get_last_version(name): """Takes a template path, such as '/full/path/to/my_file.ext' and returns the path to the latest version @@ -19,11 +19,11 @@ def get_last_version(name): base, ext = os.path.splitext(base) dir = os.path.dirname(name) if os.path.exists: - pattern = r"{}_v(\d{})".format(base, ext) + pattern = r"{}_v(\d){}".format(base, ext) matches = [re.match(pattern, file) for file in os.listdir(dir)] version = max( [ - int(match.group(1).replace(ext, "")) + int(match.group(1)) for match in matches if not match is None ] @@ -36,19 +36,15 @@ def update_version_name(name): version = 0 if not os.path.exists(name) else get_last_version(name) return f"{base}_v{str(version+1)}{ext}" -def closest(list, k=0.0): +def closest(coef_list, k=0.0): """Find the element of a list containing strings ['coef_{float_1}', 'coef_{float_2}', ...] which is closest to some float_i""" - try: - list = np.reshape(np.asarray(list), 1) - except ValueError: - list = np.asarray(list) + coef_list = np.asarray(coef_list) if isinstance(k, str): k_num = float(re.split("coef_", k)[1].replace("p", ".")) else: k_num = k - id = (np.abs(list - k_num)).argmin() - return list[id] - + id = (np.abs(coef_list - k_num)).argmin() + return coef_list[id] def get_str(coef, df_dict): """Accepts a coefficient, either as a float or string starting with coef_, along with a dictionary of coefficient:DataFrame pairs. 
@@ -66,7 +62,6 @@ def get_str(coef, df_dict): coef_str = "/coef_{}".format(str(new_coef).replace(".", "p")) return coef_str - # Old Naming Conventions used different column names in the dataframes column_matching = { "etanew": "eta", @@ -161,6 +156,10 @@ def read_weights(dir, cfg, version="final", mode="weights"): weights_by_particle[particle] = weights_by_radius + '''Weights are calculated from pt_norm distributions, which + are distorted by brem events for electrons. As this is + a physics effect uncorrelated to the TPG response, we correct + electrons with weights derived from photon pt_norm distributions''' weights_by_particle["electrons"] = weights_by_particle["photons"] return weights_by_particle \ No newline at end of file diff --git a/bye_splits/utils/common.py b/bye_splits/utils/common.py index b2138620..d362e5df 100644 --- a/bye_splits/utils/common.py +++ b/bye_splits/utils/common.py @@ -16,9 +16,6 @@ import numpy as np import pandas as pd -import re - - def binConv(vals, dist, amin): """ Converts bin indexes back to values (central values in the bin). diff --git a/bye_splits/utils/job_helpers.py b/bye_splits/utils/job_helpers.py index 49e84165..764d38bd 100644 --- a/bye_splits/utils/job_helpers.py +++ b/bye_splits/utils/job_helpers.py @@ -10,8 +10,8 @@ _all_ = ['increment_version', 'grab_most_recent', 'compare_file_contents', 'write_file_version', 'conditional_write'] -# Accepts a template for a full path to a file and increments the version def increment_version(file_path): + """Accepts a template for a full path to a file and increments the version""" dir, file = os.path.split(file_path) base, ext = os.path.splitext(file) i = 0 @@ -21,8 +21,8 @@ def increment_version(file_path): file_path = "{}/{}_v{}{}".format(dir, base, i, ext) return file_path -# Grab the most recent version of the file corresponding to the template file_path (or return all matches) def grab_most_recent(file_path, return_all=False): + """Grab the most recent version of the file corresponding to the template file_path (or return all matches)""" dir, file = os.path.split(file_path) base, ext = os.path.splitext(file) files = os.listdir(dir) @@ -37,6 +37,8 @@ def grab_most_recent(file_path, return_all=False): else: file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] return file_path + else: + raise ValueError("There are no versions of the passed file: {}".format(file_path)) def compare_file_contents(file_path, buffer_list): """ @@ -64,7 +66,7 @@ def conditional_write(file_versions, file_template, current_version): to an updated version number whose basename corresponds to . 
""" - if file_versions != None: + if file_versions is not None: identical_version = False for file in file_versions: if not compare_file_contents(file, current_version): diff --git a/config.yaml b/config.yaml index d61020a4..d9888518 100644 --- a/config.yaml +++ b/config.yaml @@ -148,7 +148,7 @@ job: 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, - 0.046, 0.047, 0.048, 0.049, 0.05 ] + 0.046, 0.047, 0.048, 0.049, 0.05] particles: pions pileup: PU0 photons: @@ -162,7 +162,7 @@ job: args_per_batch: 10 clusterStudies: - localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ + localDir: /home/llr/cms/ehle/NewRepos/bye_splits/data/new_algos/ ehleDir: /eos/user/i/iehle/ dataFolder: data/ reinit: True #False @@ -174,7 +174,7 @@ clusterStudies: pileup: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple fileBaseName: energy_out - local: False + local: True base: NbinsRz: 42 From 482e84cd1bab79fa1fe7ce6f4a1f82aff869c3b2 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Fri, 13 Oct 2023 13:46:43 +0200 Subject: [PATCH 08/12] minor updates/corrections --- .../production/submit_scripts/condor_cluster_size.sh | 2 +- bye_splits/production/submit_scripts/job_submit.py | 3 ++- bye_splits/scripts/cluster_size/condor/run_cluster.py | 2 +- bye_splits/scripts/cluster_size/run_init_tasks.py | 2 +- bye_splits/utils/cl_helpers.py | 9 ++++----- config.yaml | 5 +++++ 6 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh index a2830f2c..ea696e46 100644 --- a/bye_splits/production/submit_scripts/condor_cluster_size.sh +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -cd /home/llr/cms/${USER}/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ +cd ${HOME}/bye_splits/bye_splits/scripts/cluster_size/condor/ radius=() particles="" diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index ed184f9e..9b3c65c1 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -77,7 +77,8 @@ def _write_arg_values(self, current_version): [0.01;0.02], photon ) correctly assigns radius="[0.01, 0.02]", particle="photon" - """ + """ + if "particles" in self.args.keys(): self.args["particles"] = self.particle arg_keys = ", ".join(self.args.keys()) arg_keys = "queue " + arg_keys + " from (\n" current_version.append(arg_keys) diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index c12289c6..e779035f 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -33,7 +33,7 @@ def cluster_radius(pars, cfg): str(round(radius, 3)).replace(".", "p"), ) cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_radius, cl_size_radius+"_valid" - cluster_d["CoeffA"] = [radius] * 50 + cluster_d["CoeffA"] = [radius] * (cfg["geometry"]["nlayersCEE"]+cfg["geometry"]["nlayersCEH"]) # Radii in each of the HGCAL layers if "weights" in cfg: cluster_d["weights"] = cfg["weights"] diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py 
b/bye_splits/scripts/cluster_size/run_init_tasks.py index ff64c8df..89a55f7e 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -1,6 +1,6 @@ # coding: utf-8 -_all_ = [] +_all_ = ['start_chain'] import os import sys diff --git a/bye_splits/utils/cl_helpers.py b/bye_splits/utils/cl_helpers.py index b778afd4..465aba86 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -138,13 +138,12 @@ def get_input_files(base_path, pile_up=False): return input_files -def read_weights(dir, cfg, version="final", mode="weights"): +def read_weights(dir, cfg, version="layer", mode="weights"): weights_by_particle = {} - for particle in ("photons", "pions"): - basename = "optimization_selectOneEffRms_maxSeed_bc_stc" if particle == "pions" else "optimization_selectOneStd_adjustMaxWeight_maxSeed" + weight_path_templates = cfg["clusterStudies"]["weights"][version] + for particle, basename in weight_path_templates.items(): - version_dir = "{}/".format(version) - particle_dir = "{}{}/optimization/official/{}".format(dir, particle, version_dir) + particle_dir = os.path.join(dir, particle, cfg["clusterStudies"]["weights"]["subDir"]) files = [f for f in os.listdir(particle_dir) if basename in f] weights_by_radius = {} diff --git a/config.yaml b/config.yaml index d9888518..94cf9f8f 100644 --- a/config.yaml +++ b/config.yaml @@ -175,6 +175,11 @@ clusterStudies: tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple fileBaseName: energy_out local: True + weights: + subDir: optimization/official/final/ + layer: + photons: optimization_selectOneStd_adjustMaxWeight_maxSeed + pions: optimization_selectOneEffRms_maxSeed_bc_stc base: NbinsRz: 42 From fc9e0002d2c5befe56f12c011c6794eebe62bf5d Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Sat, 14 Oct 2023 16:10:03 +0200 Subject: [PATCH 09/12] implemented Marco's comments --- README.md | 66 ++++++++++++++++++- .../production/submit_scripts/job_submit.py | 6 +- bye_splits/utils/job_helpers.py | 7 +- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a66bc6cd..f27151de 100644 --- a/README.md +++ b/README.md @@ -103,8 +103,70 @@ The `PU0` files above were merged and are stored under `/data_CMS/cms/alves/L1HG ## Job Submission -Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor. The `arguments` sub-section should contain `key/value` pairs matching the expected arguments that `script` accepts. The variable that you would like to iterate over should be set in `iterOver` and its value should correspond to a `key` in the `arguments` sub-section whose value is a list containing the values the script should iterate over. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, and `args_per_batch` which can be any number between 1 and `len(arguments[])`. - +Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. 
The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor. The `arguments` sub-section should contain `key/value` pairs matching the expected arguments that `script` accepts. The variable that you would like to iterate over should be set in `iterOver` and its value should correspond to a `key` in the `arguments` sub-section whose value is a list containing the values the script should iterate over. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, and `args_per_batch` which can be any number between 1 and `len(arguments[])`. An example of the `job` configuration settings is as such: + + job: + user: iehle + proxy: ~/.t3/proxy.cert + queue: short + local: False + script: /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh + iterOver: radius + arguments: + radius: [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, + 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, + 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, + 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, + 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, + 0.046, 0.047, 0.048, 0.049, 0.05] + particles: photons + pileup: PU0 + photons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ + args_per_batch: 10 + electrons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ + args_per_batch: 10 + pions: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ + args_per_batch: 10 + +After setting the configuration variables, the jobs are created and launched via + + python bye_splits/production/submit_scripts/job_submit.py + +and will produce both the executable `.sh` file which will have a form like + + #!/usr/bin/env bash + workdir=/grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts + cd $workdir + export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch + export SITECONFIG_PATH=$VO_CMS_SW_DIR/SITECONF/T2_FR_GRIF_LLR/GRIF-LLR/ + source $VO_CMS_SW_DIR/cmsset_default.sh + bash /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh --radius $1 --particles $2 --pileup $3 + +and the `.sub` file submitted to HT Condor: + + executable = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/subs/condor_cluster_size_submit_v1.sh + Universe = vanilla + Arguments = $(radius) $(particles) $(pileup) + output = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/logs/condor_cluster_size_C$(Cluster)P$(Process).out + error = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/logs/condor_cluster_size_C$(Cluster)P$(Process).err + log = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/logs/condor_cluster_size_C$(Cluster)P$(Process).log + getenv = true + T3Queue = short + WNTag = el7 + +SingularityCmd = "" + include: /opt/exp_soft/cms/t3/t3queue | + queue radius, particles, pileup from ( + [0.001;0.002;0.003;0.004;0.005;0.006;0.007;0.008;0.009;0.01], photons, PU0 + [0.011;0.012;0.013;0.014;0.015;0.016;0.017;0.018;0.019;0.02], photons, PU0 + [0.021;0.022;0.023;0.024;0.025;0.026;0.027;0.028;0.029;0.03], photons, PU0 + [0.031;0.032;0.033;0.034;0.035;0.036;0.037;0.038;0.039;0.04], photons, PU0 + [0.041;0.042;0.043;0.044;0.045;0.046;0.047;0.048;0.049;0.05], photons, PU0 + ) + +This describes just one example, and other uses can be easily implemented. 
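If the receiving script is written in Python rather than bash, note that each batched value arrives as a single bracketed, semicolon-delimited string (e.g. "[0.001;0.002;0.003]", following the `queue ... from (...)` block shown above). Below is a minimal sketch of how such a string could be unpacked, assuming it arrives exactly as written in the submit file; the helper name is illustrative and not part of the repository:

    def parse_radius_batch(arg):
        # "[0.001;0.002;0.003]" -> [0.001, 0.002, 0.003]
        return [float(v) for v in arg.strip("[]").split(";") if v]
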
You may, for example, run the [skimming procedure](#skimming) on HT Condor with a small bash script that accepts `--particles` and `--nevents` as arguments, and update the configuration variables accordingly. # Reconstruction Chain diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index 9b3c65c1..e2b32594 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -27,11 +27,11 @@ def __init__(self, particle, config): self.particle_var = lambda part, var: config["job"][part][var] def setup_batches(self): - total = self.config["job"]["arguments"][self.iterOver] + total_vals = self.config["job"]["arguments"][self.iterOver] vals_per_batch = self.particle_var(self.particle, "args_per_batch") - batches = [total[i: i + vals_per_batch] for i in range(0, len(total), vals_per_batch)] + batches = [total_vals[i: i + vals_per_batch] for i in range(0, len(total_vals), vals_per_batch)] return batches @@ -86,7 +86,7 @@ def _write_arg_values(self, current_version): sub_args = list(self.args.keys())[1:] arg_vals = [self.args[key] for key in sub_args] all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals - all_vals = ", ".join(all_vals) + "\n" + all_vals = ", ".join(map(str, all_vals)) + "\n" current_version.append(all_vals) current_version.append(")") diff --git a/bye_splits/utils/job_helpers.py b/bye_splits/utils/job_helpers.py index 764d38bd..428a4a87 100644 --- a/bye_splits/utils/job_helpers.py +++ b/bye_splits/utils/job_helpers.py @@ -22,7 +22,8 @@ def increment_version(file_path): return file_path def grab_most_recent(file_path, return_all=False): - """Grab the most recent version of the file corresponding to the template file_path (or return all matches)""" + """Grab the most recent version of the file corresponding to the template file_path (or return all matches). + Returns None if no files mathing template have been written.""" dir, file = os.path.split(file_path) base, ext = os.path.splitext(file) files = os.listdir(dir) @@ -38,7 +39,7 @@ def grab_most_recent(file_path, return_all=False): file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] return file_path else: - raise ValueError("There are no versions of the passed file: {}".format(file_path)) + return None def compare_file_contents(file_path, buffer_list): """ @@ -64,7 +65,7 @@ def conditional_write(file_versions, file_template, current_version): to the current version. If an identical version is found, the function breaks and does nothing. Otherwise, it will write the contents in to an updated version number whose basename corresponds to - . + . If file_versions is None, writes the v0 version. """ if file_versions is not None: identical_version = False From 5dd003fbae73a4af9fd573ec84b302d28ba3efa5 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 16 Oct 2023 19:02:30 +0200 Subject: [PATCH 10/12] use python files as