From a17bce83f9a2d99678dc50124c2399ad5ff0d4ea Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Fri, 6 Oct 2023 15:12:15 +0200 Subject: [PATCH 01/12] copy over submission related scripts --- .../submit_scripts/condor_cluster_size.sh | 11 + .../production/submit_scripts/job_submit.py | 226 ++++++++++++++++++ .../cluster_size/condor/cluster_opt.py | 138 +++++++++++ .../cluster_size/condor/run_cluster.py | 65 +++++ .../scripts/cluster_size/run_combine.py | 137 +++++++++++ .../scripts/cluster_size/run_init_tasks.py | 81 +++++++ config.yaml | 22 +- 7 files changed, 679 insertions(+), 1 deletion(-) create mode 100644 bye_splits/production/submit_scripts/condor_cluster_size.sh create mode 100644 bye_splits/production/submit_scripts/job_submit.py create mode 100644 bye_splits/scripts/cluster_size/condor/cluster_opt.py create mode 100644 bye_splits/scripts/cluster_size/condor/run_cluster.py create mode 100644 bye_splits/scripts/cluster_size/run_combine.py create mode 100644 bye_splits/scripts/cluster_size/run_init_tasks.py diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh new file mode 100644 index 00000000..73b9146d --- /dev/null +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +cd /home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ + +# Coefficients (radii) stored in .txt file, run cluster step on each radius +coef_file=$1 +particles=$2 +pileup=$3 +while read -r line; do + python run_cluster.py --coef "$line" --particles "$particles" --pileup "$pileup" +done <$coef_file \ No newline at end of file diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py new file mode 100644 index 00000000..c82e2e13 --- /dev/null +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python + +import os +import sys +from datetime import datetime + +parent_dir = os.path.abspath(__file__ + 5 * "../") +sys.path.insert(0, parent_dir) + +from bye_splits.utils import params, common + +import subprocess +import yaml + +# Read particle specific variables from the YAML file +particle_var = lambda part, var: config["job"][part][var] + +class JobBatches: + def __init__(self, particle, config): + self.particle = particle + self.config = config + + def setup_batches(self): + + my_batches = lambda files, files_per_batch: [files[i: i + files_per_batch] for i in range(0, len(files), files_per_batch)] + + read_dir = self.config["job"]["read_dir"] + files = particle_var(self.particle, "files") + files_per_batch = particle_var(self.particle, "files_per_batch") + + if not read_dir: + with open(files, "r") as file: + paths = file.read().splitlines() + else: + part_submit_dir = particle_var(self.particle, "submit_dir") + "ntuples/" + paths = [ + "{}{}".format(part_submit_dir, file) for file in os.listdir(part_submit_dir) if file.startswith("ntuple") + ] + + batches = my_batches(paths, files_per_batch) + + return batches + +class CondJobBase(JobBatches): + def __init__(self, particle, config): + super().__init__(particle, config) + self.particle_dir = particle_var(self.particle, "submit_dir") + self.script = config["job"]["script"] + self.args = config["job"]["arguments"] + self.queue = config["job"]["queue"] + self.proxy = config["job"]["proxy"] + self.local = config["job"]["local"] + self.user = config["job"]["user"] + + + def write_batch_files(self): + batch_dir = 
"{}batches/".format(self.particle_dir) + if not os.path.exists(batch_dir): + os.makedirs(batch_dir) + batch_script_dir = "{}{}/".format(batch_dir, os.path.splitext(os.path.basename(self.script))[0]) + if not os.path.exists(batch_script_dir): + os.makedirs(batch_script_dir) + + batches = self.setup_batches() + global current_batch_versions + current_batch_versions = [] + for i, batch in enumerate(batches): + out_name = "{}batch_{}.txt".format(batch_script_dir, i) + written_version = common.grab_most_recent(out_name, return_all=True) + batch_lines = ["{}\n".format(b) for b in batch] + current_version = common.conditional_write(written_version, out_name, batch_lines) + current_batch_versions.append(current_version) + + + def prepare_batch_submission(self): + sub_dir = "{}subs/".format(self.particle_dir) + + if not os.path.exists(sub_dir): + os.makedirs(sub_dir) + script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") + + submit_file_name_template = "{}{}_submit.sh".format(sub_dir, script_basename) + submit_file_versions = common.grab_most_recent(submit_file_name_template, return_all=True) + + current_version = [] + current_version.append("#!/usr/bin/env bash\n") + current_version.append("workdir={}/bye_splits/production/submit_scripts\n".format(parent_dir)) + current_version.append("cd $workdir\n") + current_version.append("export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch\n") + current_version.append("export SITECONFIG_PATH=$VO_CMS_SW_DIR/SITECONF/T2_FR_GRIF_LLR/GRIF-LLR/\n") + current_version.append("source $VO_CMS_SW_DIR/cmsset_default.sh\n") + if len(self.args) > 0: + args = ["bash {}".format(self.script)] + for i in range(len(self.args)): + args.append(f"${i+1}") + args = " ".join(args) + current_version.append(args) + else: + current_version.append("bash {}".format(self.script)) + + # Write the file only if an identical file doesn't already exist + global sub_file + sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) + + def prepare_multi_job_condor(self): + log_dir = "{}logs/".format(self.particle_dir) + + batch_files = current_batch_versions + + arg_dict = {} + for arg in self.args: + if arg=="filename": + arg_dict[arg] = batch_files + elif arg=="particles": + arg_dict[arg] = self.particle + elif arg=="pileup": + arg_dict[arg] = "PU0" if "PU0" in batch_files[0] else "PU200" + else: + print(f"{arg} is not currently supported.") + quit() + + script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") + + job_file_name_template = "{}jobs/{}.sub".format(self.particle_dir, script_basename) + + job_file_versions = common.grab_most_recent(job_file_name_template, return_all=True) + + current_version = [] + current_version.append("executable = {}\n".format(sub_file)) + current_version.append("Universe = vanilla\n") + if len(self.args) > 0: + current_version.append("Arguments =") + for arg in self.args[:-1]: + current_version.append(" $({}) ".format(arg)) + current_version.append("$({})\n".format(self.args[-1])) + current_version.append("output = {}{}_C$(Cluster)P$(Process).out\n".format(log_dir, script_basename)) + current_version.append("error = {}{}_C$(Cluster)P$(Process).err\n".format(log_dir, script_basename)) + current_version.append("log = {}{}_C$(Cluster)P$(Process).log\n".format(log_dir, script_basename)) + current_version.append("getenv = true\n") + current_version.append("T3Queue = {}\n".format(self.queue)) + current_version.append("WNTag = el7\n") + 
current_version.append('+SingularityCmd = ""\n') + current_version.append("include: /opt/exp_soft/cms/t3/t3queue |\n") + if len(arg_dict.keys()) > 0: + arg_keys = [key for key in arg_dict.keys()] + arg_keys = ", ".join(arg_keys) + arg_keys = "queue " + arg_keys + " from (\n" + current_version.append(arg_keys) + for file in arg_dict["filename"]: + sub_args = list(arg_dict.keys())[1:] + arg_vals = [file]+[arg_dict[key] for key in sub_args] + arg_vals = ", ".join(arg_vals) + "\n" + current_version.append(arg_vals) + current_version.append(")") + + # Write the file only if an identical file doesn't already exist + global submission_file # Save to launch later + submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) + +class CondJob(CondJobBase): + def __init__(self, particle, config): + super().__init__(particle, config) + + + def prepare_jobs(self): + self.write_batch_files() + + configs = lambda dir: dir + "configs" + jobs = lambda dir: dir + "jobs" + logs = lambda dir: dir + "logs" + + config_dir = configs(self.particle_dir) + job_dir = jobs(self.particle_dir) + log_dir = logs(self.particle_dir) + + if not os.path.exists(config_dir): + os.makedirs(config_dir) + if not os.path.exists(job_dir): + os.makedirs(job_dir) + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + self.prepare_batch_submission() + self.prepare_multi_job_condor() + + + def launch_jobs(self): + + if self.local == True: + machine = "local" + else: + machine = "llrt3.in2p3.fr" + + sub_comm = ["condor_submit"] + + if not self.local: + print( + "\nSending {} jobs on {}".format(self.particle, self.queue + "@{}".format(machine)) + ) + print("===============") + print("\n") + + sub_args = [] + + sub_args.append(submission_file) + + if self.local: + comm = sub_args + else: + comm = sub_comm + sub_args + + print(str(datetime.now()), " ".join(comm)) + status = subprocess.run(comm) + +if __name__ == "__main__": + with open(params.CfgPath, "r") as afile: + config = yaml.safe_load(afile) + + job = CondJob("pions", config) + job.prepare_jobs() + job.launch_jobs() + + '''for particle in ("photons", "electrons", "pions"): + job = CondJob(particle, config) + job.prepare_jobs() + job.launch_jobs()''' \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/condor/cluster_opt.py b/bye_splits/scripts/cluster_size/condor/cluster_opt.py new file mode 100644 index 00000000..0c36daad --- /dev/null +++ b/bye_splits/scripts/cluster_size/condor/cluster_opt.py @@ -0,0 +1,138 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys + +parent_dir = os.path.abspath(__file__ + 4 * "/..") +sys.path.insert(0, parent_dir) + +import tasks +from utils import params, common, parsing, cl_helpers + +from data_handle.data_process import get_data_reco_chain_start + +import argparse +import random + +random.seed(10) +import numpy as np +import pandas as pd +from scipy.optimize import minimize, Bounds + +import yaml + +pt_norm_loss = lambda mean: np.sqrt(1-mean**2)/2 + +def get_gen(pars, cfg): + particles = pars["particles"] + eta = pars["eta"] + + reprocess = cfg["clusterStudies"]["reprocess"] + nevents = cfg["clusterStudies"]["nevents"] + tag = cfg["clusterStudies"]["parquetTag"] + + df_gen, _ , _= get_data_reco_chain_start( + particles=particles, nevents=nevents, reprocess=reprocess, tag=tag + ) + + df = df_gen[ df_gen.gen_eta > 0 ] if eta=="pos" else df_gen[ df_gen.gen_eta < 0 ] + if eta=="neg": df_gen["gen_eta"] = abs(df_gen.gen_eta) + + return df + +def cluster_coef(pars, cfg, radii): + 
particles = pars["particles"] + cluster_d = params.read_task_params("cluster") + + cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cfg["clusterStudies"]["clusterSizeBaseName"], cfg["clusterStudies"]["clusterSizeBaseName"]+"_valid" + cluster_d["CoeffA"] = radii + + for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): + name = cluster_d[key] + + cluster_d[key] = "{}_PU0_{}_posEta".format(particles, name) + + cluster_d["returnDF"] = True + _, df = tasks.cluster.cluster_default(pars, **cluster_d) + + return df + +def normalize_df(cl_df, gen_df, dRThresh=0.05): + cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) + combined_df = cl_df.join( + gen_df.set_index("event"), on="event", how="inner" + ) + + if "dR" not in combined_df.keys(): + combined_df["dR"] = np.sqrt((abs(combined_df["eta"])-abs(combined_df["gen_eta"]))**2+(combined_df["phi"]-combined_df["gen_phi"])**2) + if "matches" not in combined_df.keys(): + combined_df["matches"] = combined_df["dR"] <= dRThresh + + combined_df["pt"] = combined_df["en"] / np.cosh(combined_df["eta"]) + combined_df["gen_pt"] = combined_df["gen_en"] / np.cosh(combined_df["gen_eta"]) + + combined_df["pt_norm"] = combined_df["pt"] / combined_df["gen_pt"] + combined_df["en_norm"] = combined_df["en"] / combined_df["gen_en"] + + return combined_df + +def filter_df(df): + df = df[ df.matches == True ] + df = df.groupby("event").apply(lambda x: x.loc[x.pt.idxmax()]) + + return df + +class clusterRad: + def __init__(self, pars, cfg): + self.pars = pars + self.cfg = cfg + self.df_gen = get_gen(pars, cfg) + + def cluster_check(self, radii): + df_cluster = cluster_coef(self.pars, self.cfg, radii) + df = normalize_df(df_cluster, self.df_gen) + return filter_df(df) + + def cluster_loss(self, radii): + df_cluster = cluster_coef(self.pars, self.cfg, radii) + df = normalize_df(df_cluster, self.df_gen) + df_filt = filter_df(df) + pt_norm_mean = df_filt.pt_norm.mean() + + return pt_norm_loss(pt_norm_mean) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) + parser.add_argument("--eta", help="Eta region", choices=("pos", "neg"), default="pos") + parsing.add_parameters(parser) + + FLAGS = parser.parse_args() + pars = common.dot_dict(vars(FLAGS)) + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + if pars.particles == "photons": + #init_radii = np.asarray([0.01]*50) + init_radii = np.asarray([0.042, 0.042, 0.014, 0.014, 0.017, 0.17, 0.011, 0.011, 0.014, 0.014, 0.011, 0.011, 0.013, 0.013, 0.012, 0.012, 0.026, 0.026, 0.021, 0.021, 0.021, 0.021, 0.016, 0.016, 0.029, 0.029, 0.042, 0.042]) + else: + init_radii = np.asarray([0.015]*50) if pars.particles == "electrons" else np.asarray([0.02]*50) + + cluster = clusterRad(pars, cfg) + test_df = cluster.cluster_check(init_radii) + breakpoint() + + + '''lower_bounds, upper_bounds = 0.5*init_radii, 1.5*init_radii + bounds = Bounds(lower_bounds, upper_bounds) + + + min_options = {"maxiter": 2} + + res = minimize(cluster.cluster_loss, init_radii, bounds=bounds, options=min_options)''' + + + diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py new file mode 100644 index 00000000..dca4112e --- /dev/null +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -0,0 +1,65 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys + +parent_dir = 
os.path.abspath(__file__ + 4 * "/..") +sys.path.insert(0, parent_dir) + +import tasks +from utils import params, common, parsing, cl_helpers + +import argparse +import random + +random.seed(10) +import numpy as np +import pandas as pd + +import yaml + +def cluster_coef(pars, cfg): + cluster_d = params.read_task_params("cluster") + + particles = pars["particles"] + pileup = pars["pileup"] + coef = pars["coef"] + + cl_size_coef = "{}_coef_{}".format( + cfg["clusterStudies"]["clusterSizeBaseName"], + str(round(coef, 3)).replace(".", "p"), + ) + cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_coef, cl_size_coef+"_valid" + cluster_d["CoeffA"] = [coef] * 50 + #cluster_d["weights"] = cfg["weights"] + + for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): + name = cluster_d[key] + + cluster_d[key] = "{}_{}_{}_posEta".format(particles, pileup, name) + + nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--coef", help="Coefficient to use as the max cluster radius", required=True, type=float) + parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) + parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) + parsing.add_parameters(parser) + + FLAGS = parser.parse_args() + pars = common.dot_dict(vars(FLAGS)) + + radius = round(pars.coef, 3) + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + '''weight_dir = "{}/PU0/".format(params.LocalStorage) + weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) + weights = weights_by_particle[pars.particles][radius] + cfg["weights"] = weights''' + + cluster_coef(pars, cfg) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_combine.py b/bye_splits/scripts/cluster_size/run_combine.py new file mode 100644 index 00000000..731fd65e --- /dev/null +++ b/bye_splits/scripts/cluster_size/run_combine.py @@ -0,0 +1,137 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys +import argparse + +parent_dir = os.path.abspath(__file__ + 3 * "/..") +sys.path.insert(0, parent_dir) + +from utils import params, common + +from data_handle.data_process import get_data_reco_chain_start + +import random +import re + +random.seed(10) +import numpy as np +import pandas as pd +import sys + +import yaml +from tqdm import tqdm + +def split_dfs(cl_df): + weighted_cols = [col for col in cl_df.keys() if "weighted" in col] + weighted_cols += [col for col in cl_df.keys() if "layer" in col] + original_cols = [col.replace("weighted_","") for col in weighted_cols] + weighted_cols += ["event"] + original_cols += ["event"] + + original_df = cl_df[original_cols] + weighted_df = cl_df[weighted_cols].rename(dict(zip(weighted_cols, original_cols)), axis=1) + + return original_df, weighted_df + +def normalize_df(cl_df, gen_df, dRThresh): + cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) + combined_df = cl_df.join( + gen_df.set_index("event"), on="event", how="inner" + ) + + if "dR" not in combined_df.keys(): + combined_df["dR"] = np.sqrt((abs(combined_df["eta"])-abs(combined_df["gen_eta"]))**2+(combined_df["phi"]-combined_df["gen_phi"])**2) + if "matches" not in combined_df.keys(): + combined_df["matches"] = combined_df["dR"] <= dRThresh + + combined_df["pt"] = combined_df["en"] / np.cosh(combined_df["eta"]) + combined_df["gen_pt"] = combined_df["gen_en"] / 
np.cosh(combined_df["gen_eta"]) + + combined_df["pt_norm"] = combined_df["pt"] / combined_df["gen_pt"] + combined_df["en_norm"] = combined_df["en"] / combined_df["gen_en"] + + return combined_df + +def combine_files_by_coef(in_dir, file_pattern): + files = [ + file for file in os.listdir(in_dir) if re.search(file_pattern, file) != None and "valid" not in file + ] + coef_pattern = r"coef_0p(\d+)" + out_path = common.fill_path(file_pattern, data_dir=in_dir) + with pd.HDFStore(out_path, "w") as clusterSizeOut: + print("\nCombining Files:\n") + for file in tqdm(files): + key = re.search(coef_pattern, file).group() + with pd.HDFStore(in_dir + "/" + file, "r") as clSizeCoef: + clusterSizeOut[key] = clSizeCoef["/data"] + +def split_and_norm(df_cl, df_gen, dRthresh): + original_df, weighted_df = split_dfs(df_cl) + normed_df, normed_weighted_df = normalize_df(original_df, df_gen, dRthresh), normalize_df(weighted_df, df_gen, dRthresh) + df_dict = {"original": normed_df, + "weighted": normed_weighted_df} + df_cl = pd.Series(df_dict) + return df_cl + +def combine_cluster(cfg, **pars): + """Originally designed to combine the files returned by cluster for each radii, + and to normalize each by the gen_particle information. Now accepts an optional --file + parameter to normalize this, skipping the combinination step.""" + + input_file_path = pars["file"] if "file" in pars.keys() else None + unweighted = pars["unweighted"] if "unweighted" in pars.keys() else False + + particles = cfg["particles"] + nevents = cfg["clusterStudies"]["nevents"] + + if input_file_path == None: + pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" + + basename = cfg["clusterStudies"]["combination"][pileup][particles]["basename"] + sub_dir = cfg["clusterStudies"]["combination"][pileup][particles]["sub_dir"] + + dir = "{}/{}/{}".format(params.LocalStorage, pileup, sub_dir) + + combine_files_by_coef(dir, basename) + + cl_size_out = common.fill_path(basename, data_dir=dir) + + else: + cl_size_out = input_file_path + + with pd.HDFStore(cl_size_out, mode="a") as clSizeOut: + df_gen, _, _ = get_data_reco_chain_start( + particles=particles, nevents=nevents, reprocess=False, tag = cfg["clusterStudies"]["parquetTag"] + ) + #if "negEta" in sub_dir: + if "negEta" in cl_size_out: + df_gen = df_gen[ df_gen.gen_eta < 0 ] + df_gen["gen_eta"] = abs(df_gen.gen_eta) + else: + df_gen = df_gen[ df_gen.gen_eta > 0 ] + dRthresh = cfg["selection"]["deltarThreshold"] + if input_file_path != None: + clSizeOut["data"] = split_and_norm(clSizeOut["data"], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut["data"], df_gen, dRthresh) + else: + coef_keys = clSizeOut.keys() + print("\nNormalizing Files:\n") + for coef in tqdm(coef_keys): + clSizeOut[coef] = split_and_norm(clSizeOut[coef], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut[coef], df_gen, dRthresh) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument("--file", type=str) + parser.add_argument("--unweighted", action="store_true") + + FLAGS = parser.parse_args() + pars = common.dot_dict(vars(FLAGS)) + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + for particles in ("photons", "electrons", "pions"): + cfg.update({"particles": particles}) + combine_cluster(cfg) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py new file mode 100644 index 00000000..4271faf1 --- /dev/null +++ 
b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -0,0 +1,81 @@ +# coding: utf-8 + +_all_ = [] + +import os +import sys + +parent_dir = os.path.abspath(__file__ + 3 * "/..") +sys.path.insert(0, parent_dir) + +import tasks +from utils import params, common, parsing + +from data_handle.data_process import get_data_reco_chain_start + +import argparse +import random + +random.seed(10) +import sys + +import yaml + +def start_chain(pars, cfg): + particles = cfg["selection"]["particles"] + pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" + reprocess = cfg["clusterStudies"]["reprocess"] + nevents = cfg["clusterStudies"]["nevents"] + tag = cfg["clusterStudies"]["parquetTag"] + + df_gen, df_cl, df_tc = get_data_reco_chain_start( + particles=particles, nevents=nevents, reprocess=reprocess, tag=tag + ) + + df_gen_pos, df_gen_neg = df_gen[ df_gen.gen_eta > 0 ], df_gen[ df_gen.gen_eta < 0] + df_cl_pos, df_cl_neg = df_cl[ df_cl.cl3d_eta > 0 ], df_cl[ df_cl.cl3d_eta < 0] + df_tc_pos, df_tc_neg = df_tc[ df_tc.tc_eta > 0 ], df_tc[ df_tc.tc_eta < 0] + + for i in range(2): + if i==1: + df_gen, df_cl, df_tc = df_gen_neg, df_cl_neg, df_tc_neg + eta_tag = "negEta" + df_gen["gen_eta"], df_cl["cl3d_eta"], df_tc["tc_eta"], df_tc["tc_z"] = abs(df_gen.gen_eta), abs(df_cl.cl3d_eta), abs(df_tc.tc_eta), abs(df_tc.tc_z) + else: + df_gen, df_cl, df_tc = df_gen_pos, df_cl_pos, df_tc_pos + eta_tag = "posEta" + + print(f"{particles}: {eta_tag}") + + fill_d = params.read_task_params("fill") + for key in ("FillOut", "FillOutComp", "FillOutPlot"): + name = fill_d[key] + fill_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d) + + smooth_d = params.read_task_params("smooth") + for key in ("SmoothIn", "SmoothOut"): + name = smooth_d[key] + smooth_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + tasks.smooth.smooth(pars, **smooth_d) + + seed_d = params.read_task_params("seed") + for key in ("SeedIn", "SeedOut"): + name = seed_d[key] + seed_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + tasks.seed.seed(pars, **seed_d) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parsing.add_parameters(parser) + + FLAGS = parser.parse_args() + + with open(params.CfgPath, "r") as afile: + cfg = yaml.safe_load(afile) + + start_chain(common.dot_dict(vars(FLAGS)), cfg) + + for particles in ("photons", "electrons", "pions"): + cfg["selection"]["particles"] = particles + start_chain(common.dot_dict(vars(FLAGS)), cfg) \ No newline at end of file diff --git a/config.yaml b/config.yaml index e8b02b00..61f3f4c8 100644 --- a/config.yaml +++ b/config.yaml @@ -135,6 +135,27 @@ varGeometry: reprocess: False coefs: [0.010, 0.015, 0.020, 0.025] +job: + user: iehle + proxy: ~/.t3/proxy.cert + queue: short + local: False + script: /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh + arguments: [filename, particles, pileup] + read_dir: False # {True: read arguments as paths in /ntuples, False: read arguments stored on the .txt } + photons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ + files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt + files_per_batch: 1 + electrons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ + files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt + files_per_batch: 1 + pions: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ + files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt + 
files_per_batch: 1 + clusterStudies: localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ ehleDir: /eos/user/i/iehle/ @@ -145,7 +166,6 @@ clusterStudies: nevents: 100 pileUp: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple - #coeffs: [0.0, 0.05, 50] fileBaseName: energy_out local: False From d86dd718ee6edead89fe466a0eb24cdc55eaebbc Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 9 Oct 2023 12:08:31 +0200 Subject: [PATCH 02/12] mostly variable / key name updates, addition of some needed functions --- .../production/submit_scripts/job_submit.py | 8 +- .../cluster_size/condor/run_cluster.py | 18 +++-- .../scripts/cluster_size/run_init_tasks.py | 14 ++-- bye_splits/utils/cl_helpers.py | 36 +++++++-- bye_splits/utils/common.py | 73 +++++++++++++++++++ config.yaml | 11 ++- 6 files changed, 131 insertions(+), 29 deletions(-) diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index c82e2e13..0513176f 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -216,11 +216,7 @@ def launch_jobs(self): with open(params.CfgPath, "r") as afile: config = yaml.safe_load(afile) - job = CondJob("pions", config) - job.prepare_jobs() - job.launch_jobs() - - '''for particle in ("photons", "electrons", "pions"): + for particle in ("photons", "electrons", "pions"): job = CondJob(particle, config) job.prepare_jobs() - job.launch_jobs()''' \ No newline at end of file + job.launch_jobs() \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index dca4112e..9f3828bd 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -33,12 +33,14 @@ def cluster_coef(pars, cfg): ) cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_coef, cl_size_coef+"_valid" cluster_d["CoeffA"] = [coef] * 50 - #cluster_d["weights"] = cfg["weights"] + + if "weights" in cfg: + cluster_d["weights"] = cfg["weights"] for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): name = cluster_d[key] - cluster_d[key] = "{}_{}_{}_posEta".format(particles, pileup, name) + cluster_d[key] = "{}_{}_{}_posEta_9oct".format(particles, pileup, name) nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) @@ -47,6 +49,7 @@ def cluster_coef(pars, cfg): parser.add_argument("--coef", help="Coefficient to use as the max cluster radius", required=True, type=float) parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) + parser.add_argument("--weighted", help="Apply pre-calculated layer weights", default=False) parsing.add_parameters(parser) FLAGS = parser.parse_args() @@ -57,9 +60,10 @@ def cluster_coef(pars, cfg): with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) - '''weight_dir = "{}/PU0/".format(params.LocalStorage) - weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) - weights = weights_by_particle[pars.particles][radius] - cfg["weights"] = weights''' - + if pars.weighted: + weight_dir = "{}/PU0/".format(params.LocalStorage) + weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) + weights = weights_by_particle[pars.particles][radius] + cfg["weights"] 
= weights + cluster_coef(pars, cfg) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py index 4271faf1..e7c425da 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -48,21 +48,22 @@ def start_chain(pars, cfg): print(f"{particles}: {eta_tag}") fill_d = params.read_task_params("fill") - for key in ("FillOut", "FillOutComp", "FillOutPlot"): + #for key in ("FillOut", "FillOutComp", "FillOutPlot"): + for key in ("FillOut", "FillOutGenCl", "FillOutTcAll"): name = fill_d[key] - fill_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + fill_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d) smooth_d = params.read_task_params("smooth") for key in ("SmoothIn", "SmoothOut"): name = smooth_d[key] - smooth_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + smooth_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) tasks.smooth.smooth(pars, **smooth_d) seed_d = params.read_task_params("seed") for key in ("SeedIn", "SeedOut"): name = seed_d[key] - seed_d[key] = "{}_{}_{}_{}_17juillet".format(particles, pileup, name, eta_tag) + seed_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) tasks.seed.seed(pars, **seed_d) if __name__ == "__main__": @@ -74,8 +75,9 @@ def start_chain(pars, cfg): with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) + cfg["selection"]["particles"] = "pions" start_chain(common.dot_dict(vars(FLAGS)), cfg) - for particles in ("photons", "electrons", "pions"): + '''for particles in ("photons", "electrons", "pions"): cfg["selection"]["particles"] = particles - start_chain(common.dot_dict(vars(FLAGS)), cfg) \ No newline at end of file + start_chain(common.dot_dict(vars(FLAGS)), cfg)''' \ No newline at end of file diff --git a/bye_splits/utils/cl_helpers.py b/bye_splits/utils/cl_helpers.py index 27a24eb9..0d376483 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -9,11 +9,12 @@ import numpy as np import pandas as pd -from bye_splits.utils import common, parsing, params +#from bye_splits.utils import common, parsing, params +from bye_splits.utils import common, params -parser = argparse.ArgumentParser(description="Seeding standalone step.") +'''parser = argparse.ArgumentParser(description="Seeding standalone step.") parsing.add_parameters(parser) -FLAGS = parser.parse_args() +FLAGS = parser.parse_args()''' def get_last_version(name): @@ -154,8 +155,31 @@ def get_input_files(base_path, pile_up=False): return input_files - -def get_output_files(cfg): +def read_weights(dir, cfg, version="final", mode="weights"): + weights_by_particle = {} + #for particle in ("photons", "electrons", "pions"): + for particle in ("photons", "pions"): + basename = "optimization_selectOneEffRms_maxSeed_bc_stc" if particle == "pions" else "optimization_selectOneStd_adjustMaxWeight_maxSeed" + + version_dir = "{}/".format(version) + particle_dir = "{}{}/optimization/official/{}".format(dir, particle, version_dir) + + files = [f for f in os.listdir(particle_dir) if basename in f] + weights_by_radius = {} + for file in files: + radius = float(file.replace(".hdf5","").replace(f"{basename}_","").replace("r0","0").replace("p",".")) + infile = particle_dir+file + with pd.HDFStore(infile, "r") as optWeights: + weights_by_radius[radius] = optWeights[mode] + + 
weights_by_particle[particle] = weights_by_radius + + weights_by_particle["electrons"] = weights_by_particle["photons"] + + return weights_by_particle + + +'''def get_output_files(cfg): """Accepts a configuration file containing the base directory, a file basename, local (Bool) and pileUp (Bool). Finds the full paths of the files created by cluster_size.py, and returns a dictionary corresponding to particles:[file_paths].""" @@ -194,4 +218,4 @@ def get_output_files(cfg): for key in output_files.keys(): output_files[key] = list(set(output_files[key])) - return output_files + return output_files''' diff --git a/bye_splits/utils/common.py b/bye_splits/utils/common.py index eef36c71..fbaa2653 100644 --- a/bye_splits/utils/common.py +++ b/bye_splits/utils/common.py @@ -16,6 +16,8 @@ import numpy as np import pandas as pd +import re + def binConv(vals, dist, amin): """ @@ -150,3 +152,74 @@ def seed_extra_name(cfg): s = '_hexdist' if cfg['seed_cs']['hexDist'] else '' s += '_' + cfg['seed_cs']['InputName'] return s + +# Accepts a template for a full path to a file and increments the version +def increment_version(file_path): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + i = 0 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + while os.path.exists(file_path): + i += 1 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + return file_path + + +# Grab the most recent version of the file corresponding to the template file_path (or return all matches) +def grab_most_recent(file_path, return_all=False): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + files = os.listdir(dir) + version_pattern = re.compile("{}_v(\\d+)\\{}".format(base, ext)) + matches = [version_pattern.search(file) for file in files] + matches = [match for match in matches if not match is None] + if len(matches) > 0: + matches = [int(match.group(1)) for match in matches] + most_recent = max(matches) + if not return_all: + file_path = dir + "/" + base + "_v" + str(most_recent) + ext + else: + file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] + return file_path + +def compare_file_contents(file_path, buffer_list): + """ + Compares the content in with , + which should be a list of strings that you wish to write + to a new file. + """ + with open(file_path, "r") as file: + contents = file.readlines() + return contents==buffer_list + +def write_file_version(template, version): + file_name = increment_version(template) + with open(file_name, "w") as job_file: + job_file.writelines(version) + st = os.stat(file_name) + os.chmod(file_name, st.st_mode | 0o744) + return file_name + +def conditional_write(file_versions, file_template, current_version): + """ + Loops through the files in , comparing their contents + to the current version. If an identical version is found, the function + breaks and does nothing. Otherwise, it will write the contents in + to an updated version number whose basename corresponds to + . 
+ """ + if file_versions != None: + identical_version = False + for file in file_versions: + if not compare_file_contents(file, current_version): + continue + else: + identical_version = True + file_path = file + break + if not identical_version: + file_path = write_file_version(file_template, current_version) + + else: + file_path = write_file_version(file_template, current_version) + return file_path diff --git a/config.yaml b/config.yaml index 61f3f4c8..ea69d2b6 100644 --- a/config.yaml +++ b/config.yaml @@ -72,7 +72,7 @@ varEvents: geta: 'good_genpart_exeta' gphi: 'good_genpart_exphi' gen: 'good_genpart_energy' - gpt: 'good_genpart_pt' +# gpt: 'good_genpart_pt' tc: event: 'event' wu: 'good_tc_waferu' @@ -161,10 +161,13 @@ clusterStudies: ehleDir: /eos/user/i/iehle/ dataFolder: data/ reinit: True #False - clusterSizeBaseName: cluster_size + reprocess: False # Reproduce .parquet file + parquetTag: jobTest + clusterSizeBaseName: cluster_size_jobTest_yesWeight coeffs: [0.0, 0.5, 50] - nevents: 100 - pileUp: False + #nevents: 100 + nevents: -1 + pileup: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple fileBaseName: energy_out local: False From fe04b0d75831ea202a90739e87618c3ac3cd65d4 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 9 Oct 2023 12:13:52 +0200 Subject: [PATCH 03/12] remove depricated file --- .../cluster_size/condor/cluster_opt.py | 138 ------------------ 1 file changed, 138 deletions(-) delete mode 100644 bye_splits/scripts/cluster_size/condor/cluster_opt.py diff --git a/bye_splits/scripts/cluster_size/condor/cluster_opt.py b/bye_splits/scripts/cluster_size/condor/cluster_opt.py deleted file mode 100644 index 0c36daad..00000000 --- a/bye_splits/scripts/cluster_size/condor/cluster_opt.py +++ /dev/null @@ -1,138 +0,0 @@ -# coding: utf-8 - -_all_ = [] - -import os -import sys - -parent_dir = os.path.abspath(__file__ + 4 * "/..") -sys.path.insert(0, parent_dir) - -import tasks -from utils import params, common, parsing, cl_helpers - -from data_handle.data_process import get_data_reco_chain_start - -import argparse -import random - -random.seed(10) -import numpy as np -import pandas as pd -from scipy.optimize import minimize, Bounds - -import yaml - -pt_norm_loss = lambda mean: np.sqrt(1-mean**2)/2 - -def get_gen(pars, cfg): - particles = pars["particles"] - eta = pars["eta"] - - reprocess = cfg["clusterStudies"]["reprocess"] - nevents = cfg["clusterStudies"]["nevents"] - tag = cfg["clusterStudies"]["parquetTag"] - - df_gen, _ , _= get_data_reco_chain_start( - particles=particles, nevents=nevents, reprocess=reprocess, tag=tag - ) - - df = df_gen[ df_gen.gen_eta > 0 ] if eta=="pos" else df_gen[ df_gen.gen_eta < 0 ] - if eta=="neg": df_gen["gen_eta"] = abs(df_gen.gen_eta) - - return df - -def cluster_coef(pars, cfg, radii): - particles = pars["particles"] - cluster_d = params.read_task_params("cluster") - - cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cfg["clusterStudies"]["clusterSizeBaseName"], cfg["clusterStudies"]["clusterSizeBaseName"]+"_valid" - cluster_d["CoeffA"] = radii - - for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): - name = cluster_d[key] - - cluster_d[key] = "{}_PU0_{}_posEta".format(particles, name) - - cluster_d["returnDF"] = True - _, df = tasks.cluster.cluster_default(pars, **cluster_d) - - return df - -def normalize_df(cl_df, gen_df, dRThresh=0.05): - cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) - combined_df = cl_df.join( - 
gen_df.set_index("event"), on="event", how="inner" - ) - - if "dR" not in combined_df.keys(): - combined_df["dR"] = np.sqrt((abs(combined_df["eta"])-abs(combined_df["gen_eta"]))**2+(combined_df["phi"]-combined_df["gen_phi"])**2) - if "matches" not in combined_df.keys(): - combined_df["matches"] = combined_df["dR"] <= dRThresh - - combined_df["pt"] = combined_df["en"] / np.cosh(combined_df["eta"]) - combined_df["gen_pt"] = combined_df["gen_en"] / np.cosh(combined_df["gen_eta"]) - - combined_df["pt_norm"] = combined_df["pt"] / combined_df["gen_pt"] - combined_df["en_norm"] = combined_df["en"] / combined_df["gen_en"] - - return combined_df - -def filter_df(df): - df = df[ df.matches == True ] - df = df.groupby("event").apply(lambda x: x.loc[x.pt.idxmax()]) - - return df - -class clusterRad: - def __init__(self, pars, cfg): - self.pars = pars - self.cfg = cfg - self.df_gen = get_gen(pars, cfg) - - def cluster_check(self, radii): - df_cluster = cluster_coef(self.pars, self.cfg, radii) - df = normalize_df(df_cluster, self.df_gen) - return filter_df(df) - - def cluster_loss(self, radii): - df_cluster = cluster_coef(self.pars, self.cfg, radii) - df = normalize_df(df_cluster, self.df_gen) - df_filt = filter_df(df) - pt_norm_mean = df_filt.pt_norm.mean() - - return pt_norm_loss(pt_norm_mean) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="") - parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) - parser.add_argument("--eta", help="Eta region", choices=("pos", "neg"), default="pos") - parsing.add_parameters(parser) - - FLAGS = parser.parse_args() - pars = common.dot_dict(vars(FLAGS)) - - with open(params.CfgPath, "r") as afile: - cfg = yaml.safe_load(afile) - - if pars.particles == "photons": - #init_radii = np.asarray([0.01]*50) - init_radii = np.asarray([0.042, 0.042, 0.014, 0.014, 0.017, 0.17, 0.011, 0.011, 0.014, 0.014, 0.011, 0.011, 0.013, 0.013, 0.012, 0.012, 0.026, 0.026, 0.021, 0.021, 0.021, 0.021, 0.016, 0.016, 0.029, 0.029, 0.042, 0.042]) - else: - init_radii = np.asarray([0.015]*50) if pars.particles == "electrons" else np.asarray([0.02]*50) - - cluster = clusterRad(pars, cfg) - test_df = cluster.cluster_check(init_radii) - breakpoint() - - - '''lower_bounds, upper_bounds = 0.5*init_radii, 1.5*init_radii - bounds = Bounds(lower_bounds, upper_bounds) - - - min_options = {"maxiter": 2} - - res = minimize(cluster.cluster_loss, init_radii, bounds=bounds, options=min_options)''' - - - From cc683e996d57b17dc823553a05e5a49dcffe3a98 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 9 Oct 2023 14:09:57 +0200 Subject: [PATCH 04/12] updated README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 562f1778..f3acfffd 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ 2. [Data production](#dataprod) 1. [Skimming](#skim) 2. [Data sources](#sources) + 3. [Job Submission](#job-submission) 3. [Reconstruction Chain](#org0bc224d) 1. [Cluster Size Studies](#orgc33e2a6) 4. [Event Visualization](#org44a4071) @@ -99,6 +100,11 @@ This framework relies on photon-, electron- and pion-gun samples produced via CR The `PU0` files above were merged and are stored under `/data_CMS/cms/alves/L1HGCAL/`, accessible to LLR users and under `/eos/user/b/bfontana/FPGAs/new_algos/`, accessible to all lxplus and LLR users. The latter is used since it is well interfaced with CERN services. The `PU200` files were merged and stored under `/eos/user/i/iehle/data/PU200//`. 
+ +## Job Submission + +Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor and a list of `arguments` that the script accepts. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, `files` which should be a `.txt` file containing the values you would like to iterate over, and `files_per_batch` which can be any number between 1 and the total number of values you would like to run. + # Reconstruction Chain From 85308fa708494aac0cb8414fb3a33c4a6633c435 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Wed, 11 Oct 2023 13:53:52 +0200 Subject: [PATCH 05/12] better argument handling and variable naming --- .../submit_scripts/condor_cluster_size.sh | 34 ++++- .../production/submit_scripts/job_submit.py | 125 ++++++------------ .../cluster_size/condor/run_cluster.py | 20 +-- config.yaml | 15 ++- 4 files changed, 91 insertions(+), 103 deletions(-) diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh index 73b9146d..8f0e26a7 100644 --- a/bye_splits/production/submit_scripts/condor_cluster_size.sh +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -2,10 +2,30 @@ cd /home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ -# Coefficients (radii) stored in .txt file, run cluster step on each radius -coef_file=$1 -particles=$2 -pileup=$3 -while read -r line; do - python run_cluster.py --coef "$line" --particles "$particles" --pileup "$pileup" -done <$coef_file \ No newline at end of file +radius=() +particles="" +pileup="" + +while [[ "$#" -gt 0 ]]; do + case "$1" in + --radius) + IFS=";" read -ra radius <<< "${2:1:-1}" + shift 2 + ;; + --particles) + particles="$2" + shift 2 + ;; + --pileup) + pileup="$2" + shift 2 + ;; + *) + echo "Unrecognized argument $1" + exit 1;; + esac +done + +for rad in ${radius[@]}; do + python run_cluster.py --radius "$rad" --particles "$particles" --pileup "$pileup" +done \ No newline at end of file diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index 0513176f..fb7ed6b7 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -19,58 +19,29 @@ class JobBatches: def __init__(self, particle, config): self.particle = particle self.config = config + self.iterOver = config["job"]["iterOver"] def setup_batches(self): + total = self.config["job"]["arguments"][self.iterOver] - my_batches = lambda files, files_per_batch: [files[i: i + files_per_batch] for i in range(0, len(files), files_per_batch)] + vals_per_batch = particle_var(self.particle, "files_per_batch") - read_dir = self.config["job"]["read_dir"] - files = particle_var(self.particle, "files") - files_per_batch = particle_var(self.particle, "files_per_batch") + batches = [total[i: i + vals_per_batch] for i in range(0, len(total), vals_per_batch)] - if not read_dir: - with open(files, "r") as file: - paths = file.read().splitlines() - else: - part_submit_dir = particle_var(self.particle, "submit_dir") + "ntuples/" - paths = [ - "{}{}".format(part_submit_dir, file) for file in 
os.listdir(part_submit_dir) if file.startswith("ntuple") - ] - - batches = my_batches(paths, files_per_batch) - return batches -class CondJobBase(JobBatches): +class CondJobBase: def __init__(self, particle, config): - super().__init__(particle, config) - self.particle_dir = particle_var(self.particle, "submit_dir") + self.particle = particle self.script = config["job"]["script"] + self.iterOver = config["job"]["iterOver"] self.args = config["job"]["arguments"] self.queue = config["job"]["queue"] self.proxy = config["job"]["proxy"] self.local = config["job"]["local"] self.user = config["job"]["user"] - - - def write_batch_files(self): - batch_dir = "{}batches/".format(self.particle_dir) - if not os.path.exists(batch_dir): - os.makedirs(batch_dir) - batch_script_dir = "{}{}/".format(batch_dir, os.path.splitext(os.path.basename(self.script))[0]) - if not os.path.exists(batch_script_dir): - os.makedirs(batch_script_dir) - - batches = self.setup_batches() - global current_batch_versions - current_batch_versions = [] - for i, batch in enumerate(batches): - out_name = "{}batch_{}.txt".format(batch_script_dir, i) - written_version = common.grab_most_recent(out_name, return_all=True) - batch_lines = ["{}\n".format(b) for b in batch] - current_version = common.conditional_write(written_version, out_name, batch_lines) - current_batch_versions.append(current_version) - + self.particle_dir = particle_var(self.particle, "submit_dir") + self.batches = JobBatches(particle, config).setup_batches() def prepare_batch_submission(self): sub_dir = "{}subs/".format(self.particle_dir) @@ -89,35 +60,21 @@ def prepare_batch_submission(self): current_version.append("export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch\n") current_version.append("export SITECONFIG_PATH=$VO_CMS_SW_DIR/SITECONF/T2_FR_GRIF_LLR/GRIF-LLR/\n") current_version.append("source $VO_CMS_SW_DIR/cmsset_default.sh\n") - if len(self.args) > 0: + + if len(self.args.keys()) > 0: args = ["bash {}".format(self.script)] - for i in range(len(self.args)): - args.append(f"${i+1}") + for i, key in enumerate(self.args.keys()): + args.append("--{} ${}".format(key, i+1)) args = " ".join(args) current_version.append(args) else: current_version.append("bash {}".format(self.script)) # Write the file only if an identical file doesn't already exist - global sub_file - sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) + self.sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) def prepare_multi_job_condor(self): log_dir = "{}logs/".format(self.particle_dir) - - batch_files = current_batch_versions - - arg_dict = {} - for arg in self.args: - if arg=="filename": - arg_dict[arg] = batch_files - elif arg=="particles": - arg_dict[arg] = self.particle - elif arg=="pileup": - arg_dict[arg] = "PU0" if "PU0" in batch_files[0] else "PU200" - else: - print(f"{arg} is not currently supported.") - quit() script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") @@ -126,13 +83,15 @@ def prepare_multi_job_condor(self): job_file_versions = common.grab_most_recent(job_file_name_template, return_all=True) current_version = [] - current_version.append("executable = {}\n".format(sub_file)) + current_version.append("executable = {}\n".format(self.sub_file)) current_version.append("Universe = vanilla\n") if len(self.args) > 0: current_version.append("Arguments =") - for arg in self.args[:-1]: + + for arg in self.args.keys(): current_version.append(" $({}) ".format(arg)) - 
current_version.append("$({})\n".format(self.args[-1])) + current_version.append("\n") + current_version.append("output = {}{}_C$(Cluster)P$(Process).out\n".format(log_dir, script_basename)) current_version.append("error = {}{}_C$(Cluster)P$(Process).err\n".format(log_dir, script_basename)) current_version.append("log = {}{}_C$(Cluster)P$(Process).log\n".format(log_dir, script_basename)) @@ -141,37 +100,37 @@ def prepare_multi_job_condor(self): current_version.append("WNTag = el7\n") current_version.append('+SingularityCmd = ""\n') current_version.append("include: /opt/exp_soft/cms/t3/t3queue |\n") - if len(arg_dict.keys()) > 0: - arg_keys = [key for key in arg_dict.keys()] - arg_keys = ", ".join(arg_keys) + + if len(self.args.keys()) > 0: + arg_keys = ", ".join(self.args.keys()) arg_keys = "queue " + arg_keys + " from (\n" current_version.append(arg_keys) - for file in arg_dict["filename"]: - sub_args = list(arg_dict.keys())[1:] - arg_vals = [file]+[arg_dict[key] for key in sub_args] - arg_vals = ", ".join(arg_vals) + "\n" - current_version.append(arg_vals) + for batch in self.batches: + sub_args = list(self.args.keys())[1:] + arg_vals = [self.args[key] for key in sub_args] + all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals + all_vals = ", ".join(all_vals) + "\n" + current_version.append(all_vals) + current_version.append(")") # Write the file only if an identical file doesn't already exist - global submission_file # Save to launch later - submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) + self.submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) # Save to launch later -class CondJob(CondJobBase): +class CondJob: def __init__(self, particle, config): - super().__init__(particle, config) + self.base = CondJobBase(particle=particle, config=config) def prepare_jobs(self): - self.write_batch_files() configs = lambda dir: dir + "configs" jobs = lambda dir: dir + "jobs" logs = lambda dir: dir + "logs" - config_dir = configs(self.particle_dir) - job_dir = jobs(self.particle_dir) - log_dir = logs(self.particle_dir) + config_dir = configs(self.base.particle_dir) + job_dir = jobs(self.base.particle_dir) + log_dir = logs(self.base.particle_dir) if not os.path.exists(config_dir): os.makedirs(config_dir) @@ -180,31 +139,31 @@ def prepare_jobs(self): if not os.path.exists(log_dir): os.makedirs(log_dir) - self.prepare_batch_submission() - self.prepare_multi_job_condor() + self.base.prepare_batch_submission() + self.base.prepare_multi_job_condor() def launch_jobs(self): - if self.local == True: + if self.base.local == True: machine = "local" else: machine = "llrt3.in2p3.fr" sub_comm = ["condor_submit"] - if not self.local: + if not self.base.local: print( - "\nSending {} jobs on {}".format(self.particle, self.queue + "@{}".format(machine)) + "\nSending {} jobs on {}".format(self.base.particle, self.base.queue + "@{}".format(machine)) ) print("===============") print("\n") sub_args = [] - sub_args.append(submission_file) + sub_args.append(self.base.submission_file) - if self.local: + if self.base.local: comm = sub_args else: comm = sub_comm + sub_args diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index 9f3828bd..52d9df0e 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -20,19 +20,19 @@ import yaml -def cluster_coef(pars, cfg): +def 
cluster_radius(pars, cfg): cluster_d = params.read_task_params("cluster") particles = pars["particles"] pileup = pars["pileup"] - coef = pars["coef"] + radius = pars["radius"] - cl_size_coef = "{}_coef_{}".format( + cl_size_radius = "{}_radius_{}".format( cfg["clusterStudies"]["clusterSizeBaseName"], - str(round(coef, 3)).replace(".", "p"), + str(round(radius, 3)).replace(".", "p"), ) - cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_coef, cl_size_coef+"_valid" - cluster_d["CoeffA"] = [coef] * 50 + cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_radius, cl_size_radius+"_valid" + cluster_d["CoeffA"] = [radius] * 50 if "weights" in cfg: cluster_d["weights"] = cfg["weights"] @@ -46,7 +46,7 @@ def cluster_coef(pars, cfg): if __name__ == "__main__": parser = argparse.ArgumentParser(description="") - parser.add_argument("--coef", help="Coefficient to use as the max cluster radius", required=True, type=float) + parser.add_argument("--radius", help="Coefficient to use as the max cluster radius", required=True, type=float) parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) parser.add_argument("--weighted", help="Apply pre-calculated layer weights", default=False) @@ -55,7 +55,7 @@ def cluster_coef(pars, cfg): FLAGS = parser.parse_args() pars = common.dot_dict(vars(FLAGS)) - radius = round(pars.coef, 3) + radius_str = round(pars.radius, 3) with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) @@ -63,7 +63,7 @@ def cluster_coef(pars, cfg): if pars.weighted: weight_dir = "{}/PU0/".format(params.LocalStorage) weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) - weights = weights_by_particle[pars.particles][radius] + weights = weights_by_particle[pars.particles][radius_str] cfg["weights"] = weights - cluster_coef(pars, cfg) \ No newline at end of file + cluster_radius(pars, cfg) \ No newline at end of file diff --git a/config.yaml b/config.yaml index ea69d2b6..81b8e0fe 100644 --- a/config.yaml +++ b/config.yaml @@ -141,12 +141,21 @@ job: queue: short local: False script: /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh - arguments: [filename, particles, pileup] + iterOver: radius + arguments: + radius: [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, + 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, + 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, + 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, + 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, + 0.046, 0.047, 0.048, 0.049, 0.05 ] + particles: pions + pileup: PU0 read_dir: False # {True: read arguments as paths in /ntuples, False: read arguments stored on the .txt } photons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 1 + files_per_batch: 10 electrons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt @@ -154,7 +163,7 @@ job: pions: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 1 + files_per_batch: 10 clusterStudies: localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ From 0073ffe5c14093e5c05a39c0f46bc1cbc5cfa18d Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Wed, 11 Oct 2023 
17:56:41 +0200 Subject: [PATCH 06/12] general clean up and documentation --- README.md | 2 +- bye_splits/production/produce.cc | 94 ----------- .../production/submit_scripts/job_submit.py | 149 +++++++++++------- .../cluster_size/condor/run_cluster.py | 20 ++- .../scripts/cluster_size/run_combine.py | 38 +++-- .../scripts/cluster_size/run_init_tasks.py | 49 +++--- bye_splits/utils/cl_helpers.py | 57 +------ bye_splits/utils/common.py | 73 +-------- bye_splits/utils/job_helpers.py | 81 ++++++++++ config.yaml | 17 +- 10 files changed, 247 insertions(+), 333 deletions(-) delete mode 100644 bye_splits/production/produce.cc create mode 100644 bye_splits/utils/job_helpers.py diff --git a/README.md b/README.md index f3acfffd..a66bc6cd 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ The `PU0` files above were merged and are stored under `/data_CMS/cms/alves/L1HG ## Job Submission -Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor and a list of `arguments` that the script accepts. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, `files` which should be a `.txt` file containing the values you would like to iterate over, and `files_per_batch` which can be any number between 1 and the total number of values you would like to run. +Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor. The `arguments` sub-section should contain `key/value` pairs matching the expected arguments that `script` accepts. The variable that you would like to iterate over should be set in `iterOver` and its value should correspond to a `key` in the `arguments` sub-section whose value is a list containing the values the script should iterate over. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, and `args_per_batch` which can be any number between 1 and `len(arguments[])`. 
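
A minimal sketch of such a `job` block, following the structure added to `config.yaml` in this patch set (the paths and placeholder values below are illustrative only):

```yaml
job:
  user: <your_llr_username>          # placeholder
  proxy: ~/.t3/proxy.cert
  queue: short
  local: False
  script: /path/to/bye_splits/production/submit_scripts/condor_cluster_size.sh  # placeholder path
  iterOver: radius                   # must name a key of `arguments` whose value is a list
  arguments:
    radius: [0.01, 0.02, 0.03, 0.04, 0.05]
    particles: pions
    pileup: PU0
  pions:
    submit_dir: /path/to/submission/area/pions/   # placeholder path
    args_per_batch: 2
```

With the five radii and `args_per_batch: 2` shown here, the batching step would split the list into three condor jobs of at most two radii each.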
diff --git a/bye_splits/production/produce.cc b/bye_splits/production/produce.cc deleted file mode 100644 index cef61f1f..00000000 --- a/bye_splits/production/produce.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include -#include "include/skim.h" - -#include // for printf() -#include // for strtol() -#include // for errno -#include // for INT_MIN and INT_MAX -#include // for strlen - -int convert_to_int(char** argv, int idx) { - char* p; - errno = 0; // not 'int errno', because the '#include' already defined it - long arg = strtol(argv[idx], &p, 10); - if (*p != '\0' || errno != 0) { - return 1; // In main(), returning non-zero means failure - } - - if (arg < INT_MIN || arg > INT_MAX) { - return 1; - } - int arg_int = arg; - - // Everything went well, print it as a regular number plus a newline - return arg_int; -} - -void show_help(const po::options_description&, const std::string&); -po::variables_map process_program_options(int argc, char **argv); - -void validate(po::variables_map args) { - std::string particles = args["particles"].as(); - if(!(particles=="photons" || particles=="electrons" || particles=="pions")) { - throw po::validation_error(po::validation_error::invalid_option_value, "particles"); - } -} - -void show_help(const po::options_description& desc, - const std::string& topic = "") { - std::cout << desc << '\n'; - if (topic != "") { - std::cout << "You asked for help on: " << topic << '\n'; - } -} - -po::variables_map process_program_options(int argc, char **argv) -{ - po::options_description desc("Usage"); - desc.add_options() - ("help,h", - po::value()->implicit_value("") - ->notifier([&desc](const std::string &topic) {show_help(desc, topic);}), - "Show help. If given, show help on the specified topic.") - ("nevents", po::value()->default_value(-1), - "number of entries to consider, useful for debugging (-1 means all)") - ("particles", po::value()->required(), - "type of particle"); - - if (argc <= 1) { - show_help(desc); // does not return - exit( EXIT_SUCCESS ); - } - - po::variables_map args; - try { - po::store(po::parse_command_line(argc, argv, desc), args); - } - catch (po::error const& e) { - std::cerr << e.what() << '\n'; - exit( EXIT_FAILURE ); - } - po::notify(args); - validate(args); - return args; -} - -//Run with ./produce.exe photons -int main(int argc, char **argv) { - std::string dir = "/eos/user/b/bfontana/FPGAs/new_algos/"; - std::string tree_name = "FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple"; - - po::variables_map args = process_program_options(argc, argv); - if (args.count("help")) { - return 1; - } - - string particles = args["particles"].as(); - int nevents = args["nevents"].as(); - - std::string infile = particles + "_0PU_bc_stc_hadd.root"; - std::string events_str = nevents > 0 ? 
std::to_string(nevents) + "events_" : ""; - std::string outfile = "skim_" + events_str + infile; - skim(tree_name, dir + infile, dir + outfile, particles, nevents); - return 0; -} diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index fb7ed6b7..43334214 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -7,24 +7,29 @@ parent_dir = os.path.abspath(__file__ + 5 * "../") sys.path.insert(0, parent_dir) -from bye_splits.utils import params, common +from bye_splits.utils import params, common, job_helpers import subprocess import yaml -# Read particle specific variables from the YAML file -particle_var = lambda part, var: config["job"][part][var] - class JobBatches: + """Class for setting up job batches and setting configuration + variables. The function setup_batches() will take the list in + config[arguments[]] and return a list of lists containing + values in each sublist. Example for five total values + with = 2: + [0.01, 0.02, 0.03, 0.04, 0.05] --> [[0.01, 0.02], [0.03, 0.04], [0.05]]""" + def __init__(self, particle, config): self.particle = particle - self.config = config + self.config = config self.iterOver = config["job"]["iterOver"] + self.particle_var = lambda part, var: config["job"][part][var] def setup_batches(self): total = self.config["job"]["arguments"][self.iterOver] - vals_per_batch = particle_var(self.particle, "files_per_batch") + vals_per_batch = self.particle_var(self.particle, "args_per_batch") batches = [total[i: i + vals_per_batch] for i in range(0, len(total), vals_per_batch)] @@ -32,26 +37,75 @@ def setup_batches(self): class CondJobBase: def __init__(self, particle, config): - self.particle = particle - self.script = config["job"]["script"] - self.iterOver = config["job"]["iterOver"] - self.args = config["job"]["arguments"] - self.queue = config["job"]["queue"] - self.proxy = config["job"]["proxy"] - self.local = config["job"]["local"] - self.user = config["job"]["user"] - self.particle_dir = particle_var(self.particle, "submit_dir") - self.batches = JobBatches(particle, config).setup_batches() + self.particle = particle + self.script = config["job"]["script"] + self.iterOver = config["job"]["iterOver"] + self.args = config["job"]["arguments"] + self.queue = config["job"]["queue"] + self.proxy = config["job"]["proxy"] + self.local = config["job"]["local"] + self.user = config["job"]["user"] + self.batch = JobBatches(particle, config) + self.particle_dir = self.batch.particle_var(self.particle, "submit_dir") + self.batches = self.batch.setup_batches() + + def _write_arg_keys(self, current_version): + """Writes the line containing the argument + names to the buffer list.""" + + arg_keys = ["Arguments ="] + for arg in self.args.keys(): + arg_keys.append("$({})".format(arg)) + arg_keys = " ".join(arg_keys) + "\n" + + current_version.append(arg_keys) + + def _write_arg_values(self, current_version): + """Adds the argument values, where the batch lists are converted + to strings as [val_1, val_2, ...] --> "[val_1;val_2]". + The choice of a semicolon as the delimiter is arbitrary but it + cannot be a comma because this is the delimeter condor itself uses. 
+ + Example: + + queue radius, particle from ( + [0.01, 0.02], photon + ) + incorrectly assigns radius="[0.01", particle="0.02]" + + queue radius, particle from ( + [0.01;0.02], photon + ) + correctly assigns radius="[0.01, 0.02]", particle="photon" + """ + arg_keys = ", ".join(self.args.keys()) + arg_keys = "queue " + arg_keys + " from (\n" + current_version.append(arg_keys) + for batch in self.batches: + sub_args = list(self.args.keys())[1:] + arg_vals = [self.args[key] for key in sub_args] + all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals + all_vals = ", ".join(all_vals) + "\n" + current_version.append(all_vals) + + current_version.append(")") def prepare_batch_submission(self): + """Writes the .sh script that constitutes the executable + in the .sub script. The basename will be the same as the fundamental + script, followed by a version number. Stores the contents in a list that's + used as a buffer and checked against the content in previous + versions, only writing the file if an identical file doesn't + already exist. The version number will be incrimented in this case.""" + sub_dir = "{}subs/".format(self.particle_dir) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) + common.create_dir(sub_dir) + script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") submit_file_name_template = "{}{}_submit.sh".format(sub_dir, script_basename) - submit_file_versions = common.grab_most_recent(submit_file_name_template, return_all=True) + submit_file_versions = job_helpers.grab_most_recent(submit_file_name_template, return_all=True) current_version = [] current_version.append("#!/usr/bin/env bash\n") @@ -71,26 +125,27 @@ def prepare_batch_submission(self): current_version.append("bash {}".format(self.script)) # Write the file only if an identical file doesn't already exist - self.sub_file = common.conditional_write(submit_file_versions, submit_file_name_template, current_version) + self.sub_file = job_helpers.conditional_write(submit_file_versions, submit_file_name_template, current_version) def prepare_multi_job_condor(self): + """Writes the .sub script that is submitted to HT Condor. 
+ Follows the same naming convention and conditional_write() + procedure as the previous function.""" + log_dir = "{}logs/".format(self.particle_dir) script_basename = os.path.basename(self.script).replace(".sh", "").replace(".py", "") job_file_name_template = "{}jobs/{}.sub".format(self.particle_dir, script_basename) - job_file_versions = common.grab_most_recent(job_file_name_template, return_all=True) + job_file_versions = job_helpers.grab_most_recent(job_file_name_template, return_all=True) current_version = [] current_version.append("executable = {}\n".format(self.sub_file)) current_version.append("Universe = vanilla\n") - if len(self.args) > 0: - current_version.append("Arguments =") - for arg in self.args.keys(): - current_version.append(" $({}) ".format(arg)) - current_version.append("\n") + if len(self.args) > 0: + self._write_arg_keys(current_version) current_version.append("output = {}{}_C$(Cluster)P$(Process).out\n".format(log_dir, script_basename)) current_version.append("error = {}{}_C$(Cluster)P$(Process).err\n".format(log_dir, script_basename)) @@ -100,52 +155,36 @@ def prepare_multi_job_condor(self): current_version.append("WNTag = el7\n") current_version.append('+SingularityCmd = ""\n') current_version.append("include: /opt/exp_soft/cms/t3/t3queue |\n") - + if len(self.args.keys()) > 0: - arg_keys = ", ".join(self.args.keys()) - arg_keys = "queue " + arg_keys + " from (\n" - current_version.append(arg_keys) - for batch in self.batches: - sub_args = list(self.args.keys())[1:] - arg_vals = [self.args[key] for key in sub_args] - all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals - all_vals = ", ".join(all_vals) + "\n" - current_version.append(all_vals) - - current_version.append(")") + self._write_arg_values(current_version) # Write the file only if an identical file doesn't already exist - self.submission_file = common.conditional_write(job_file_versions, job_file_name_template, current_version) # Save to launch later + self.submission_file = job_helpers.conditional_write(job_file_versions, job_file_name_template, current_version) # Save to launch later class CondJob: + """Creates the job directories and files + with prepare_jobs() and runs the jobs with + launch_jobs().""" + def __init__(self, particle, config): self.base = CondJobBase(particle=particle, config=config) - def prepare_jobs(self): - configs = lambda dir: dir + "configs" - jobs = lambda dir: dir + "jobs" - logs = lambda dir: dir + "logs" - - config_dir = configs(self.base.particle_dir) - job_dir = jobs(self.base.particle_dir) - log_dir = logs(self.base.particle_dir) + config_dir = self.base.particle_dir + "configs" + job_dir = self.base.particle_dir + "jobs" + log_dir = self.base.particle_dir + "logs" - if not os.path.exists(config_dir): - os.makedirs(config_dir) - if not os.path.exists(job_dir): - os.makedirs(job_dir) - if not os.path.exists(log_dir): - os.makedirs(log_dir) + for d in (config_dir, job_dir, log_dir): + common.create_dir(d) self.base.prepare_batch_submission() self.base.prepare_multi_job_condor() - def launch_jobs(self): - if self.base.local == True: + if self.base.local: machine = "local" else: machine = "llrt3.in2p3.fr" diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index 52d9df0e..ee186f52 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -1,6 +1,6 @@ # coding: utf-8 -_all_ = [] +_all_ = ['cluster_radius'] import os import sys @@ 
-21,6 +21,11 @@ import yaml def cluster_radius(pars, cfg): + """Runs the default clustering algorithm using the + specified radius. Runs on both negative and positive + eta files, and adds layer weights to the cluster + kwargs if specified.""" + cluster_d = params.read_task_params("cluster") particles = pars["particles"] @@ -37,19 +42,20 @@ def cluster_radius(pars, cfg): if "weights" in cfg: cluster_d["weights"] = cfg["weights"] - for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): - name = cluster_d[key] + for eta_tag in ("negEta", "posEta"): + for key in ("ClusterInTC", "ClusterInSeeds", "ClusterOutPlot", "ClusterOutValidation"): + name = cluster_d[key] - cluster_d[key] = "{}_{}_{}_posEta_9oct".format(particles, pileup, name) - - nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) + cluster_d[key] = "{}_{}_{}_{}".format(particles, pileup, name, eta_tag) + + nevents_end = tasks.cluster.cluster_default(pars, **cluster_d) if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--radius", help="Coefficient to use as the max cluster radius", required=True, type=float) parser.add_argument("--particles", choices=("photons", "electrons", "pions"), required=True) parser.add_argument("--pileup", help="tag for PU200 vs PU0", choices=("PU0", "PU200"), required=True) - parser.add_argument("--weighted", help="Apply pre-calculated layer weights", default=False) + parser.add_argument("--weighted", help="Apply pre-caluclated layer weights", action="store_true") parsing.add_parameters(parser) FLAGS = parser.parse_args() diff --git a/bye_splits/scripts/cluster_size/run_combine.py b/bye_splits/scripts/cluster_size/run_combine.py index 731fd65e..ca44870c 100644 --- a/bye_splits/scripts/cluster_size/run_combine.py +++ b/bye_splits/scripts/cluster_size/run_combine.py @@ -1,6 +1,6 @@ # coding: utf-8 -_all_ = [] +_all_ = ['split_dfs', 'combine_normalize', 'combine_files_by_coef', 'split_and_norm', 'combine_cluster'] import os import sys @@ -9,14 +9,13 @@ parent_dir = os.path.abspath(__file__ + 3 * "/..") sys.path.insert(0, parent_dir) -from utils import params, common +from utils import params, common, parsing from data_handle.data_process import get_data_reco_chain_start import random import re -random.seed(10) import numpy as np import pandas as pd import sys @@ -25,6 +24,9 @@ from tqdm import tqdm def split_dfs(cl_df): + """Splits the dataframe created by cluster() into an + unweighted dataframe and a weighted dataframe.""" + weighted_cols = [col for col in cl_df.keys() if "weighted" in col] weighted_cols += [col for col in cl_df.keys() if "layer" in col] original_cols = [col.replace("weighted_","") for col in weighted_cols] @@ -36,7 +38,12 @@ def split_dfs(cl_df): return original_df, weighted_df -def normalize_df(cl_df, gen_df, dRThresh): +def combine_normalize(cl_df, gen_df, dRThresh): + """Combines a cluster dataframe with an associated + gen-level dataframe. 
Useful variables that may or may not + be present are added to the combined dataframe before + normalized energy and pt columns are added.""" + cl_df=cl_df.reset_index().set_index(["event","seed_idx"]) combined_df = cl_df.join( gen_df.set_index("event"), on="event", how="inner" @@ -56,6 +63,10 @@ def normalize_df(cl_df, gen_df, dRThresh): return combined_df def combine_files_by_coef(in_dir, file_pattern): + """Combines .hdf5 cluster files of individual + radii into one .hdf5 file containing dataframes + for all radii.""" + files = [ file for file in os.listdir(in_dir) if re.search(file_pattern, file) != None and "valid" not in file ] @@ -70,11 +81,10 @@ def combine_files_by_coef(in_dir, file_pattern): def split_and_norm(df_cl, df_gen, dRthresh): original_df, weighted_df = split_dfs(df_cl) - normed_df, normed_weighted_df = normalize_df(original_df, df_gen, dRthresh), normalize_df(weighted_df, df_gen, dRthresh) + normed_df, normed_weighted_df = combine_normalize(original_df, df_gen, dRthresh), combine_normalize(weighted_df, df_gen, dRthresh) df_dict = {"original": normed_df, "weighted": normed_weighted_df} - df_cl = pd.Series(df_dict) - return df_cl + return pd.Series(df_dict) def combine_cluster(cfg, **pars): """Originally designed to combine the files returned by cluster for each radii, @@ -85,9 +95,9 @@ def combine_cluster(cfg, **pars): unweighted = pars["unweighted"] if "unweighted" in pars.keys() else False particles = cfg["particles"] - nevents = cfg["clusterStudies"]["nevents"] + nevents = pars.nevents - if input_file_path == None: + if input_file_path is None: pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" basename = cfg["clusterStudies"]["combination"][pileup][particles]["basename"] @@ -106,25 +116,25 @@ def combine_cluster(cfg, **pars): df_gen, _, _ = get_data_reco_chain_start( particles=particles, nevents=nevents, reprocess=False, tag = cfg["clusterStudies"]["parquetTag"] ) - #if "negEta" in sub_dir: if "negEta" in cl_size_out: df_gen = df_gen[ df_gen.gen_eta < 0 ] df_gen["gen_eta"] = abs(df_gen.gen_eta) else: df_gen = df_gen[ df_gen.gen_eta > 0 ] dRthresh = cfg["selection"]["deltarThreshold"] - if input_file_path != None: - clSizeOut["data"] = split_and_norm(clSizeOut["data"], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut["data"], df_gen, dRthresh) + if input_file_path is not None: + clSizeOut["data"] = split_and_norm(clSizeOut["data"], df_gen, dRthresh) if not unweighted else combine_normalize(clSizeOut["data"], df_gen, dRthresh) else: coef_keys = clSizeOut.keys() print("\nNormalizing Files:\n") for coef in tqdm(coef_keys): - clSizeOut[coef] = split_and_norm(clSizeOut[coef], df_gen, dRthresh) if not unweighted else normalize_df(clSizeOut[coef], df_gen, dRthresh) + clSizeOut[coef] = split_and_norm(clSizeOut[coef], df_gen, dRthresh) if not unweighted else combine_normalize(clSizeOut[coef], df_gen, dRthresh) if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--file", type=str) parser.add_argument("--unweighted", action="store_true") + parsing.add_parameters(parser) FLAGS = parser.parse_args() pars = common.dot_dict(vars(FLAGS)) @@ -134,4 +144,4 @@ def combine_cluster(cfg, **pars): for particles in ("photons", "electrons", "pions"): cfg.update({"particles": particles}) - combine_cluster(cfg) \ No newline at end of file + combine_cluster(cfg, pars) \ No newline at end of file diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py index 
e7c425da..ddba90bf 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -16,54 +16,60 @@ import argparse import random -random.seed(10) import sys import yaml def start_chain(pars, cfg): + """Runs the first three steps of the TPG on + negative and positive eta samples.""" + particles = cfg["selection"]["particles"] - pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" + pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" reprocess = cfg["clusterStudies"]["reprocess"] - nevents = cfg["clusterStudies"]["nevents"] - tag = cfg["clusterStudies"]["parquetTag"] + tag = cfg["clusterStudies"]["parquetTag"] + nevents = pars.nevents df_gen, df_cl, df_tc = get_data_reco_chain_start( particles=particles, nevents=nevents, reprocess=reprocess, tag=tag ) - df_gen_pos, df_gen_neg = df_gen[ df_gen.gen_eta > 0 ], df_gen[ df_gen.gen_eta < 0] - df_cl_pos, df_cl_neg = df_cl[ df_cl.cl3d_eta > 0 ], df_cl[ df_cl.cl3d_eta < 0] - df_tc_pos, df_tc_neg = df_tc[ df_tc.tc_eta > 0 ], df_tc[ df_tc.tc_eta < 0] + df_gen_pos, df_gen_neg = df_gen[ df_gen.gen_eta > 0 ], df_gen[ df_gen.gen_eta < 0 ] + df_cl_pos, df_cl_neg = df_cl[ df_cl.cl3d_eta > 0 ], df_cl[ df_cl.cl3d_eta < 0 ] + df_tc_pos, df_tc_neg = df_tc[ df_tc.tc_eta > 0 ], df_tc[ df_tc.tc_eta < 0 ] + + eta_dict = {"negEta": {"df_gen": df_gen_neg, + "df_cl": df_cl_neg, + "df_tc": df_tc_neg}, + "posEta": {"df_gen": df_gen_pos, + "df_cl": df_cl_pos, + "df_tc": df_tc_pos} + } - for i in range(2): - if i==1: - df_gen, df_cl, df_tc = df_gen_neg, df_cl_neg, df_tc_neg - eta_tag = "negEta" + for eta_tag, dfs in eta_dict.items(): + df_gen, df_cl, df_tc = dfs.values() + + if eta_tag=="negEta": df_gen["gen_eta"], df_cl["cl3d_eta"], df_tc["tc_eta"], df_tc["tc_z"] = abs(df_gen.gen_eta), abs(df_cl.cl3d_eta), abs(df_tc.tc_eta), abs(df_tc.tc_z) - else: - df_gen, df_cl, df_tc = df_gen_pos, df_cl_pos, df_tc_pos - eta_tag = "posEta" print(f"{particles}: {eta_tag}") fill_d = params.read_task_params("fill") - #for key in ("FillOut", "FillOutComp", "FillOutPlot"): for key in ("FillOut", "FillOutGenCl", "FillOutTcAll"): name = fill_d[key] - fill_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) + fill_d[key] = "{}_{}_{}_{}_pt_test".format(particles, pileup, name, eta_tag) tasks.fill.fill(pars, df_gen, df_cl, df_tc, **fill_d) smooth_d = params.read_task_params("smooth") for key in ("SmoothIn", "SmoothOut"): name = smooth_d[key] - smooth_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) + smooth_d[key] = "{}_{}_{}_{}_pt_test".format(particles, pileup, name, eta_tag) tasks.smooth.smooth(pars, **smooth_d) seed_d = params.read_task_params("seed") for key in ("SeedIn", "SeedOut"): name = seed_d[key] - seed_d[key] = "{}_{}_{}_{}_9oct".format(particles, pileup, name, eta_tag) + seed_d[key] = "{}_{}_{}_{}_pt_test".format(particles, pileup, name, eta_tag) tasks.seed.seed(pars, **seed_d) if __name__ == "__main__": @@ -75,9 +81,6 @@ def start_chain(pars, cfg): with open(params.CfgPath, "r") as afile: cfg = yaml.safe_load(afile) - cfg["selection"]["particles"] = "pions" - start_chain(common.dot_dict(vars(FLAGS)), cfg) - - '''for particles in ("photons", "electrons", "pions"): + for particles in ("photons", "electrons", "pions"): cfg["selection"]["particles"] = particles - start_chain(common.dot_dict(vars(FLAGS)), cfg)''' \ No newline at end of file + start_chain(common.dot_dict(vars(FLAGS)), cfg) \ No newline at end of file diff --git a/bye_splits/utils/cl_helpers.py 
b/bye_splits/utils/cl_helpers.py index 0d376483..cb3c7d2c 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -9,14 +9,8 @@ import numpy as np import pandas as pd -#from bye_splits.utils import common, parsing, params from bye_splits.utils import common, params -'''parser = argparse.ArgumentParser(description="Seeding standalone step.") -parsing.add_parameters(parser) -FLAGS = parser.parse_args()''' - - def get_last_version(name): """Takes a template path, such as '/full/path/to/my_file.ext' and returns the path to the latest version corresponding to '/full/path/to/my_file_vN.ext' where N is the latest version number in the directory. @@ -25,7 +19,6 @@ def get_last_version(name): base, ext = os.path.splitext(base) dir = os.path.dirname(name) if os.path.exists: - # pattern = rf"{base}_v(\d{ext})" pattern = r"{}_v(\d{})".format(base, ext) matches = [re.match(pattern, file) for file in os.listdir(dir)] version = max( @@ -37,17 +30,12 @@ def get_last_version(name): ) return version - def update_version_name(name): """Takes the same template path as get_last_version(), and uses it to update the version number.""" base, ext = os.path.splitext(name) version = 0 if not os.path.exists(name) else get_last_version(name) return f"{base}_v{str(version+1)}{ext}" - -# def_k = 0.0 - - def closest(list, k=0.0): """Find the element of a list containing strings ['coef_{float_1}', 'coef_{float_2}', ...] which is closest to some float_i""" try: @@ -157,7 +145,6 @@ def get_input_files(base_path, pile_up=False): def read_weights(dir, cfg, version="final", mode="weights"): weights_by_particle = {} - #for particle in ("photons", "electrons", "pions"): for particle in ("photons", "pions"): basename = "optimization_selectOneEffRms_maxSeed_bc_stc" if particle == "pions" else "optimization_selectOneStd_adjustMaxWeight_maxSeed" @@ -176,46 +163,4 @@ def read_weights(dir, cfg, version="final", mode="weights"): weights_by_particle["electrons"] = weights_by_particle["photons"] - return weights_by_particle - - -'''def get_output_files(cfg): - """Accepts a configuration file containing the base directory, a file basename, local (Bool) and pileUp (Bool). 
- Finds the full paths of the files created by cluster_size.py, and returns - a dictionary corresponding to particles:[file_paths].""" - - output_files = {"photons": [], "pions": [], "electrons": []} - template = os.path.basename( - common.fill_path(cfg["clusterStudies"]["fileBaseName"], **vars(FLAGS)) - ) - template = re.split("_", template) - if cfg["clusterStudies"]["local"]: - base_path = cfg["clusterStudies"]["localDir"] - else: - base_path = ( - params.EOSStorage(FLAGS.user, "data/PU0/") - if not cfg["clusterStudies"]["pileUp"] - else params.EOSStorage(FLAGS.user, "data/PU200/") - ) - for particles in output_files.keys(): - particle_dir = ( - base_path + particles + "/" if cfg["clusterStudies"]["local"] else base_path - ) - files = [re.split("_", file) for file in os.listdir(particle_dir)] - for filename in files: - if set(template).issubset(set(filename)): - path = os.path.join(f"{particle_dir}{'_'.join(filename)}") - with pd.HDFStore(path, "r") as File: - if len(File.keys()) > 0: - if ("photon" in filename) or ("photons" in filename): - output_files["photons"].append(path) - elif ("electron" in filename) or ("electrons" in filename): - output_files["electrons"].append(path) - else: - output_files["pions"].append(path) - - # Get rid of duplicates that the dictionary filling produced - for key in output_files.keys(): - output_files[key] = list(set(output_files[key])) - - return output_files''' + return weights_by_particle \ No newline at end of file diff --git a/bye_splits/utils/common.py b/bye_splits/utils/common.py index fbaa2653..b2138620 100644 --- a/bye_splits/utils/common.py +++ b/bye_splits/utils/common.py @@ -151,75 +151,4 @@ def std_eff(values, c=0.68): def seed_extra_name(cfg): s = '_hexdist' if cfg['seed_cs']['hexDist'] else '' s += '_' + cfg['seed_cs']['InputName'] - return s - -# Accepts a template for a full path to a file and increments the version -def increment_version(file_path): - dir, file = os.path.split(file_path) - base, ext = os.path.splitext(file) - i = 0 - file_path = "{}/{}_v{}{}".format(dir, base, i, ext) - while os.path.exists(file_path): - i += 1 - file_path = "{}/{}_v{}{}".format(dir, base, i, ext) - return file_path - - -# Grab the most recent version of the file corresponding to the template file_path (or return all matches) -def grab_most_recent(file_path, return_all=False): - dir, file = os.path.split(file_path) - base, ext = os.path.splitext(file) - files = os.listdir(dir) - version_pattern = re.compile("{}_v(\\d+)\\{}".format(base, ext)) - matches = [version_pattern.search(file) for file in files] - matches = [match for match in matches if not match is None] - if len(matches) > 0: - matches = [int(match.group(1)) for match in matches] - most_recent = max(matches) - if not return_all: - file_path = dir + "/" + base + "_v" + str(most_recent) + ext - else: - file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] - return file_path - -def compare_file_contents(file_path, buffer_list): - """ - Compares the content in with , - which should be a list of strings that you wish to write - to a new file. 
- """ - with open(file_path, "r") as file: - contents = file.readlines() - return contents==buffer_list - -def write_file_version(template, version): - file_name = increment_version(template) - with open(file_name, "w") as job_file: - job_file.writelines(version) - st = os.stat(file_name) - os.chmod(file_name, st.st_mode | 0o744) - return file_name - -def conditional_write(file_versions, file_template, current_version): - """ - Loops through the files in , comparing their contents - to the current version. If an identical version is found, the function - breaks and does nothing. Otherwise, it will write the contents in - to an updated version number whose basename corresponds to - . - """ - if file_versions != None: - identical_version = False - for file in file_versions: - if not compare_file_contents(file, current_version): - continue - else: - identical_version = True - file_path = file - break - if not identical_version: - file_path = write_file_version(file_template, current_version) - - else: - file_path = write_file_version(file_template, current_version) - return file_path + return s \ No newline at end of file diff --git a/bye_splits/utils/job_helpers.py b/bye_splits/utils/job_helpers.py new file mode 100644 index 00000000..49e84165 --- /dev/null +++ b/bye_splits/utils/job_helpers.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python + +import re + +import os +import sys + +parent_dir = os.path.abspath(__file__ + 5 * "../") +sys.path.insert(0, parent_dir) + +_all_ = ['increment_version', 'grab_most_recent', 'compare_file_contents', 'write_file_version', 'conditional_write'] + +# Accepts a template for a full path to a file and increments the version +def increment_version(file_path): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + i = 0 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + while os.path.exists(file_path): + i += 1 + file_path = "{}/{}_v{}{}".format(dir, base, i, ext) + return file_path + +# Grab the most recent version of the file corresponding to the template file_path (or return all matches) +def grab_most_recent(file_path, return_all=False): + dir, file = os.path.split(file_path) + base, ext = os.path.splitext(file) + files = os.listdir(dir) + version_pattern = re.compile("{}_v(\\d+)\\{}".format(base, ext)) + matches = [version_pattern.search(file) for file in files] + matches = [match for match in matches if not match is None] + if len(matches) > 0: + matches = [int(match.group(1)) for match in matches] + most_recent = max(matches) + if not return_all: + file_path = dir + "/" + base + "_v" + str(most_recent) + ext + else: + file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] + return file_path + +def compare_file_contents(file_path, buffer_list): + """ + Compares the content in with , + which should be a list of strings that you wish to write + to a new file. + """ + with open(file_path, "r") as file: + contents = file.readlines() + return contents==buffer_list + +def write_file_version(template, version): + file_name = increment_version(template) + with open(file_name, "w") as job_file: + job_file.writelines(version) + st = os.stat(file_name) + os.chmod(file_name, st.st_mode | 0o744) + return file_name + +def conditional_write(file_versions, file_template, current_version): + """ + Loops through the files in , comparing their contents + to the current version. If an identical version is found, the function + breaks and does nothing. 
Otherwise, it will write the contents in + to an updated version number whose basename corresponds to + . + """ + if file_versions != None: + identical_version = False + for file in file_versions: + if not compare_file_contents(file, current_version): + continue + else: + identical_version = True + file_path = file + break + if not identical_version: + file_path = write_file_version(file_template, current_version) + + else: + file_path = write_file_version(file_template, current_version) + return file_path \ No newline at end of file diff --git a/config.yaml b/config.yaml index 81b8e0fe..d61020a4 100644 --- a/config.yaml +++ b/config.yaml @@ -72,7 +72,7 @@ varEvents: geta: 'good_genpart_exeta' gphi: 'good_genpart_exphi' gen: 'good_genpart_energy' -# gpt: 'good_genpart_pt' + gpt: 'good_genpart_pt' tc: event: 'event' wu: 'good_tc_waferu' @@ -151,30 +151,25 @@ job: 0.046, 0.047, 0.048, 0.049, 0.05 ] particles: pions pileup: PU0 - read_dir: False # {True: read arguments as paths in /ntuples, False: read arguments stored on the .txt } photons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ - files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 10 + args_per_batch: 10 electrons: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ - files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 1 + args_per_batch: 10 pions: submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ - files: /data_CMS/cms/ehle/L1HGCAL/PU200/coefs.txt - files_per_batch: 10 + args_per_batch: 10 clusterStudies: localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ ehleDir: /eos/user/i/iehle/ dataFolder: data/ reinit: True #False - reprocess: False # Reproduce .parquet file - parquetTag: jobTest + reprocess: True # Reproduce .parquet file + parquetTag: pt_test clusterSizeBaseName: cluster_size_jobTest_yesWeight coeffs: [0.0, 0.5, 50] - #nevents: 100 nevents: -1 pileup: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple From 8161d31e69a3942b89df3758216c73f916c18baa Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Thu, 12 Oct 2023 17:04:34 +0200 Subject: [PATCH 07/12] implementation of final comments --- .../submit_scripts/condor_cluster_size.sh | 2 +- .../production/submit_scripts/job_submit.py | 18 ++++++------ .../cluster_size/condor/run_cluster.py | 6 +--- .../scripts/cluster_size/run_combine.py | 15 ++++------ .../scripts/cluster_size/run_init_tasks.py | 7 ++--- bye_splits/utils/cl_helpers.py | 29 +++++++++---------- bye_splits/utils/common.py | 3 -- bye_splits/utils/job_helpers.py | 8 +++-- config.yaml | 6 ++-- 9 files changed, 40 insertions(+), 54 deletions(-) diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh index 8f0e26a7..a2830f2c 100644 --- a/bye_splits/production/submit_scripts/condor_cluster_size.sh +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -cd /home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ +cd /home/llr/cms/${USER}/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ radius=() particles="" diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index 43334214..ed184f9e 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -2,13 +2,13 @@ import os import sys -from datetime import datetime parent_dir = 
os.path.abspath(__file__ + 5 * "../") sys.path.insert(0, parent_dir) from bye_splits.utils import params, common, job_helpers +from datetime import datetime import subprocess import yaml @@ -53,10 +53,10 @@ def _write_arg_keys(self, current_version): """Writes the line containing the argument names to the buffer list.""" - arg_keys = ["Arguments ="] + arg_keys = "Arguments =" for arg in self.args.keys(): - arg_keys.append("$({})".format(arg)) - arg_keys = " ".join(arg_keys) + "\n" + arg_keys += " $({})".format(arg) + arg_keys += "\n" current_version.append(arg_keys) @@ -92,11 +92,11 @@ def _write_arg_values(self, current_version): def prepare_batch_submission(self): """Writes the .sh script that constitutes the executable - in the .sub script. The basename will be the same as the fundamental - script, followed by a version number. Stores the contents in a list that's - used as a buffer and checked against the content in previous - versions, only writing the file if an identical file doesn't - already exist. The version number will be incrimented in this case.""" + in the .sub script. The basename will be the same as the running script, i.e. + the script set in the configuration file. This is then followed by a version number. + Stores the contents in a list that's used as a buffer and checked against the content + in previous versions, only writing the file if an identical file doesn't already exist. + The version number will be incrimented in this case.""" sub_dir = "{}subs/".format(self.particle_dir) diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index ee186f52..c12289c6 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -12,12 +12,8 @@ from utils import params, common, parsing, cl_helpers import argparse -import random - -random.seed(10) import numpy as np import pandas as pd - import yaml def cluster_radius(pars, cfg): @@ -67,7 +63,7 @@ def cluster_radius(pars, cfg): cfg = yaml.safe_load(afile) if pars.weighted: - weight_dir = "{}/PU0/".format(params.LocalStorage) + weight_dir = os.path.join(params.LocalStorage, "PU0/") weights_by_particle = cl_helpers.read_weights(weight_dir, cfg) weights = weights_by_particle[pars.particles][radius_str] cfg["weights"] = weights diff --git a/bye_splits/scripts/cluster_size/run_combine.py b/bye_splits/scripts/cluster_size/run_combine.py index ca44870c..33bfba39 100644 --- a/bye_splits/scripts/cluster_size/run_combine.py +++ b/bye_splits/scripts/cluster_size/run_combine.py @@ -4,22 +4,17 @@ import os import sys -import argparse parent_dir = os.path.abspath(__file__ + 3 * "/..") sys.path.insert(0, parent_dir) from utils import params, common, parsing - from data_handle.data_process import get_data_reco_chain_start -import random +import argparse import re - import numpy as np import pandas as pd -import sys - import yaml from tqdm import tqdm @@ -68,7 +63,7 @@ def combine_files_by_coef(in_dir, file_pattern): for all radii.""" files = [ - file for file in os.listdir(in_dir) if re.search(file_pattern, file) != None and "valid" not in file + file for file in os.listdir(in_dir) if re.search(file_pattern, file) is not None and "valid" not in file ] coef_pattern = r"coef_0p(\d+)" out_path = common.fill_path(file_pattern, data_dir=in_dir) @@ -80,10 +75,10 @@ def combine_files_by_coef(in_dir, file_pattern): clusterSizeOut[key] = clSizeCoef["/data"] def split_and_norm(df_cl, df_gen, dRthresh): - original_df, 
weighted_df = split_dfs(df_cl) - normed_df, normed_weighted_df = combine_normalize(original_df, df_gen, dRthresh), combine_normalize(weighted_df, df_gen, dRthresh) + o_df, w_df = split_dfs(df_cl) + normed_df, normed_w_df = combine_normalize(o_df, df_gen, dRthresh), combine_normalize(w_df, df_gen, dRthresh) df_dict = {"original": normed_df, - "weighted": normed_weighted_df} + "weighted": normed_w_df} return pd.Series(df_dict) def combine_cluster(cfg, **pars): diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py b/bye_splits/scripts/cluster_size/run_init_tasks.py index ddba90bf..ff64c8df 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -10,14 +10,10 @@ import tasks from utils import params, common, parsing - from data_handle.data_process import get_data_reco_chain_start import argparse import random - -import sys - import yaml def start_chain(pars, cfg): @@ -25,10 +21,10 @@ def start_chain(pars, cfg): negative and positive eta samples.""" particles = cfg["selection"]["particles"] - pileup = "PU0" if not cfg["clusterStudies"]["pileup"] else "PU200" reprocess = cfg["clusterStudies"]["reprocess"] tag = cfg["clusterStudies"]["parquetTag"] nevents = pars.nevents + pileup = pars.pileup df_gen, df_cl, df_tc = get_data_reco_chain_start( particles=particles, nevents=nevents, reprocess=reprocess, tag=tag @@ -74,6 +70,7 @@ def start_chain(pars, cfg): if __name__ == "__main__": parser = argparse.ArgumentParser(description="") + parser.add_argument("--pileup", help="tag for pileup choice", default="PU0") parsing.add_parameters(parser) FLAGS = parser.parse_args() diff --git a/bye_splits/utils/cl_helpers.py b/bye_splits/utils/cl_helpers.py index cb3c7d2c..b778afd4 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -1,15 +1,15 @@ import os import sys -import re -import argparse parent_dir = os.path.abspath(__file__ + 3 * "/..") sys.path.insert(0, parent_dir) +from bye_splits.utils import common, params + import numpy as np import pandas as pd - -from bye_splits.utils import common, params +import re +import argparse def get_last_version(name): """Takes a template path, such as '/full/path/to/my_file.ext' and returns the path to the latest version @@ -19,11 +19,11 @@ def get_last_version(name): base, ext = os.path.splitext(base) dir = os.path.dirname(name) if os.path.exists: - pattern = r"{}_v(\d{})".format(base, ext) + pattern = r"{}_v(\d){}".format(base, ext) matches = [re.match(pattern, file) for file in os.listdir(dir)] version = max( [ - int(match.group(1).replace(ext, "")) + int(match.group(1)) for match in matches if not match is None ] @@ -36,19 +36,15 @@ def update_version_name(name): version = 0 if not os.path.exists(name) else get_last_version(name) return f"{base}_v{str(version+1)}{ext}" -def closest(list, k=0.0): +def closest(coef_list, k=0.0): """Find the element of a list containing strings ['coef_{float_1}', 'coef_{float_2}', ...] which is closest to some float_i""" - try: - list = np.reshape(np.asarray(list), 1) - except ValueError: - list = np.asarray(list) + coef_list = np.asarray(coef_list) if isinstance(k, str): k_num = float(re.split("coef_", k)[1].replace("p", ".")) else: k_num = k - id = (np.abs(list - k_num)).argmin() - return list[id] - + id = (np.abs(coef_list - k_num)).argmin() + return coef_list[id] def get_str(coef, df_dict): """Accepts a coefficient, either as a float or string starting with coef_, along with a dictionary of coefficient:DataFrame pairs. 
@@ -66,7 +62,6 @@ def get_str(coef, df_dict): coef_str = "/coef_{}".format(str(new_coef).replace(".", "p")) return coef_str - # Old Naming Conventions used different column names in the dataframes column_matching = { "etanew": "eta", @@ -161,6 +156,10 @@ def read_weights(dir, cfg, version="final", mode="weights"): weights_by_particle[particle] = weights_by_radius + '''Weights are calculated from pt_norm distributions, which + are distorted by brem events for electrons. As this is + a physics effect uncorrelated to the TPG response, we correct + electrons with weights derived from photon pt_norm distributions''' weights_by_particle["electrons"] = weights_by_particle["photons"] return weights_by_particle \ No newline at end of file diff --git a/bye_splits/utils/common.py b/bye_splits/utils/common.py index b2138620..d362e5df 100644 --- a/bye_splits/utils/common.py +++ b/bye_splits/utils/common.py @@ -16,9 +16,6 @@ import numpy as np import pandas as pd -import re - - def binConv(vals, dist, amin): """ Converts bin indexes back to values (central values in the bin). diff --git a/bye_splits/utils/job_helpers.py b/bye_splits/utils/job_helpers.py index 49e84165..764d38bd 100644 --- a/bye_splits/utils/job_helpers.py +++ b/bye_splits/utils/job_helpers.py @@ -10,8 +10,8 @@ _all_ = ['increment_version', 'grab_most_recent', 'compare_file_contents', 'write_file_version', 'conditional_write'] -# Accepts a template for a full path to a file and increments the version def increment_version(file_path): + """Accepts a template for a full path to a file and increments the version""" dir, file = os.path.split(file_path) base, ext = os.path.splitext(file) i = 0 @@ -21,8 +21,8 @@ def increment_version(file_path): file_path = "{}/{}_v{}{}".format(dir, base, i, ext) return file_path -# Grab the most recent version of the file corresponding to the template file_path (or return all matches) def grab_most_recent(file_path, return_all=False): + """Grab the most recent version of the file corresponding to the template file_path (or return all matches)""" dir, file = os.path.split(file_path) base, ext = os.path.splitext(file) files = os.listdir(dir) @@ -37,6 +37,8 @@ def grab_most_recent(file_path, return_all=False): else: file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] return file_path + else: + raise ValueError("There are no versions of the passed file: {}".format(file_path)) def compare_file_contents(file_path, buffer_list): """ @@ -64,7 +66,7 @@ def conditional_write(file_versions, file_template, current_version): to an updated version number whose basename corresponds to . 
""" - if file_versions != None: + if file_versions is not None: identical_version = False for file in file_versions: if not compare_file_contents(file, current_version): diff --git a/config.yaml b/config.yaml index d61020a4..d9888518 100644 --- a/config.yaml +++ b/config.yaml @@ -148,7 +148,7 @@ job: 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, - 0.046, 0.047, 0.048, 0.049, 0.05 ] + 0.046, 0.047, 0.048, 0.049, 0.05] particles: pions pileup: PU0 photons: @@ -162,7 +162,7 @@ job: args_per_batch: 10 clusterStudies: - localDir: /home/llr/cms/mchiusi/event_display/bye_splits/data/new_algos/ + localDir: /home/llr/cms/ehle/NewRepos/bye_splits/data/new_algos/ ehleDir: /eos/user/i/iehle/ dataFolder: data/ reinit: True #False @@ -174,7 +174,7 @@ clusterStudies: pileup: False tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple fileBaseName: energy_out - local: False + local: True base: NbinsRz: 42 From 482e84cd1bab79fa1fe7ce6f4a1f82aff869c3b2 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Fri, 13 Oct 2023 13:46:43 +0200 Subject: [PATCH 08/12] minor updates/corrections --- .../production/submit_scripts/condor_cluster_size.sh | 2 +- bye_splits/production/submit_scripts/job_submit.py | 3 ++- bye_splits/scripts/cluster_size/condor/run_cluster.py | 2 +- bye_splits/scripts/cluster_size/run_init_tasks.py | 2 +- bye_splits/utils/cl_helpers.py | 9 ++++----- config.yaml | 5 +++++ 6 files changed, 14 insertions(+), 9 deletions(-) diff --git a/bye_splits/production/submit_scripts/condor_cluster_size.sh b/bye_splits/production/submit_scripts/condor_cluster_size.sh index a2830f2c..ea696e46 100644 --- a/bye_splits/production/submit_scripts/condor_cluster_size.sh +++ b/bye_splits/production/submit_scripts/condor_cluster_size.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -cd /home/llr/cms/${USER}/NewRepos/bye_splits/bye_splits/scripts/cluster_size/condor/ +cd ${HOME}/bye_splits/bye_splits/scripts/cluster_size/condor/ radius=() particles="" diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index ed184f9e..9b3c65c1 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -77,7 +77,8 @@ def _write_arg_values(self, current_version): [0.01;0.02], photon ) correctly assigns radius="[0.01, 0.02]", particle="photon" - """ + """ + if "particles" in self.args.keys(): self.args["particles"] = self.particle arg_keys = ", ".join(self.args.keys()) arg_keys = "queue " + arg_keys + " from (\n" current_version.append(arg_keys) diff --git a/bye_splits/scripts/cluster_size/condor/run_cluster.py b/bye_splits/scripts/cluster_size/condor/run_cluster.py index c12289c6..e779035f 100644 --- a/bye_splits/scripts/cluster_size/condor/run_cluster.py +++ b/bye_splits/scripts/cluster_size/condor/run_cluster.py @@ -33,7 +33,7 @@ def cluster_radius(pars, cfg): str(round(radius, 3)).replace(".", "p"), ) cluster_d["ClusterOutPlot"], cluster_d["ClusterOutValidation"] = cl_size_radius, cl_size_radius+"_valid" - cluster_d["CoeffA"] = [radius] * 50 + cluster_d["CoeffA"] = [radius] * (cfg["geometry"]["nlayersCEE"]+cfg["geometry"]["nlayersCEH"]) # Radii in each of the HGCAL layers if "weights" in cfg: cluster_d["weights"] = cfg["weights"] diff --git a/bye_splits/scripts/cluster_size/run_init_tasks.py 
b/bye_splits/scripts/cluster_size/run_init_tasks.py index ff64c8df..89a55f7e 100644 --- a/bye_splits/scripts/cluster_size/run_init_tasks.py +++ b/bye_splits/scripts/cluster_size/run_init_tasks.py @@ -1,6 +1,6 @@ # coding: utf-8 -_all_ = [] +_all_ = ['start_chain'] import os import sys diff --git a/bye_splits/utils/cl_helpers.py b/bye_splits/utils/cl_helpers.py index b778afd4..465aba86 100644 --- a/bye_splits/utils/cl_helpers.py +++ b/bye_splits/utils/cl_helpers.py @@ -138,13 +138,12 @@ def get_input_files(base_path, pile_up=False): return input_files -def read_weights(dir, cfg, version="final", mode="weights"): +def read_weights(dir, cfg, version="layer", mode="weights"): weights_by_particle = {} - for particle in ("photons", "pions"): - basename = "optimization_selectOneEffRms_maxSeed_bc_stc" if particle == "pions" else "optimization_selectOneStd_adjustMaxWeight_maxSeed" + weight_path_templates = cfg["clusterStudies"]["weights"][version] + for particle, basename in weight_path_templates.items(): - version_dir = "{}/".format(version) - particle_dir = "{}{}/optimization/official/{}".format(dir, particle, version_dir) + particle_dir = os.path.join(dir, particle, cfg["clusterStudies"]["weights"]["subDir"]) files = [f for f in os.listdir(particle_dir) if basename in f] weights_by_radius = {} diff --git a/config.yaml b/config.yaml index d9888518..94cf9f8f 100644 --- a/config.yaml +++ b/config.yaml @@ -175,6 +175,11 @@ clusterStudies: tree: FloatingpointMixedbcstcrealsig4DummyHistomaxxydr015GenmatchGenclustersntuple/HGCalTriggerNtuple fileBaseName: energy_out local: True + weights: + subDir: optimization/official/final/ + layer: + photons: optimization_selectOneStd_adjustMaxWeight_maxSeed + pions: optimization_selectOneEffRms_maxSeed_bc_stc base: NbinsRz: 42 From fc9e0002d2c5befe56f12c011c6794eebe62bf5d Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Sat, 14 Oct 2023 16:10:03 +0200 Subject: [PATCH 09/12] implemented Marco's comments --- README.md | 66 ++++++++++++++++++- .../production/submit_scripts/job_submit.py | 6 +- bye_splits/utils/job_helpers.py | 7 +- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a66bc6cd..f27151de 100644 --- a/README.md +++ b/README.md @@ -103,8 +103,70 @@ The `PU0` files above were merged and are stored under `/data_CMS/cms/alves/L1HG ## Job Submission -Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor. The `arguments` sub-section should contain `key/value` pairs matching the expected arguments that `script` accepts. The variable that you would like to iterate over should be set in `iterOver` and its value should correspond to a `key` in the `arguments` sub-section whose value is a list containing the values the script should iterate over. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, and `args_per_batch` which can be any number between 1 and `len(arguments[])`. - +Job submission to HT Condor is handled through `bye_splits/production/submit_scripts/job_submit.py` using the `job` section of `config.yaml` for its configuration. 
The configuration should include usual condor variables, i.e `user`, `proxy`, `queue`, and `local`, as well as a path to the `script` you would like to run on condor. The `arguments` sub-section should contain `key/value` pairs matching the expected arguments that `script` accepts. The variable that you would like to iterate over should be set in `iterOver` and its value should correspond to a `key` in the `arguments` sub-section whose value is a list containing the values the script should iterate over. It then contains a section for each particle type which should contain a `submit_dir`, i.e. the directory in which to read and write submission related files, and `args_per_batch` which can be any number between 1 and `len(arguments[])`. An example of the `job` configuration settings is as such: + + job: + user: iehle + proxy: ~/.t3/proxy.cert + queue: short + local: False + script: /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh + iterOver: radius + arguments: + radius: [0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, + 0.01 , 0.011, 0.012, 0.013, 0.014, 0.015, 0.016, 0.017, 0.018, + 0.019, 0.02 , 0.021, 0.022, 0.023, 0.024, 0.025, 0.026, 0.027, + 0.028, 0.029, 0.03 , 0.031, 0.032, 0.033, 0.034, 0.035, 0.036, + 0.037, 0.038, 0.039, 0.04 , 0.041, 0.042, 0.043, 0.044, 0.045, + 0.046, 0.047, 0.048, 0.049, 0.05] + particles: photons + pileup: PU0 + photons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/photons/ + args_per_batch: 10 + electrons: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/electrons/ + args_per_batch: 10 + pions: + submit_dir: /data_CMS/cms/ehle/L1HGCAL/PU0/pions/ + args_per_batch: 10 + +After setting the configuration variables, the jobs are created and launched via + + python bye_splits/production/submit_scripts/job_submit.py + +and will produce both the executable `.sh` file which will have a form like + + #!/usr/bin/env bash + workdir=/grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts + cd $workdir + export VO_CMS_SW_DIR=/cvmfs/cms.cern.ch + export SITECONFIG_PATH=$VO_CMS_SW_DIR/SITECONF/T2_FR_GRIF_LLR/GRIF-LLR/ + source $VO_CMS_SW_DIR/cmsset_default.sh + bash /grid_mnt/vol_home/llr/cms/ehle/NewRepos/bye_splits/bye_splits/production/submit_scripts/condor_cluster_size.sh --radius $1 --particles $2 --pileup $3 + +and the `.sub` file submitted to HT Condor: + + executable = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/subs/condor_cluster_size_submit_v1.sh + Universe = vanilla + Arguments = $(radius) $(particles) $(pileup) + output = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/logs/condor_cluster_size_C$(Cluster)P$(Process).out + error = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/logs/condor_cluster_size_C$(Cluster)P$(Process).err + log = /data_CMS/cms/ehle/L1HGCAL/PU0/photons/logs/condor_cluster_size_C$(Cluster)P$(Process).log + getenv = true + T3Queue = short + WNTag = el7 + +SingularityCmd = "" + include: /opt/exp_soft/cms/t3/t3queue | + queue radius, particles, pileup from ( + [0.001;0.002;0.003;0.004;0.005;0.006;0.007;0.008;0.009;0.01], photons, PU0 + [0.011;0.012;0.013;0.014;0.015;0.016;0.017;0.018;0.019;0.02], photons, PU0 + [0.021;0.022;0.023;0.024;0.025;0.026;0.027;0.028;0.029;0.03], photons, PU0 + [0.031;0.032;0.033;0.034;0.035;0.036;0.037;0.038;0.039;0.04], photons, PU0 + [0.041;0.042;0.043;0.044;0.045;0.046;0.047;0.048;0.049;0.05], photons, PU0 + ) + +This describes just one example, and other uses can be easily implemented. 
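If the receiving script is written in Python rather than bash, note that each batched value arrives as a single bracketed, semicolon-delimited string (e.g. "[0.001;0.002;0.003]", following the `queue ... from (...)` block shown above). Below is a minimal sketch of how such a string could be unpacked, assuming it arrives exactly as written in the submit file; the helper name is illustrative and not part of the repository:

    def parse_radius_batch(arg):
        # "[0.001;0.002;0.003]" -> [0.001, 0.002, 0.003]
        return [float(v) for v in arg.strip("[]").split(";") if v]
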
You may, for example, run the [skimming procedure](#skimming) on HT Condor with a small bash script that accepts `--particles` and `--nevents` as arguments, and update the configuration variables accordingly. # Reconstruction Chain diff --git a/bye_splits/production/submit_scripts/job_submit.py b/bye_splits/production/submit_scripts/job_submit.py index 9b3c65c1..e2b32594 100644 --- a/bye_splits/production/submit_scripts/job_submit.py +++ b/bye_splits/production/submit_scripts/job_submit.py @@ -27,11 +27,11 @@ def __init__(self, particle, config): self.particle_var = lambda part, var: config["job"][part][var] def setup_batches(self): - total = self.config["job"]["arguments"][self.iterOver] + total_vals = self.config["job"]["arguments"][self.iterOver] vals_per_batch = self.particle_var(self.particle, "args_per_batch") - batches = [total[i: i + vals_per_batch] for i in range(0, len(total), vals_per_batch)] + batches = [total_vals[i: i + vals_per_batch] for i in range(0, len(total_vals), vals_per_batch)] return batches @@ -86,7 +86,7 @@ def _write_arg_values(self, current_version): sub_args = list(self.args.keys())[1:] arg_vals = [self.args[key] for key in sub_args] all_vals = ["{}".format(batch).replace(", ", ";")]+arg_vals - all_vals = ", ".join(all_vals) + "\n" + all_vals = ", ".join(map(str, all_vals)) + "\n" current_version.append(all_vals) current_version.append(")") diff --git a/bye_splits/utils/job_helpers.py b/bye_splits/utils/job_helpers.py index 764d38bd..428a4a87 100644 --- a/bye_splits/utils/job_helpers.py +++ b/bye_splits/utils/job_helpers.py @@ -22,7 +22,8 @@ def increment_version(file_path): return file_path def grab_most_recent(file_path, return_all=False): - """Grab the most recent version of the file corresponding to the template file_path (or return all matches)""" + """Grab the most recent version of the file corresponding to the template file_path (or return all matches). + Returns None if no files mathing template have been written.""" dir, file = os.path.split(file_path) base, ext = os.path.splitext(file) files = os.listdir(dir) @@ -38,7 +39,7 @@ def grab_most_recent(file_path, return_all=False): file_path = [dir + "/" + base + "_v" + str(f) + ext for f in matches] return file_path else: - raise ValueError("There are no versions of the passed file: {}".format(file_path)) + return None def compare_file_contents(file_path, buffer_list): """ @@ -64,7 +65,7 @@ def conditional_write(file_versions, file_template, current_version): to the current version. If an identical version is found, the function breaks and does nothing. Otherwise, it will write the contents in to an updated version number whose basename corresponds to - . + . If file_versions is None, writes the v0 version. """ if file_versions is not None: identical_version = False From 5dd003fbae73a4af9fd573ec84b302d28ba3efa5 Mon Sep 17 00:00:00 2001 From: Isaac Ehle Date: Mon, 16 Oct 2023 19:02:30 +0200 Subject: [PATCH 10/12] use python files as