+#!/usr/bin/env python3
+
+import datetime
+import fnmatch
+import os
+import re
+import requests
+import subprocess
+import threading  # threading is used below for the extraction worker threads
+import urllib3
+
 from dataset_records import *
-from os import listdir
-from os.path import isfile, join
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from mcm_store import get_mcm_dict
+from utils import get_from_deep_json
+
+# requests.get() below is called with verify=False, so silence the resulting
+# InsecureRequestWarning noise.
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+RECID_INFO = {}
+exec(open("inputs/recid_info.py", "r").read())  # import RECID_INFO
 
 
-exec(open('inputs/recid_info.py', 'r').read()) # import RECID_INFO
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+def log(recid, logtype, logmessage):
+    """Store a log message of a certain type in the record-ID-based log file."""
+    logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
+    if not os.path.exists(logdir):
+        os.makedirs(logdir)
+    with open(f"{logdir}/LOG.txt", "a") as fdesc:
+        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        fdesc.write(f"{now} | {logtype} | {logmessage}\n")
+
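+# Example: log(282000, "INFO", "message") appends a timestamped line to
+# ./lhe_generators/2016-sim/gridpacks/282000/LOG.txt (record ID 282000 is
+# illustrative only).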
 
 
-# get LHE Parent or False
 def get_lhe(dataset, mcm_dir):
-    path = mcm_dir + '/chain/' + dataset.replace('/', '@')
+    """Get LHE Parent or False"""
+    path = mcm_dir + "/chain/" + dataset.replace("/", "@")
     step_dirs = os.listdir(path)
     for step in step_dirs:
-        step_dir = path + '/' + step
-        datatier = get_from_deep_json(get_mcm_dict(dataset,step_dir),'datatier')
-        if "LHE" in datatier:
+        step_dir = path + "/" + step
+        datatier = get_from_deep_json(get_mcm_dict(dataset, step_dir), "datatier")
+        if "LHE" in datatier:
             return step_dir
 
     return False
 
 
-def cmd_run(cmds, dataset):
+def cmd_run(cmds, recid):
     for cmd in cmds:
-        err = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
-                             stdout=subprocess.PIPE).stderr.decode()
+        err = subprocess.run(
+            cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
+        ).stderr.decode()
         if err:
-            print("<pserr>\n[Error] in " + dataset + "\n==>\t" +
-                  err + "<==\n</pserr>", file=sys.stderr)
+            log(recid, "ERROR", f"Error {err}")
             return False
     return True
 
 
-def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
-# mcm_dir is the directory of the LHE step
-    fragment_url = get_genfragment_url(dataset, mcm_dir)
-    if fragment_url:
-        fragment_url = fragment_url[0]
-        fragment = requests.get(fragment_url, verify=False).text
-        if not fragment:
-            fragment = get_from_deep_json(
-                get_mcm_dict(dataset, mcm_dir), "fragment")
-    else:
-        fragment = get_from_deep_json(
-            get_mcm_dict(dataset, mcm_dir), "fragment")
+def create_lhe_generator(
+    dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
+):
+    # mcm_dir is the directory of the LHE step
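+    # A positive mcdb_id means the LHE step took its input from an MCDB
+    # article rather than a gridpack, so there is nothing to extract here
+    # (this reading of the mcdb_id field is an assumption based on McM
+    # conventions).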
+    mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
+    if mcdb_id > 0:
+        log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
+        return
+
+    # Find fragment
+    fragment_url = get_genfragment_url(dataset, mcm_dir)
+    if fragment_url:
+        fragment_url = fragment_url[0]
+        fragment = requests.get(fragment_url, verify=False).text
         if not fragment:
-            print("<emp>\n[Error] in" + dataset +
-                  "\n==>\t No fragment URL and Empty fragment in mcm dict, Skipping\n</emp>", file=sys.stderr)
-            return
-
-    path = re.search(r"cms.vstring\('(.*?)'", fragment)
-
-    if not path:
-        print("<vstring>\n[Warning] in" + dataset +
-              "\n==>\t 'cms.vstring' not found in fragment , Skipping\n</vstring>", file=sys.stderr)
-        return
-    path = path.group(1)
-    # print("found path: " + str(path) )
-    outfilepath = "{gen_store}/gridpacks/{recid}".format(
-        gen_store=gen_store, recid=recid)
-
-    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) != 0:
-        print(str(recid) + ' recid gridpack Exist, Skipping')
-        return
-
-    if 'amcatnlo' in path or 'amcatnlo' in dataset:
-        print(dataset + '\n' + str(recid) +
-              "amcatnlo gridpack!!! path:" + path)
-        files = [
-            'process/Cards/run_card.dat',
-            'process/Cards/proc_card*.dat',
-            'process/Cards/param_card.dat',
+            fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    else:
+        fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    if not fragment:
+        log(
+            recid,
+            "ERROR",
+            "No fragment URL and empty fragment in mcm dict; skipping.",
+        )
+        return
+
+    # Find gridpack path
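+    # Fragments reference their gridpack with a line such as
+    # cms.vstring('/cvmfs/cms.cern.ch/phys_generator/gridpacks/...')
+    # (illustrative path); the regex captures the quoted /cvmfs path.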
+    path = re.search(r"cms.vstring\('(/cvmfs.*?)'", fragment)
+    if not path:
+        log(
+            recid,
+            "ERROR",
+            "No 'cms.vstring(/cvmfs' found in fragment; skipping.",
+        )
+        return
+
+    path = path.group(1)
+    log(recid, "INFO", f"Found path {path}")
+    outfilepath = "{gen_store}/gridpacks/{recid}".format(
+        gen_store=gen_store, recid=recid
+    )
+    # The output directory already holds LOG.txt from the log() calls above,
+    # so more than one entry means gridpack files were extracted earlier.
+    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) > 1:
+        log(
+            recid,
+            "WARNING",
+            "Gridpack seems to exist for this record ID already. Skipping.",
+        )
+        return
+
+    # Identify gridpack case
+    gridpack_case = "UNKNOWN"
+    path_lower = path.lower()
+    path_lower_position = {}
+    for acase in ["amcatnlo", "madgraph", "powheg", "jhugen", "phantom", "mcfm"]:
+        path_lower_position[acase] = path_lower.find(acase)
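+    # Pick the generator whose name appears earliest in the path, in case
+    # the path happens to mention more than one of them.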
+    found = 1e10
+    for key, val in path_lower_position.items():
+        if 0 <= val < found:
+            gridpack_case = key
+            found = val
+    if gridpack_case == "UNKNOWN":
+        log(recid, "ERROR", f"Found case {gridpack_case}")
+    else:
+        log(recid, "INFO", f"Found case {gridpack_case}")
+
+    # List content of all files in the gridpack tarball
+    files_all = []
+    res = subprocess.check_output(f"tar tf {path}", shell=True)
+    for line in res.splitlines():
+        files_all.append(line.decode())
+
+    # Select interesting files based on gridpack case
+    files = [
+        "./InputCards/*.dat",
+        "./runcmsgrid.sh",
+        "InputCards/*.dat",
+        "runcmsgrid.sh",
+    ]
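+    # Each pattern is listed both with and without a leading "./" because
+    # tarballs differ in whether member names carry that prefix.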
+    if gridpack_case == "amcatnlo":
+        files.extend(
+            [
+                "./process/Cards/param_card.dat",
+                "./process/Cards/proc_card*.dat",
+                "./process/Cards/run_card.dat",
+                "process/Cards/param_card.dat",
+                "process/Cards/proc_card*.dat",
+                "process/Cards/run_card.dat",
             ]
-        mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
-    elif 'madgraph' in path:
-        files = [
-            'process/madevent/Cards/run_card.dat',
-            'process/madevent/Cards/proc_card*.dat',
-            'process/madevent/Cards/param_card.dat',
+        )
+    elif gridpack_case == "madgraph":
+        files.extend(
+            [
+                "./process/madevent/Cards/param_card.dat",
+                "./process/madevent/Cards/proc_card*.dat",
+                "./process/madevent/Cards/run_card.dat",
+                "process/madevent/Cards/param_card.dat",
+                "process/madevent/Cards/proc_card*.dat",
+                "process/madevent/Cards/run_card.dat",
             ]
-        mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
-    elif 'powheg' in path:
-        files = [
-            '*.input',
+        )
+    elif gridpack_case == "powheg":
+        files.extend(
+            [
+                "*.input",
             ]
-        mv_cmd = ""
-    else:
-        print("<path>\n[Error] Unknown path:('" + path +
-              "')\nDataset: " + dataset + '\n</path>', file=sys.stderr)
-        return
-
-    files = "'" + "' '".join(files) + "'"
+        )
+    elif gridpack_case == "jhugen":
+        files.extend(
+            [
+                "./jhugen.input",
+                "./jhugen_decay.input",
+                "jhugen.input",
+                "jhugen_decay.input",
+            ]
+        )
+    elif gridpack_case == "phantom":
+        files.extend(
+            [
+                "./r_GEN.in",
+                "r_GEN.in",
+            ]
+        )
+    elif gridpack_case == "mcfm":
+        files.extend(
+            [
+                "./readInput.DAT",
+                "readInput.DAT",
+            ]
+        )
+
+    # Select only those files that are present
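+    # fnmatch.filter() matches the shell-style globs above (e.g.
+    # "proc_card*.dat") against the tarball listing.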
+    files_selected = []
+    for afile in files:
+        files_selected.extend(fnmatch.filter(files_all, afile))
+
+    # Warn if runcmsgrid.sh or InputCards are missing for the cases that need them
+    if gridpack_case in ("amcatnlo", "madgraph"):
+        if "InputCards" not in " ".join(files_selected):
+            log(recid, "ERROR", "InputCards not present in the tarball.")
+        if "runcmsgrid.sh" not in " ".join(files_selected):
+            log(recid, "ERROR", "runcmsgrid.sh not present in the tarball.")
+
+    # Warn if no interesting files were found at all
+    if len(files_selected) == 0:
+        log(recid, "ERROR", "Found no interesting files at all.")
+    else:
+        # Inform about which files are going to be extracted
+        log(
+            recid,
+            "INFO",
+            f"Found the following interesting files: {' '.join(files_selected)}",
+        )
+    # Prepare the tarball extraction command
     cmds = [
-        "mkdir -p {out}; cd {out};\
-            tar -xf {path} {files} -C {out}; {mv}".format(out=outfilepath, path=path, files=files, mv=mv_cmd)
+        f"mkdir -p {outfilepath}; cd {outfilepath}; tar -xf {path} {' '.join(files_selected)} -C {outfilepath}"
     ]
-    # print("Prepared commands: " + str(cmds))
-    cmd_run(cmds, dataset)
+    log(recid, "INFO", f"Executing commands {cmds}")
+    # Run the tarball extraction command
+    cmd_run(cmds, recid)
+
+    # Print full content of gridpack tarball for debugging purposes
+    log(recid, "DEBUG", "Full gridpack tarball content is:")
+    for afile in files_all:
+        log(recid, "DEBUG", f"- {afile}")
 
 
 das_dir = "./inputs/das-json-store"
 mcm_dir = "./inputs/mcm-store"
-with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
+with open("./inputs/CMS-2016-mc-datasets.txt", "r") as file:
     dataset_full_names = file.readlines()
 
-dataset_nanoaod = [name[:-1] for name in dataset_full_names if name[:-1].endswith('NANOAODSIM')]
+dataset_nanoaod = [
+    name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
+]
 i = 1
 l = len(dataset_nanoaod)
 for dataset in dataset_nanoaod:
+    recid = RECID_INFO[dataset]
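+    # RECID_INFO maps each dataset name to its CERN Open Data record ID;
+    # it is loaded by the exec() call near the top of this script.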
 
-    #dataset = dataset[:-1]
+    print(f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Found record ID {recid}")
+    log(recid, "INFO", f"Found dataset {dataset}")
 
     lhe_dir = get_lhe(dataset, mcm_dir)
     if not lhe_dir:
+        log(recid, "WARNING", "There is no LHE directory. Skipping.")
         continue
 
-    recid = RECID_INFO[dataset]
-
-    print("Getting ({i}/{l}): {ds}".format(
-        i=i, l=l, ds=lhe_dir or 'No LHE parent for this record'))
+    log(recid, "INFO", f"Found LHE directory {lhe_dir}")
 
-    t = threading.Thread(target=create_lhe_generator,
-                         args=(dataset, recid, lhe_dir))
+    t = threading.Thread(target=create_lhe_generator, args=(dataset, recid, lhe_dir))
     t.start()
     i += 1
     while threading.activeCount() >= 20: