Commit b922c4c

WIP

1 parent 6b12500 commit b922c4c

2 files changed: +213 −91 lines changed

Diff for: cms-2016-simulated-datasets/README.md (+8 −8)
@@ -43,7 +43,7 @@ Warning: Creating the full local cache might take a long time.
 First step is to create EOS file index cache:
 
 ```console
-$ time python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
 ```
 
 This requires the data files to be placed in their final location. However, for
@@ -53,17 +53,17 @@ by means of adding the command-line option `--ignore-eos-store` to the commands
 We can now build sample records by doing:
 
 ```console
-$ time python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
 $ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt
-$ time python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
-$ time python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
-$ time python3 code/lhe_generators.py
+$ python3 code/lhe_generators.py
 
-$ time python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
-$ time python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 ```
 
 Note that to build the test records an (empty) input file for DOIs and a recid
@@ -80,7 +80,7 @@ The output JSON files for the dataset records will be generated in the
 
 
 ```console
-python3 code/lhe_generators.py 2> errors > output &
+$ python3 code/lhe_generators.py >& output
 ```
 
 - This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt`.
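A side note on the last change above: in Bash, `>& output` sends both standard output and standard error to `output`. The portable long form of the same redirection, shown here for reference, is:

```console
$ python3 code/lhe_generators.py > output 2>&1
```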

Diff for: cms-2016-simulated-datasets/code/lhe_generators.py (+205 −83)
@@ -1,126 +1,248 @@
+#!/usr/bin/env python3
+
+import datetime
+import fnmatch
+import os
+import re
+import requests
+import subprocess
+import urllib3
+
 from dataset_records import *
-from os import listdir
-from os.path import isfile, join
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from mcm_store import get_mcm_dict
+from utils import get_from_deep_json
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+RECID_INFO = {}
+exec(open("inputs/recid_info.py", "r").read())  # import RECID_INFO
 
 
-exec(open('inputs/recid_info.py', 'r').read()) # import RECID_INFO
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+def log(recid, logtype, logmessage):
+    """Store a log message of a certain type to record-ID-based log file system."""
+    logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
+    if not os.path.exists(logdir):
+        os.makedirs(logdir)
+    with open(f"{logdir}/LOG.txt", "a") as fdesc:
+        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        fdesc.write(f"{now} | {logtype} | {logmessage}\n")
+
 
-# get LHE Parent or False
 def get_lhe(dataset, mcm_dir):
-    path = mcm_dir + '/chain/' + dataset.replace('/', '@')
+    """Get LHE Parent or False"""
+    path = mcm_dir + "/chain/" + dataset.replace("/", "@")
     step_dirs = os.listdir(path)
     for step in step_dirs:
-        step_dir = path + '/' + step
-        datatier = get_from_deep_json(get_mcm_dict(dataset,step_dir),'datatier')
-        if "LHE" in datatier:
+        step_dir = path + "/" + step
+        datatier = get_from_deep_json(get_mcm_dict(dataset, step_dir), "datatier")
+        if "LHE" in datatier:
             return step_dir
 
     return False
 
 
-def cmd_run(cmds, dataset):
+def cmd_run(cmds, recid):
     for cmd in cmds:
-        err = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
-                             stdout=subprocess.PIPE).stderr.decode()
+        err = subprocess.run(
+            cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
+        ).stderr.decode()
         if err:
-            print("<pserr>\n[Error] in " + dataset + "\n==>\t" +
-                  err + "<==\n</pserr>", file=sys.stderr)
+            log(recid, "ERROR", f"Error {err}")
             return False
     return True
 
 
-def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
-    # mcm_dir is the directory of the LHE step
-    fragment_url = get_genfragment_url(dataset, mcm_dir)
-    if fragment_url:
-        fragment_url = fragment_url[0]
-        fragment = requests.get(fragment_url, verify=False).text
-        if not fragment:
-            fragment = get_from_deep_json(
-                get_mcm_dict(dataset, mcm_dir), "fragment")
-    else:
-        fragment = get_from_deep_json(
-            get_mcm_dict(dataset, mcm_dir), "fragment")
+def create_lhe_generator(
+    dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
+):
+    # mcm_dir is the directory of the LHE step
+    mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
+    if mcdb_id > 0:
+        log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
+        return
+
+    # Find fragment
+    fragment_url = get_genfragment_url(dataset, mcm_dir)
+    if fragment_url:
+        fragment_url = fragment_url[0]
+        fragment = requests.get(fragment_url, verify=False).text
         if not fragment:
-            print("<emp>\n[Error] in" + dataset +
-                  "\n==>\t No fragment URL and Empty fragment in mcm dict, Skipping\n</emp>", file=sys.stderr)
-            return
-
-    path = re.search(r"cms.vstring\('(.*?)'", fragment)
-
-    if not path:
-        print("<vstring>\n[Warning] in" + dataset +
-              "\n==>\t 'cms.vstring' not found in fragment , Skipping\n</vstring>", file=sys.stderr)
-        return
-    path = path.group(1)
-    # print("found path: " + str(path) )
-    outfilepath = "{gen_store}/gridpacks/{recid}".format(
-        gen_store=gen_store, recid=recid)
-
-    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) != 0:
-        print(str(recid) + ' recid gridpack Exist, Skipping')
-        return
-
-    if 'amcatnlo' in path or 'amcatnlo' in dataset:
-        print(dataset + '\n' + str(recid) +
-              "amcatnlo gridpack!!! path:" + path)
-        files = [
-            'process/Cards/run_card.dat',
-            'process/Cards/proc_card*.dat',
-            'process/Cards/param_card.dat',
+            fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    else:
+        fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    if not fragment:
+        log(
+            recid,
+            "ERROR",
+            f"No fragment URL and Empty fragment in mcm dict; skipping.",
+        )
+        return
+
+    # Find gridpack path
+    path = re.search(r"cms.vstring\('(/cvmfs.*?)'", fragment)
+    if not path:
+        log(
+            recid,
+            "ERROR",
+            f"No 'cms.vstring(/cvmfs' found in fragment; skipping.",
+        )
+        return
+
+    path = path.group(1)
+    log(recid, "INFO", f"Found path {path}")
+    outfilepath = "{gen_store}/gridpacks/{recid}".format(
+        gen_store=gen_store, recid=recid
+    )
+    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) > 1:
+        log(
+            recid,
+            "WARNING",
+            f"Gridpack seems to exist for this record ID already. Skipping.",
+        )
+        return
+
+    # Identify gridpack case
+    gridpack_case = "UNKNOWN"
+    path_lower = path.lower()
+    path_lower_position = {}
+    for acase in ["amcatnlo", "madgraph", "powheg", "jhugen", "phantom", "mcfm"]:
+        path_lower_position[acase] = path_lower.find(acase)
+    found = 1e10
+    for key, val in path_lower_position.items():
+        if val > 0 and val < found:
+            gridpack_case = key
+    if gridpack_case == "UNKNOWN":
+        log(recid, "ERROR", f"Found case {gridpack_case}")
+    else:
+        log(recid, "INFO", f"Found case {gridpack_case}")
+
+    # List content of all files in gridpack tarball
+    files_all = []
+    res = subprocess.check_output(f"tar tf {path}", shell=True)
+    for line in res.splitlines():
+        files_all.append(line.decode())
+
+    # Select interesting files based on gridpack case
+    files = [
+        "./InputCards/*.dat",
+        "./runcmsgrid.sh",
+        "InputCards/*.dat",
+        "runcmsgrid.sh",
+    ]
+    if gridpack_case == "amcatnlo":
+        files.extend(
+            [
+                "./process/Cards/param_card.dat",
+                "./process/Cards/proc_card*.dat",
+                "./process/Cards/run_card.dat",
+                "process/Cards/param_card.dat",
+                "process/Cards/proc_card*.dat",
+                "process/Cards/run_card.dat",
             ]
-        mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
-    elif 'madgraph' in path:
-        files = [
-            'process/madevent/Cards/run_card.dat',
-            'process/madevent/Cards/proc_card*.dat',
-            'process/madevent/Cards/param_card.dat',
+        )
+    elif gridpack_case == "madgraph":
+        files.extend(
+            [
+                "./process/madevent/Cards/param_card.dat",
+                "./process/madevent/Cards/proc_card*.dat",
+                "./process/madevent/Cards/run_card.dat",
+                "process/madevent/Cards/param_card.dat",
+                "process/madevent/Cards/proc_card*.dat",
+                "process/madevent/Cards/run_card.dat",
             ]
-        mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
-    elif 'powheg' in path:
-        files = [
-            '*.input',
+        )
+    elif gridpack_case == "powheg":
+        files.extend(
+            [
+                "*.input",
             ]
-        mv_cmd = ""
-    else:
-        print("<path>\n[Error] Unknown path:('" + path +
-              "')\nDataset: " + dataset + '\n</path>', file=sys.stderr)
-        return
-
-    files = "'" + "' '".join(files) + "'"
+        )
+    elif gridpack_case == "jhugen":
+        files.extend(
+            [
+                "./jhugen.input",
+                "./jhugen_decay.input",
+                "jhugen.input",
+                "jhugen_decay.input",
+            ]
+        )
+    elif gridpack_case == "phantom":
+        files.extend(
+            [
+                "./r_GEN.in",
+                "r_GEN.in",
+            ]
+        )
+    elif gridpack_case == "mcfm":
+        files.extend(
+            [
+                "./readInput.DAT",
+                "readInput.DAT",
+            ]
+        )
+
+    # Select only those files that are present
+    files_selected = []
+    for afile in files:
+        files_selected.extend(fnmatch.filter(files_all, afile))
+
+    # Warn if there was no runcmsgrid or InputCards found for some cases
+    if gridpack_case in ("amcatnlo", "madgraph"):
+        if not "InputCards" in " ".join(files_selected):
+            log(recid, "ERROR", f"InputCards not present in the tarball.")
+        if not "runcmsgrid.sh" in " ".join(files_selected):
+            log(recid, "ERROR", f"runcmsgrid.sh not present in the tarball.")
+
+    # Warn if no interesting files were found at all
+    if len(files_selected) == 0:
+        log(recid, "ERROR", "Found no interesting files at all.")
+    else:
+        # Inform about which files are going to be extracted
+        log(
+            recid,
+            "INFO",
+            f"Found the following interesting files: {' '.join(files_selected)}",
+        )
+    # Prepare the tarball extraction command
     cmds = [
-        "mkdir -p {out}; cd {out};\
-        tar -xf {path} {files} -C {out}; {mv}".format(out=outfilepath, path=path, files=files, mv=mv_cmd)
+        f"mkdir -p {outfilepath}; cd {outfilepath}; tar -xf {path} {' '.join(files_selected)} -C {outfilepath}"
     ]
-    # print("Prepared commands: " + str(cmds))
-    cmd_run(cmds, dataset)
+    log(recid, "INFO", f"Executing commands {cmds}")
+    # Run the tarball extraction command
+    cmd_run(cmds, recid)
+
+    # Print full content of gridpack tarball for debugging purposes
+    log(recid, "DEBUG", f"Full gridpack tarball content is:")
+    for afile in files_all:
+        log(recid, "DEBUG", f"- {afile}")
 
 
 das_dir = "./inputs/das-json-store"
 mcm_dir = "./inputs/mcm-store"
-with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
+with open("./inputs/CMS-2016-mc-datasets.txt", "r") as file:
     dataset_full_names = file.readlines()
 
-dataset_nanoaod = [name[:-1] for name in dataset_full_names if name[:-1].endswith('NANOAODSIM')]
+dataset_nanoaod = [
+    name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
+]
 i = 1
 l = len(dataset_nanoaod)
 for dataset in dataset_nanoaod:
+    recid = RECID_INFO[dataset]
 
-    #dataset = dataset[:-1]
+    print(f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Found record ID {recid}")
+    log(recid, "INFO", f"Found dataset {dataset}")
 
     lhe_dir = get_lhe(dataset, mcm_dir)
     if not lhe_dir:
+        log(recid, "WARNING", f"There is no LHE directory. Skipping.")
         continue
 
-    recid = RECID_INFO[dataset]
-
-    print("Getting ({i}/{l}): {ds}".format(
-        i=i, l=l, ds=lhe_dir or 'No LHE parent for this record'))
+    log(recid, "INFO", f"Found LHE directory {lhe_dir}")
 
-    t = threading.Thread(target=create_lhe_generator,
-                         args=(dataset, recid, lhe_dir))
+    t = threading.Thread(target=create_lhe_generator, args=(dataset, recid, lhe_dir))
     t.start()
     i += 1
     while threading.activeCount() >= 20:
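One remark on the new gridpack case detection: as committed, `found` is never updated inside the loop, so the `val < found` guard always compares against `1e10`, and the net effect is that the last generator name with a match wins rather than the one appearing earliest in the path. A self-contained sketch of the presumably intended earliest-match logic (the example path is hypothetical):

```python
# Sketch of the case detection in create_lhe_generator() above,
# updating `found` so that the earliest match in the path wins.
# The example path is hypothetical.
path = "/cvmfs/cms.cern.ch/phys_generator/gridpacks/madgraph/ttbar_gridpack.tar.xz"

path_lower = path.lower()
gridpack_case = "UNKNOWN"
found = 1e10  # position of the earliest generator name seen so far
for acase in ["amcatnlo", "madgraph", "powheg", "jhugen", "phantom", "mcfm"]:
    pos = path_lower.find(acase)
    if 0 < pos < found:  # find() returns -1 when absent; keep earliest match
        gridpack_case = acase
        found = pos

print(gridpack_case)  # -> madgraph
```

Adding `found = val` after `gridpack_case = key` in the committed loop would achieve the same.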

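The file selection relies on `fnmatch.filter()`, whose `*` matches any characters including `/` (unlike shell globbing); listing each pattern both with and without the leading `./` covers tarballs whose member names do or do not carry that prefix. A minimal sketch with a hypothetical tarball listing:

```python
import fnmatch

# Hypothetical `tar tf` listing of a gridpack tarball.
files_all = [
    "./runcmsgrid.sh",
    "./InputCards/proc_card.dat",
    "./lib/pdf_data.tar.gz",
]

# Keep only the files matching the interesting patterns.
patterns = ["./InputCards/*.dat", "./runcmsgrid.sh"]
files_selected = []
for pattern in patterns:
    files_selected.extend(fnmatch.filter(files_all, pattern))

print(files_selected)  # ['./InputCards/proc_card.dat', './runcmsgrid.sh']
```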
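For completeness, a usage sketch of the new per-record logging scheme; the directory layout matches the `log()` definition in the diff above, while the record ID and the timestamp in the expected output are hypothetical:

```python
import datetime
import os


def log(recid, logtype, logmessage):
    # Same behaviour as the log() helper in the diff above.
    logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
    os.makedirs(logdir, exist_ok=True)
    with open(f"{logdir}/LOG.txt", "a") as fdesc:
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        fdesc.write(f"{now} | {logtype} | {logmessage}\n")


# The record ID 12345 is hypothetical.
log(12345, "INFO", "Found case madgraph")
# ./lhe_generators/2016-sim/gridpacks/12345/LOG.txt now ends with a line like:
# 2024-01-01 12:00:00 | INFO | Found case madgraph
```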