Skip to content

Commit abb2d31

Browse files
update
1 parent e26af53 commit abb2d31

3 files changed

Lines changed: 105 additions & 10 deletions

File tree

scripts/das.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
parser = optparse.OptionParser()
77
parser.add_option("--get-parent", dest="get_parent", action="store_true", default=False)
88
parser.add_option("--get-files", dest="get_files", action="store_true", default=False)
9+
parser.add_option("--get-meta", dest="get_meta", action="store_true", default=False)
10+
parser.add_option("--get-cmsdb", dest="get_cmsdb", action="store_true", default=False)
911
parser.add_option("--get-xs", dest="get_xs", action="store_true", default=False)
1012
(opts, args) = parser.parse_args()
1113

@@ -46,6 +48,51 @@ def get_xs( dataset, nf=1 ):
4648
xs_unit = xs_line.split(" ")[-1]
4749

4850
return xs_val, xs_unit
51+
52+
def get_meta(dataset):
    """Collect metadata for *dataset* from DAS via dasgoclient.

    Queries dasgoclient twice: once for the dataset record (id, name,
    era, datatype) and once for the summary (file / event counts).

    Returns a dict with keys ``id``, ``name``, ``nfiles``, ``nevents``,
    ``is_data``, ``aux`` and ``procs`` — exactly the fields consumed by
    ``format_cmsdb()``.
    """
    # first get dataset info and id
    cmd = f"dasgoclient -query='dataset={dataset}' -json"
    infos = call(cmd)

    record = infos[0]["dataset"][0]
    dataset_id = record.get("dataset_id", -1)
    dataset_name = record["name"]
    # last character of the acquisition era, e.g. "Run2022C" -> "C"
    # (falls back to " " when DAS provides no era name)
    dataset_era = record.get("acquisition_era_name", " ")[-1]
    is_data = record.get("datatype", "") == "data"

    # Default aux to an empty dict so format_cmsdb() renders the valid
    # literal "aux={}" for MC datasets; an empty string would render as
    # the syntactically invalid "aux=".
    era_aux = {}
    procs = ""
    if is_data:
        era_aux = {"era": dataset_era}
        procs = "procs.data"

    # then get number of files and events
    cmd = f"dasgoclient -query='summary dataset={dataset}' -json"
    infos = call(cmd)

    summary = infos[0]["summary"][0]
    nfiles = summary["nfiles"]
    nevents = summary["nevents"]

    return {
        "id": dataset_id,
        "name": dataset_name,
        "nfiles": nfiles,
        "nevents": nevents,
        "is_data": is_data,
        "aux": era_aux,
        "procs": procs,
    }
76+
77+
78+
def format_cmsdb( dataset ):
    """Render a cmsdb ``cpn.add_dataset(...)`` code snippet for *dataset*.

    Fills the template with the metadata dict returned by get_meta();
    the ``name`` field is deliberately left blank ("") for the caller
    to fill in by hand, while the DAS dataset path goes into ``keys``.
    """
    # NOTE(review): template indentation reconstructed from a diff view —
    # confirm against the original file before relying on exact output.
    template = """
cpn.add_dataset(
    name="",
    id={id},
    is_data={is_data},
    processes=[{procs}],
    keys=[
        "{name}",
    ],
    n_files={nfiles},
    n_events={nevents},
    aux={aux}
)
"""
    return template.format( **get_meta(dataset) )
94+
95+
4996

5097
# loop over all arguments
5198
out_dict = {}
@@ -76,6 +123,10 @@ def get_xs( dataset, nf=1 ):
76123
ret = get_parent(dataset)
77124
elif opts.get_xs:
78125
ret = get_xs(dataset)
126+
elif opts.get_meta:
127+
ret = get_meta(dataset)
128+
elif opts.get_cmsdb:
129+
ret = format_cmsdb(dataset)
79130
else:
80131
# call dasgoclient command
81132
cmd = f"dasgoclient -query='dataset={dataset}' -json"

scripts/pq.py

Lines changed: 53 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
1-
print("using pq.py")
21
import awkward as ak
32
import pickle
43
import numpy as np
54
import sys, os
5+
import pyarrow.parquet as pq
66

7-
print("call 'help()' to show features")
87
def help():
98
print("usage:")
109
print("\tpq [parquet/pickle files]")
@@ -16,14 +15,6 @@ def load_file(f):
1615
else:
1716
return ak.from_parquet(f)
1817

19-
infiles = sys.argv[1:]
20-
files = [
21-
load_file(f)
22-
for f in infiles
23-
]
24-
data = files[0]
25-
26-
2718
def to_np(arr):
2819
if arr.ndim > 1:
2920
return ak.flatten(arr).to_numpy()
@@ -47,3 +38,55 @@ def percentiles(arr, percentiles=[0, 5, 10, 25, 50, 75, 90, 95, 100], ret=False)
4738
print("percentiles:")
4839
for a, b in zip(percentiles, x):
4940
print(f"{a}%: {b}")
41+
42+
def validate(f):
    """Check that *f* is a readable parquet or pickle file.

    ``.parquet`` files are opened via pyarrow (reading the footer and
    metadata is enough to detect truncation); anything else is treated
    as a pickle and fully unpickled.

    Returns True when the file loads cleanly, False (after printing a
    message) when it is broken.
    """
    # NOTE: pickle.load executes arbitrary code — only run this on
    # trusted files.
    try:
        if f.endswith(".parquet"):
            # opening the ParquetFile validates the footer; the handle
            # itself is not needed, so no assignment
            pq.ParquetFile(f)
        else:
            with open(f, "rb") as pf:
                pickle.load(pf)

    except Exception:
        # a bare `except:` would also swallow KeyboardInterrupt /
        # SystemExit; keep it to real errors so Ctrl-C works during
        # long validation scans
        print(f"Broken file {f}")
        return False

    return True
55+
56+
if __name__ == "__main__":
    import optparse
    import glob

    parser = optparse.OptionParser()
    # store_true is required here: without it "-v" is a valued option
    # and silently consumes the first file name as its argument
    # (breaking the `pq_validate` alias, which passes a bare -v)
    parser.add_option("-v", "--validate", dest="validate", action="store_true", default=False)
    parser.add_option("--del", dest="delete", action="store_true", default=False)
    (opts, args) = parser.parse_args()

    # expand any shell-style wildcards that reached argv unexpanded
    infiles = []
    for f in args:
        if "*" in f:
            # glob.glob takes a single pattern string, not the whole
            # argument list (glob.glob(args) raised TypeError)
            infiles += glob.glob(f)
        else:
            infiles.append(f)

    if opts.validate:
        print(f"validating {len(infiles)} pq files...")
        i = 0
        for f in infiles:
            if not validate(f):
                i += 1
                # only remove files that actually failed validation
                if opts.delete:
                    print("\t--> Removing file")
                    os.remove(f)
        print(f"\n--> {i}/{len(infiles)} broken")

    else:
        # interactive mode (run via `python -i`): load every file and
        # expose the first one as `data` for quick inspection
        print("using pq.py")
        print("call 'help()' to show features")
        files = [
            load_file(f)
            for f in infiles
        ]
        # avoid an IndexError when started with no file arguments
        data = files[0] if files else None
91+
92+

setup.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ alias condorSubmit='python3 $TOOLBOX/scripts/condorSubmit.py'
1111
alias summary='python3 -m correctionlib.cli summary'
1212

1313
alias pq='python3 -i $TOOLBOX/scripts/pq.py'
14+
alias pq_validate='python3 $TOOLBOX/scripts/pq.py -v'
1415
alias das='python3 $TOOLBOX/scripts/das.py'
1516

1617
PYTHONPATH=$TOOLBOX:$PYTHONPATH

0 commit comments

Comments
 (0)