Skip to content

Commit 911fa12

Browse files
committed
Issue #5: Copy slurm-status.py from https://github.com/Snakemake-Profiles/slurm
use --cluster-status slurm-status.py to recognize slurm-killed jobs as dead
1 parent 54a18dc commit 911fa12

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed

slurm-status.py

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env python3
2+
#
3+
import re
4+
import subprocess as sp
5+
import shlex
6+
import sys
7+
import time
8+
import logging
9+
10+
# Cluster-status helper: given a Slurm job id as the first command-line
# argument, print exactly one of "success", "failed" or "running" on stdout.
logger = logging.getLogger(__name__)

# Maximum number of polls (spaced one second apart) before giving up and
# reporting the job as failed.
STATUS_ATTEMPTS = 20

# Slurm job states that are reported as "failed". Any state starting with
# "CANCELLED" is handled separately in translate_state().
# Unclear whether SUSPENDED should be treated as running or failed
FAILED_STATES = frozenset(
    [
        "BOOT_FAIL",
        "OUT_OF_MEMORY",
        "DEADLINE",
        "FAILED",
        "NODE_FAIL",
        "PREEMPTED",
        "TIMEOUT",
        "SUSPENDED",
    ]
)


def parse_sacct_output(output):
    """Return a dict mapping the first "|"-separated field of each line to the second.

    Lines with fewer than two fields (including blank output) are skipped,
    so an empty or unparsable listing yields an empty dict.
    """
    states = {}
    for line in output.strip().splitlines():
        fields = line.split("|")
        if len(fields) >= 2:
            states[fields[0]] = fields[1]
    return states


def parse_scontrol_output(output):
    """Return the value of the first ``JobState=<word>`` field, or None if absent."""
    match = re.search(r"JobState=(\w+)", output)
    return match.group(1) if match else None


def translate_state(status):
    """Map a Slurm job state string to "success", "failed" or "running".

    "COMPLETED" is the only success state. Members of FAILED_STATES and any
    state beginning with "CANCELLED" are failures. Everything else is
    reported as still running.
    """
    if status == "COMPLETED":
        return "success"
    if status in FAILED_STATES or status.startswith("CANCELLED"):
        return "failed"
    return "running"


def lookup_state(jobid, run=sp.check_output):
    """Query the state of *jobid* once, preferring sacct and falling back to scontrol.

    Args:
        jobid: Job id string, passed to each tool as a single argument.
        run: Callable taking an argv list and returning the command's stdout
            as bytes; defaults to ``subprocess.check_output``.

    Returns:
        The state string, or None when neither tool produced a state for
        the job (command failed, or its output did not mention the job).
    """
    try:
        sacct_out = run(["sacct", "-P", "-b", "-j", jobid, "-n"])
    except sp.CalledProcessError as e:
        logger.error("sacct process error")
        logger.error(e)
    else:
        state = parse_sacct_output(sacct_out.decode()).get(jobid)
        if state is not None:
            return state

    # Try getting job with scontrol instead in case sacct is misconfigured
    try:
        sctrl_out = run(["scontrol", "-o", "show", "job", jobid])
    except sp.CalledProcessError as e:
        logger.error("scontrol process error")
        logger.error(e)
        return None
    return parse_scontrol_output(sctrl_out.decode())


def main():
    """Poll for the job's state and print the translated status on stdout."""
    if len(sys.argv) < 2:
        sys.exit("usage: {} JOBID".format(sys.argv[0]))
    jobid = sys.argv[1]

    state = None
    for attempt in range(STATUS_ATTEMPTS):
        state = lookup_state(jobid)
        if state is not None:
            break
        # Only sleep when another attempt will follow.
        if attempt < STATUS_ATTEMPTS - 1:
            time.sleep(1)

    # No state after all attempts is reported as a failed job, with a
    # normal (zero) exit status.
    print("failed" if state is None else translate_state(state))


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)