Skip to content
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
99 commits
Select commit Hold shift + click to select a range
b81e3ea
Add email notification system for scrontab-launched Rocoto workflows
AntonMFernando-NOAA Jan 7, 2026
bb35450
update rocoto scripts
AntonMFernando-NOAA Jan 13, 2026
227ece7
Merge changes into feature/scrontab branch
AntonMFernando-NOAA Jan 13, 2026
0275f6c
Revert "Add email notification system for scrontab-launched Rocoto wo…
AntonMFernando-NOAA Jan 13, 2026
573c451
update rocoto
AntonMFernando-NOAA Jan 14, 2026
51f6744
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 14, 2026
6a46e53
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 15, 2026
79dac8a
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 15, 2026
7dcfe25
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 15, 2026
7f16b5b
updated for a test
AntonMFernando-NOAA Jan 16, 2026
c3cf594
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 16, 2026
54f373f
update name
AntonMFernando-NOAA Jan 16, 2026
26bafaa
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 16, 2026
d21db79
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 16, 2026
1cee746
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 16, 2026
07a9f08
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
e9a8772
remove typo
AntonMFernando-NOAA Jan 20, 2026
31d8ae5
update ufs submodule
AntonMFernando-NOAA Jan 20, 2026
26b5946
update ufs_utils submodule
AntonMFernando-NOAA Jan 20, 2026
2d1bf18
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
22c3725
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
262178c
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
f53c449
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
c9f2745
Merge branch 'feature/scrontab' of https://github.com/AntonMFernando-…
AntonMFernando-NOAA Jan 20, 2026
0907055
shelcheck error fix
AntonMFernando-NOAA Jan 20, 2026
e0a5db9
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
d34feaa
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
966f0a2
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
1556bc5
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
a4d2024
shellcheck errors
AntonMFernando-NOAA Jan 20, 2026
abf88c8
update to fix shellcheck errors
AntonMFernando-NOAA Jan 20, 2026
4ab3636
update bash
AntonMFernando-NOAA Jan 20, 2026
bb65399
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
68af080
submodule update
AntonMFernando-NOAA Jan 20, 2026
6f010af
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 21, 2026
ad48031
added template
AntonMFernando-NOAA Jan 21, 2026
e1aab1f
Merge branch 'develop' into feature/scrontab
DavidHuber-NOAA Jan 22, 2026
325e44c
Update dev/workflow/rocoto/rocoto_scron.sh.j2
AntonMFernando-NOAA Jan 22, 2026
b12b1f4
added email settings
AntonMFernando-NOAA Jan 23, 2026
bd5eabb
removed typo
AntonMFernando-NOAA Jan 23, 2026
3a5c7e3
Update submodules to develop branch hashes
AntonMFernando-NOAA Jan 23, 2026
8fb44f9
Align submodules with upstream develop branch hashes
AntonMFernando-NOAA Jan 23, 2026
90b082c
cleaning
AntonMFernando-NOAA Jan 23, 2026
f11e95b
get REPLYTO from env variables
AntonMFernando-NOAA Jan 23, 2026
9d7a644
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 23, 2026
219ba1f
typos
AntonMFernando-NOAA Jan 23, 2026
e7badd0
bug
AntonMFernando-NOAA Jan 23, 2026
2d9aec5
change comment
AntonMFernando-NOAA Jan 23, 2026
447002b
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 26, 2026
8a626f4
added a message
AntonMFernando-NOAA Jan 26, 2026
ba292d9
add only to scrontab
AntonMFernando-NOAA Jan 26, 2026
49df8de
add scrontab read option
AntonMFernando-NOAA Jan 26, 2026
2ecdaac
update generate_workflow.sh
AntonMFernando-NOAA Jan 26, 2026
55d8149
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
f304a12
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
d13bcfe
update scripts
AntonMFernando-NOAA Jan 26, 2026
db30ff2
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
468f9e7
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
5b87973
updated replyto>mailto
AntonMFernando-NOAA Jan 26, 2026
0e7d161
Fix crontab MAILTO handling and conditional message display
AntonMFernando-NOAA Jan 26, 2026
158d7df
Add MAILTO to tests.cron regardless of -c flag usage
AntonMFernando-NOAA Jan 26, 2026
becde3e
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
a518d80
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
5577920
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
9dddf3a
Suppress MAILTO message when email already exists in crontab
AntonMFernando-NOAA Jan 26, 2026
0ed8c56
Fix MAILTO detection to match any MAILTO format
AntonMFernando-NOAA Jan 26, 2026
3b2ab13
update generate_workflows
AntonMFernando-NOAA Jan 26, 2026
9466f15
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
11992ab
typo
AntonMFernando-NOAA Jan 26, 2026
efcc25d
Format MAILTO line with consistent 65-character width
AntonMFernando-NOAA Jan 26, 2026
e7eb283
typo
AntonMFernando-NOAA Jan 27, 2026
5851901
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
7f8948a
typo
AntonMFernando-NOAA Jan 27, 2026
016b3c4
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
ecf4324
typo
AntonMFernando-NOAA Jan 27, 2026
e2dfd99
typos
AntonMFernando-NOAA Jan 27, 2026
a262543
shellcheck
AntonMFernando-NOAA Jan 27, 2026
ce44ca4
dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
dbf9d88
typo
AntonMFernando-NOAA Jan 27, 2026
84f56fe
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 27, 2026
cd59295
typo
AntonMFernando-NOAA Jan 27, 2026
caa4b4e
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
17d038e
delete extra file
AntonMFernando-NOAA Jan 27, 2026
2835d6b
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
ad32fd2
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
85440a9
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
9c08d00
remove comment
AntonMFernando-NOAA Jan 27, 2026
7b5d085
remove unncessary functions
AntonMFernando-NOAA Jan 27, 2026
2bb573c
update with cron check
AntonMFernando-NOAA Jan 27, 2026
d84ae51
warning conditions changes
AntonMFernando-NOAA Jan 27, 2026
bf49fb5
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
ecb5bd6
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
d6aba95
update warning
AntonMFernando-NOAA Jan 27, 2026
777ae9c
remove all the additional stuff. back to normal.
AntonMFernando-NOAA Jan 28, 2026
8975476
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 28, 2026
daf15b4
update generate_workflow
AntonMFernando-NOAA Jan 28, 2026
d82b122
update parm/globus/init_xfer.sh.j2
AntonMFernando-NOAA Jan 28, 2026
3fdafff
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 28, 2026
d10615d
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions dev/workflow/rocoto/rocoto_scron_template.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env bash
# Scrontab wrapper template: runs rocotorun for one experiment, then uses
# rocotostat to look for DEAD tasks and e-mails a report of the ones that were
# not already reported on a previous invocation.
#
# NOTE: this file is rendered with Python's str.format(). Single-brace fields
# (HOMEgfs, rocotorunstr, expdir, pslot, replyto, comroot) are substituted at
# generation time; every literal brace intended for bash must be doubled.
source "{HOMEgfs}/dev/ush/gw_setup.sh"

# Set platform variable if hostname contains 'gaea'
if [[ "$(hostname -s)" == *gaea* ]]; then
    platform="gaea"
else
    platform="unknown"
fi

# Run rocotorun
# Template variable {{rocotorunstr}} will be substituted by the Python template system
bash -c "{rocotorunstr}"

# Monitor for failed jobs using rocotostat.
# LOCKFILE holds the DEAD lines reported by the previous invocation so that
# each failure is mailed only once.
LOCKFILE="{expdir}/.failed_jobs.lock"
ROCOTOSTAT=$(command -v rocotostat)

if [[ -n "${{ROCOTOSTAT}}" ]]; then
    FAILED_JOBS=$("${{ROCOTOSTAT}}" -d "{expdir}/{pslot}.db" -w "{expdir}/{pslot}.xml" -c all 2> /dev/null | grep -E 'DEAD')

    if [[ -n "${{FAILED_JOBS}}" ]]; then
        # Read previously reported failures
        PREV_FAILED=""
        if [[ -f "${{LOCKFILE}}" ]]; then
            PREV_FAILED=$(cat "${{LOCKFILE}}")
        fi

        # Check for NEW failures only (not just changes)
        NEW_FAILURES=""
        while IFS= read -r job; do
            if [[ -n "${{job}}" ]] && ! echo "${{PREV_FAILED}}" | grep -qF -- "${{job}}"; then
                NEW_FAILURES="${{NEW_FAILURES}}${{job}}"$'\n'
            fi
        done <<< "${{FAILED_JOBS}}"

        # Send email only if there are NEW failures
        if [[ -n "${{NEW_FAILURES}}" ]]; then
            # Use an unpredictable temp file; fall back to the PID-based name
            # only if mktemp itself fails.
            MSGFILE=$(mktemp "${{TMPDIR:-/tmp}}/rocoto_fail_msg.XXXXXX") || MSGFILE="/tmp/rocoto_fail_msg_$$.txt"
            {{
                echo "The following jobs have failed in experiment {pslot} on ${{platform}}:"
                echo ""

                # Format each failed job with detailed information
                while IFS= read -r line; do
                    if [[ -n "${{line}}" ]]; then
                        # rocotostat columns:
                        #   CYCLE TASK JOBID STATE EXIT-STATUS TRIES DURATION
                        read -r cycle task jobid state exit_status tries duration <<< "${{line}}"
                        # Extract YYYYMMDDHH from cycle (first 10 characters)
                        cycle_short=${{cycle:0:10}}
                        # Get current timestamp
                        timestamp=$(date -u '+%m/%d/%y %H:%M:%S UTC')

                        echo "${{timestamp}} :: {pslot}.xml :: Cycle ${{cycle}}, Task ${{task}}, jobid=${{jobid}}, in state ${{state}}, exit status ${{exit_status}}, ran for ${{duration}} seconds, tries=${{tries}}"
                        echo "Check log: {comroot}/{pslot}/logs/${{cycle_short}}/${{task}}.log"
                        echo ""
                    fi
                done <<< "${{NEW_FAILURES}}"
            }} > "${{MSGFILE}}"

            # Try to send email
            EMAIL="{replyto}"
            hostname_domain=$(hostname -d)
            FROM_EMAIL="no-reply@${{hostname_domain}}"
            if [[ -n "${{EMAIL}}" ]] && command -v mail &> /dev/null; then
                # On Gaea, the mail utility requires the -v (verbose) flag to ensure delivery.
                # To avoid receiving verbose output as an actual email, a spoofed 'from' address is used for notifications.
                mail -r "${{FROM_EMAIL}}" -v -s "[{pslot}] Workflow Job Failures Detected" "${{EMAIL}}" < "${{MSGFILE}}" 2>&1
            fi

            rm -f "${{MSGFILE}}"
        fi

        # Always update lockfile to reflect current failures
        echo "${{FAILED_JOBS}}" > "${{LOCKFILE}}"
    else
        # No failures: drop any stale lockfile. Plain 'rm -f' succeeds whether
        # or not the file exists, so a clean run does not end the script with
        # a non-zero exit status (a failed '[[ -f ]] &&' test would).
        rm -f "${{LOCKFILE}}"
    fi
fi
54 changes: 43 additions & 11 deletions dev/workflow/rocoto/rocoto_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,37 @@ def write(self, xml_file: str = None, crontab_file: str = None):
if self._base["DO_ARCHCOM"] and self._base["ARCHCOM_TO"] == "globus_hpss":
self._write_server_crontab()

def _get_scron_script_content(self, rocotorunstr: str, replyto: str) -> str:
    """
    Load and format the cron script template with experiment-specific values.

    The template ``rocoto_scron_template.sh`` is read from the directory that
    contains this module and rendered with ``str.format``, so any literal
    brace in the template must be doubled.

    Parameters
    ----------
    rocotorunstr : str
        The rocotorun command string
    replyto : str
        Email address for notifications

    Returns
    -------
    str
        Formatted bash script content

    Raises
    ------
    OSError
        If the template file cannot be opened or read.
    KeyError
        If the template references a field that is not supplied below.
    """
    template_path = os.path.join(os.path.dirname(__file__), 'rocoto_scron_template.sh')

    # Decode explicitly as UTF-8 so the result does not depend on the locale
    # of the session that generates the workflow.
    with open(template_path, 'r', encoding='utf-8') as fh:
        template_content = fh.read()

    # NOTE(review): dict.get returns None when COMROOT is absent, which would
    # be rendered as the literal text "None" in the log path - confirm that
    # COMROOT is always defined in the base configuration.
    comroot = self._base.get('COMROOT')

    # Format the template with experiment-specific values
    return template_content.format(
        HOMEgfs=self.HOMEgfs,
        rocotorunstr=rocotorunstr,
        expdir=self.expdir,
        pslot=self.pslot,
        replyto=replyto,
        comroot=comroot,
    )

def _write_xml(self, xml_file: str = None) -> None:

if xml_file is None:
Expand All @@ -148,6 +179,10 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:
cronintstr = f'*/{cronint} * * * *'

replyto = os.environ.get('REPLYTO', "")
# Use fallback email if REPLYTO is not set
if not replyto:
user = os.environ.get('USER', 'user')
replyto = f"{user}@noaa.gov"

crontab_strings = [
'',
Expand All @@ -169,28 +204,25 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:
crontab_strings.extend([
f'#SCRON --partition={partition}',
f'#SCRON --account={account}',
f'#SCRON --mail-user={replyto}',
f'#SCRON --job-name={self.pslot}_scron',
f'#SCRON --output={self.expdir}/logs/scron.log',
'#SCRON --time=00:10:00',
'#SCRON --dependency=singleton'
f'#SCRON --time=00:10:00',
f'#SCRON --dependency=singleton'
])

# Now write the script that actually runs rocotorun
# Now write the script that actually runs rocotorun and monitors for failures
cron_cmd = f"{self.expdir}/{self.pslot}.scron.sh"
script_content = "#!/usr/bin/env bash\nset -x\n" + self._get_scron_script_content(rocotorunstr, replyto)

with open(cron_cmd, "w") as script_fh:
script_fh.write(
"#!/usr/bin/env bash\n" +
"set -x\n" +
f"source {self.HOMEgfs}/dev/ush/gw_setup.sh" + "\n" +
rocotorunstr + "\n"
)
script_fh.write(script_content)

# Make the script executable
mode = os.stat(cron_cmd)
os.chmod(cron_cmd, mode.st_mode | stat.S_IEXEC)
else:
cron_cmd = rocotorunstr
# For regular crontab, create a wrapper script with monitoring
cron_cmd = f"{self.expdir}/{self.pslot}.cron.sh"
crontab_strings.extend([
'SHELL="/bin/bash"',
f'MAILTO="{replyto}"'
Expand Down
Loading