Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
99 commits
Select commit Hold shift + click to select a range
b81e3ea
Add email notification system for scrontab-launched Rocoto workflows
AntonMFernando-NOAA Jan 7, 2026
bb35450
update rocoto scripts
AntonMFernando-NOAA Jan 13, 2026
227ece7
Merge changes into feature/scrontab branch
AntonMFernando-NOAA Jan 13, 2026
0275f6c
Revert "Add email notification system for scrontab-launched Rocoto wo…
AntonMFernando-NOAA Jan 13, 2026
573c451
update rocoto
AntonMFernando-NOAA Jan 14, 2026
51f6744
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 14, 2026
6a46e53
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 15, 2026
79dac8a
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 15, 2026
7dcfe25
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 15, 2026
7f16b5b
updated for a test
AntonMFernando-NOAA Jan 16, 2026
c3cf594
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 16, 2026
54f373f
update name
AntonMFernando-NOAA Jan 16, 2026
26bafaa
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 16, 2026
d21db79
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 16, 2026
1cee746
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 16, 2026
07a9f08
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
e9a8772
remove typo
AntonMFernando-NOAA Jan 20, 2026
31d8ae5
update ufs submodule
AntonMFernando-NOAA Jan 20, 2026
26b5946
update ufs_utils submodule
AntonMFernando-NOAA Jan 20, 2026
2d1bf18
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
22c3725
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
262178c
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
f53c449
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
c9f2745
Merge branch 'feature/scrontab' of https://github.com/AntonMFernando-…
AntonMFernando-NOAA Jan 20, 2026
0907055
shellcheck error fix
AntonMFernando-NOAA Jan 20, 2026
e0a5db9
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
d34feaa
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
966f0a2
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
1556bc5
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
a4d2024
shellcheck errors
AntonMFernando-NOAA Jan 20, 2026
abf88c8
update to fix shellcheck errors
AntonMFernando-NOAA Jan 20, 2026
4ab3636
update bash
AntonMFernando-NOAA Jan 20, 2026
bb65399
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
68af080
submodule update
AntonMFernando-NOAA Jan 20, 2026
6f010af
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 21, 2026
ad48031
added template
AntonMFernando-NOAA Jan 21, 2026
e1aab1f
Merge branch 'develop' into feature/scrontab
DavidHuber-NOAA Jan 22, 2026
325e44c
Update dev/workflow/rocoto/rocoto_scron.sh.j2
AntonMFernando-NOAA Jan 22, 2026
b12b1f4
added email settings
AntonMFernando-NOAA Jan 23, 2026
bd5eabb
removed typo
AntonMFernando-NOAA Jan 23, 2026
3a5c7e3
Update submodules to develop branch hashes
AntonMFernando-NOAA Jan 23, 2026
8fb44f9
Align submodules with upstream develop branch hashes
AntonMFernando-NOAA Jan 23, 2026
90b082c
cleaning
AntonMFernando-NOAA Jan 23, 2026
f11e95b
get REPLYTO from env variables
AntonMFernando-NOAA Jan 23, 2026
9d7a644
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 23, 2026
219ba1f
typos
AntonMFernando-NOAA Jan 23, 2026
e7badd0
bug
AntonMFernando-NOAA Jan 23, 2026
2d9aec5
change comment
AntonMFernando-NOAA Jan 23, 2026
447002b
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 26, 2026
8a626f4
added a message
AntonMFernando-NOAA Jan 26, 2026
ba292d9
add only to scrontab
AntonMFernando-NOAA Jan 26, 2026
49df8de
add scrontab read option
AntonMFernando-NOAA Jan 26, 2026
2ecdaac
update generate_workflow.sh
AntonMFernando-NOAA Jan 26, 2026
55d8149
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
f304a12
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
d13bcfe
update scripts
AntonMFernando-NOAA Jan 26, 2026
db30ff2
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
468f9e7
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
5b87973
updated replyto>mailto
AntonMFernando-NOAA Jan 26, 2026
0e7d161
Fix crontab MAILTO handling and conditional message display
AntonMFernando-NOAA Jan 26, 2026
158d7df
Add MAILTO to tests.cron regardless of -c flag usage
AntonMFernando-NOAA Jan 26, 2026
becde3e
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
a518d80
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
5577920
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
9dddf3a
Suppress MAILTO message when email already exists in crontab
AntonMFernando-NOAA Jan 26, 2026
0ed8c56
Fix MAILTO detection to match any MAILTO format
AntonMFernando-NOAA Jan 26, 2026
3b2ab13
update generate_workflows
AntonMFernando-NOAA Jan 26, 2026
9466f15
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
11992ab
typo
AntonMFernando-NOAA Jan 26, 2026
efcc25d
Format MAILTO line with consistent 65-character width
AntonMFernando-NOAA Jan 26, 2026
e7eb283
typo
AntonMFernando-NOAA Jan 27, 2026
5851901
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
7f8948a
typo
AntonMFernando-NOAA Jan 27, 2026
016b3c4
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
ecf4324
typo
AntonMFernando-NOAA Jan 27, 2026
e2dfd99
typos
AntonMFernando-NOAA Jan 27, 2026
a262543
shellcheck
AntonMFernando-NOAA Jan 27, 2026
ce44ca4
dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
dbf9d88
typo
AntonMFernando-NOAA Jan 27, 2026
84f56fe
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 27, 2026
cd59295
typo
AntonMFernando-NOAA Jan 27, 2026
caa4b4e
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
17d038e
delete extra file
AntonMFernando-NOAA Jan 27, 2026
2835d6b
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
ad32fd2
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
85440a9
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
9c08d00
remove comment
AntonMFernando-NOAA Jan 27, 2026
7b5d085
remove unnecessary functions
AntonMFernando-NOAA Jan 27, 2026
2bb573c
update with cron check
AntonMFernando-NOAA Jan 27, 2026
d84ae51
warning conditions changes
AntonMFernando-NOAA Jan 27, 2026
bf49fb5
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
ecb5bd6
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
d6aba95
update warning
AntonMFernando-NOAA Jan 27, 2026
777ae9c
remove all the additional stuff. back to normal.
AntonMFernando-NOAA Jan 28, 2026
8975476
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 28, 2026
daf15b4
update generate_workflow
AntonMFernando-NOAA Jan 28, 2026
d82b122
update parm/globus/init_xfer.sh.j2
AntonMFernando-NOAA Jan 28, 2026
3fdafff
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 28, 2026
d10615d
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 62 additions & 7 deletions dev/workflow/generate_workflows.sh
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,18 @@ function send_email() {
echo "${_body}" | mail -s "${_subject}" "${_email}"
}

# Print a one-line warning telling scrontab users how to enable job failure
# notifications.
# Reads the globals _use_scron and _set_email and the REPLYTO environment
# variable. Nothing is printed unless scrontab is in use, no email was set
# (_set_email is false), and REPLYTO is unset or empty.
function mail_warning() {
    # Guard clauses, checked in the same order as a short-circuit && chain
    [[ "${_use_scron}" == true ]] || return 0
    [[ "${_set_email}" == false ]] || return 0
    [[ -z "${REPLYTO:-}" ]] || return 0

    echo -e "\033[0;33mWARNING:\033[0m Set \033[0;32mexport REPLYTO=\"your_email\"\033[0m in your .bashrc or use generate_workflows.sh with \033[0;32m-e \"your_email\"\033[0m to receive job failure notifications."
}

# Export REPLYTO if email was provided via -e flag and is not empty.
# Exporting makes the address visible to child processes started after this
# point; when the condition is false any REPLYTO already in the environment is
# left untouched.
# NOTE(review): this runs at top level, so _set_email and _email must already
# be assigned when this line is reached -- confirm it sits after option parsing.
if [[ "${_set_email}" == "true" && -n "${_email}" ]]; then
export REPLYTO="${_email}"
fi

function delete_dir() {
local dir_to_rm="${1:-}"
if [[ -z "${dir_to_rm}" ]]; then
Expand Down Expand Up @@ -584,20 +596,35 @@ for _case in "${_yaml_list[@]}"; do

if [[ "${_use_scron}" == true ]]; then
{
grep "^#.*${_pslot}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab"
grep "^####" "${cron_file}"
grep "^#SCRON" "${cron_file}"
grep "${scron_sh_file}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab"
grep "${scron_sh_file}" "${cron_file}"
} >> tests.cron
else
grep "${_pslot}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab" >> tests.cron
fi
done
echo

# Add MAILTO to tests.cron for regular crontab
if [[ "${_use_scron}" == false ]]; then
if [[ "${_set_email}" == "true" ]]; then
# Use email from -e flag
sed -i "1i MAILTO=\"${_email}\"" tests.cron
elif [[ -n "${REPLYTO:-}" ]]; then
# Use REPLYTO environment variable
sed -i "1i MAILTO=\"${REPLYTO}\"" tests.cron
else
# Use empty MAILTO
sed -i "1i MAILTO=\"\"" tests.cron
fi
fi

# Update the cron
if [[ "${_update_cron}" == "true" ]]; then
printf "Updating the existing crontab\n\n"
echo
mail_warning
rm -f existing.cron final.cron "${_verbose_flag}"
touch existing.cron final.cron

Expand All @@ -610,16 +637,42 @@ if [[ "${_update_cron}" == "true" ]]; then
echo "#######################"
fi

# Save existing MAILTO before removing it
existing_mailto=$(grep "^MAILTO=" existing.cron 2> /dev/null | head -1 || echo "")

# Remove ALL MAILTO lines from existing.cron and tests.cron to prevent duplicates
sed -i '/^MAILTO=/d' existing.cron 2> /dev/null || true
sed -i '/^MAILTO=/d' tests.cron 2> /dev/null || true

if [[ "${_set_email}" == "true" ]]; then
# Replace the existing email in the crontab
# For scrontab, REPLYTO is already exported earlier; for crontab, set MAILTO
if [[ "${_verbose}" == "true" ]]; then
printf "Updating crontab/scrontab email to %s\n\n" "${_email}"
fi

if [[ "${_use_scron}" == true ]]; then
sed -i "s/.*--mail-user.*/#SCRON --mail-user=\"${_email}\"/" tests.cron
else
sed -i "s/^MAILTO.*/MAILTO=\"${_email}\"/" existing.cron
if [[ "${_use_scron}" == false ]]; then
# For regular crontab, set MAILTO at the top of final.cron
echo "MAILTO=\"${_email}\"" > final.cron
fi
else
# Preserve existing MAILTO if present with non-empty value (only for regular crontab)
if [[ "${_use_scron}" == false ]]; then
# Check if there was a MAILTO with a non-empty value in the original crontab
# Extract the email value from MAILTO="email" or MAILTO=email
if [[ -n "${existing_mailto}" ]]; then
# Extract email value between quotes or after =
existing_email=$(echo "${existing_mailto}" | sed -n 's/^MAILTO=["'\'']*\([^"'\'']*\)["'\'']*$/\1/p')
else
existing_email=""
fi

if [[ -n "${existing_email}" ]]; then
echo "${existing_mailto}" > final.cron
elif [[ -n "${REPLYTO:-}" ]]; then
echo "MAILTO=\"${REPLYTO}\"" > final.cron
else
echo "MAILTO=\"\"" > final.cron
fi
fi
fi

Expand All @@ -634,10 +687,12 @@ if [[ "${_update_cron}" == "true" ]]; then

${_crontab_cmd} final.cron
else
mail_warning
_message="Add the following to your crontab or scrontab to start running:"
_cron_tests=$(cat tests.cron)
_message="${_message}"$'\n'"${_cron_tests}"
echo "${_message}"
echo
if [[ "${_set_email}" == true ]]; then
final_message="${final_message:-}"$'\n'"${_message}"
fi
Expand Down
73 changes: 73 additions & 0 deletions dev/workflow/rocoto/rocoto_scron.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#! /usr/bin/env bash
# Wrapper script run on each scheduled invocation: it advances the Rocoto
# workflow with rocotorun, then uses rocotostat to look for tasks in the DEAD
# state and emails a report for failures that were not reported on a previous
# invocation.
source "{{ HOMEgfs }}/dev/ush/gw_setup.sh"

# Run rocotorun
bash -c "{{ rocotorunstr }}"

# Monitor for failed jobs using rocotostat.
# LOCKFILE stores the DEAD lines seen on the previous invocation so that each
# failure is only emailed once.
LOCKFILE="{{ expdir }}/.failed_jobs.lock"
# "|| true": a missing rocotostat is handled by the -n test below and must not
# abort the script if the sourced setup enabled errexit.
ROCOTOSTAT=$(command -v rocotostat || true)

if [[ -n "${ROCOTOSTAT}" ]]; then
    # grep returns non-zero when no line matches; "|| true" keeps the
    # "no failures" case from being treated as an error.
    FAILED_JOBS=$("${ROCOTOSTAT}" -d "{{ expdir }}/{{ pslot }}.db" -w "{{ expdir }}/{{ pslot }}.xml" -c all 2> /dev/null | grep -E 'DEAD' || true)

    if [[ -n "${FAILED_JOBS}" ]]; then
        # Read previously reported failures
        PREV_FAILED=""
        if [[ -f "${LOCKFILE}" ]]; then
            PREV_FAILED=$(cat "${LOCKFILE}")
        fi

        # Check for NEW failures only (not just changes).
        # -x -F: a failure counts as already reported only when the whole
        # rocotostat line matches a stored line exactly.
        NEW_FAILURES=""
        while IFS= read -r job; do
            if [[ -n "${job}" ]] && ! grep -qxF -- "${job}" <<< "${PREV_FAILED}"; then
                NEW_FAILURES="${NEW_FAILURES}${job}"$'\n'
            fi
        done <<< "${FAILED_JOBS}"

        # Send email only if there are NEW failures and a usable recipient.
        # The template renders an unset address as the literal string "None";
        # an empty address is rejected as well so mail is never called without
        # a recipient.
        EMAIL="{{ replyto }}"
        if [[ -n "${NEW_FAILURES}" && -n "${EMAIL}" && "${EMAIL}" != "None" ]] && command -v mail &> /dev/null; then
            # mktemp creates a uniquely named, private file instead of a
            # predictable PID-based name in a shared directory.
            MSGFILE=$(mktemp "${TMPDIR:-/tmp}/rocoto_fail_msg.XXXXXX")
            # One timestamp for the whole report
            timestamp=$(date -u '+%m/%d/%y %H:%M:%S UTC')
            {
                echo "The following jobs have failed in experiment {{ pslot }} on ${MACHINE_ID}:"
                echo ""

                # Format each failed job with detailed information
                while IFS= read -r line; do
                    if [[ -n "${line}" ]]; then
                        # Split the rocotostat line into whitespace-separated fields.
                        # NOTE(review): rocotostat's header is believed to be
                        # CYCLE TASK JOBID STATE EXIT-STATUS TRIES DURATION, in which
                        # case "try" holds the exit status and "maxtries" the number
                        # of tries -- confirm against rocotostat output.
                        read -r cycle task jobid state try maxtries duration <<< "${line}"
                        # Extract YYYYMMDDHH from cycle (first 10 characters)
                        cycle_short=${cycle:0:10}

                        echo "${timestamp} :: {{ pslot }}.xml :: Cycle ${cycle}, Task ${task}, jobid=${jobid}, in state ${state}, ran for ${duration} seconds, try=${try} (of ${maxtries})"
                        echo "Check log: {{ comroot }}/{{ pslot }}/logs/${cycle_short}/${task}.log"
                        echo ""
                    fi
                done <<< "${NEW_FAILURES}"
            } > "${MSGFILE}"

            # Fall back to "localhost" when the host reports no DNS domain so
            # the sender address is never left as a bare "no-reply@".
            hostname_domain=$(hostname -d 2> /dev/null || true)
            FROM_EMAIL="no-reply@${hostname_domain:-localhost}"
            # On Gaea, the mail utility requires the -v (verbose) flag to ensure delivery.
            # To avoid receiving verbose output as an actual email, a spoofed 'from' address is used for notifications.
            mail -r "${FROM_EMAIL}" -v -s "[{{ pslot }}] Workflow Job Failures Detected" "${EMAIL}" < "${MSGFILE}" 2>&1

            rm -f "${MSGFILE}"
        fi

        # Always update lockfile to reflect current failures
        echo "${FAILED_JOBS}" > "${LOCKFILE}"
    else
        # No failures: drop any stale lockfile. rm -f succeeds whether or not
        # the file exists, so the script does not end with a non-zero status
        # on a healthy run.
        rm -f "${LOCKFILE}"
    fi
fi
52 changes: 38 additions & 14 deletions dev/workflow/rocoto/rocoto_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from applications.applications import AppConfig
from workflow_suite import WorkflowSuite
from rocoto.workflow_tasks import get_wf_tasks
from wxflow import which, mkdir
from wxflow import which, mkdir, parse_j2tmpl
import rocoto.rocoto as rocoto
from abc import ABC, abstractmethod
from logging import getLogger
Expand Down Expand Up @@ -123,6 +123,36 @@ def write(self, xml_file: str = None, crontab_file: str = None):
if self._base["DO_ARCHCOM"] and self._base["ARCHCOM_TO"] == "globus_hpss":
self._write_server_crontab()

def _get_scron_script_content(self, rocotorunstr: str, replyto: str) -> str:
    """
    Render the ``rocoto_scron.sh.j2`` template located next to this module.

    Parameters
    ----------
    rocotorunstr : str
        The rocotorun command string, passed to the template as
        ``rocotorunstr``
    replyto : str
        Notification email address, passed to the template as ``replyto``

    Returns
    -------
    str
        The rendered script text returned by ``parse_j2tmpl``
    """
    module_dir = os.path.dirname(__file__)
    template_path = os.path.join(module_dir, 'rocoto_scron.sh.j2')

    # Values substituted into the template; the keys are the placeholder
    # names the template refers to.
    render_vars = dict(
        HOMEgfs=self.HOMEgfs,
        rocotorunstr=rocotorunstr,
        expdir=self.expdir,
        pslot=self.pslot,
        replyto=replyto,
        comroot=self._base.get('COMROOT'),
    )
    return parse_j2tmpl(template_path, render_vars)

def _write_xml(self, xml_file: str = None) -> None:

if xml_file is None:
Expand All @@ -147,8 +177,7 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:
rocotorunstr = f'{rocotoruncmd} -d {self.expdir}/{self.pslot}.db -w {self.expdir}/{self.pslot}.xml'
cronintstr = f'*/{cronint} * * * *'

replyto = os.environ.get('REPLYTO', "")

replyto = os.environ.get('REPLYTO', None)
crontab_strings = [
'',
f'#################### {self.pslot} ####################'
Expand All @@ -169,28 +198,23 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:
crontab_strings.extend([
f'#SCRON --partition={partition}',
f'#SCRON --account={account}',
f'#SCRON --mail-user={replyto}',
f'#SCRON --job-name={self.pslot}_scron',
f'#SCRON --output={self.expdir}/logs/scron.log',
'#SCRON --time=00:10:00',
'#SCRON --dependency=singleton'
f'#SCRON --time=00:10:00',
f'#SCRON --dependency=singleton'
])

# Now write the script that actually runs rocotorun
# Now write the script that actually runs rocotorun and monitors for failures
cron_cmd = f"{self.expdir}/{self.pslot}.scron.sh"
with open(cron_cmd, "w") as script_fh:
script_fh.write(
"#!/usr/bin/env bash\n" +
"set -x\n" +
f"source {self.HOMEgfs}/dev/ush/gw_setup.sh" + "\n" +
rocotorunstr + "\n"
)
script_fh.write(self._get_scron_script_content(rocotorunstr, replyto))

# Make the script executable
mode = os.stat(cron_cmd)
os.chmod(cron_cmd, mode.st_mode | stat.S_IEXEC)
else:
cron_cmd = rocotorunstr
# For regular crontab, create a wrapper script with monitoring
cron_cmd = f"{self.expdir}/{self.pslot}.cron.sh"
crontab_strings.extend([
'SHELL="/bin/bash"',
f'MAILTO="{replyto}"'
Expand Down
Loading