Skip to content
Merged
Show file tree
Hide file tree
Changes from 84 commits
Commits
Show all changes
99 commits
Select commit Hold shift + click to select a range
b81e3ea
Add email notification system for scrontab-launched Rocoto workflows
AntonMFernando-NOAA Jan 7, 2026
bb35450
update rocoto scripts
AntonMFernando-NOAA Jan 13, 2026
227ece7
Merge changes into feature/scrontab branch
AntonMFernando-NOAA Jan 13, 2026
0275f6c
Revert "Add email notification system for scrontab-launched Rocoto wo…
AntonMFernando-NOAA Jan 13, 2026
573c451
update rocoto
AntonMFernando-NOAA Jan 14, 2026
51f6744
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 14, 2026
6a46e53
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 15, 2026
79dac8a
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 15, 2026
7dcfe25
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 15, 2026
7f16b5b
updated for a test
AntonMFernando-NOAA Jan 16, 2026
c3cf594
update dev/workflow/rocoto/rocoto_cron_template.sh
AntonMFernando-NOAA Jan 16, 2026
54f373f
update name
AntonMFernando-NOAA Jan 16, 2026
26bafaa
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 16, 2026
d21db79
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 16, 2026
1cee746
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 16, 2026
07a9f08
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
e9a8772
remove typo
AntonMFernando-NOAA Jan 20, 2026
31d8ae5
update ufs submodule
AntonMFernando-NOAA Jan 20, 2026
26b5946
update ufs_utils submodule
AntonMFernando-NOAA Jan 20, 2026
2d1bf18
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
22c3725
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
262178c
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
f53c449
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
c9f2745
Merge branch 'feature/scrontab' of https://github.com/AntonMFernando-…
AntonMFernando-NOAA Jan 20, 2026
0907055
shelcheck error fix
AntonMFernando-NOAA Jan 20, 2026
e0a5db9
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
d34feaa
update dev/workflow/rocoto/rocoto_scron_template.sh
AntonMFernando-NOAA Jan 20, 2026
966f0a2
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
1556bc5
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 20, 2026
a4d2024
shellcheck errors
AntonMFernando-NOAA Jan 20, 2026
abf88c8
update to fix shellcheck errors
AntonMFernando-NOAA Jan 20, 2026
4ab3636
update bash
AntonMFernando-NOAA Jan 20, 2026
bb65399
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 20, 2026
68af080
submodule update
AntonMFernando-NOAA Jan 20, 2026
6f010af
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 21, 2026
ad48031
added template
AntonMFernando-NOAA Jan 21, 2026
e1aab1f
Merge branch 'develop' into feature/scrontab
DavidHuber-NOAA Jan 22, 2026
325e44c
Update dev/workflow/rocoto/rocoto_scron.sh.j2
AntonMFernando-NOAA Jan 22, 2026
b12b1f4
added email settings
AntonMFernando-NOAA Jan 23, 2026
bd5eabb
removed typo
AntonMFernando-NOAA Jan 23, 2026
3a5c7e3
Update submodules to develop branch hashes
AntonMFernando-NOAA Jan 23, 2026
8fb44f9
Align submodules with upstream develop branch hashes
AntonMFernando-NOAA Jan 23, 2026
90b082c
cleaning
AntonMFernando-NOAA Jan 23, 2026
f11e95b
get REPLYTO from env variables
AntonMFernando-NOAA Jan 23, 2026
9d7a644
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 23, 2026
219ba1f
typos
AntonMFernando-NOAA Jan 23, 2026
e7badd0
bug
AntonMFernando-NOAA Jan 23, 2026
2d9aec5
change comment
AntonMFernando-NOAA Jan 23, 2026
447002b
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 26, 2026
8a626f4
added a message
AntonMFernando-NOAA Jan 26, 2026
ba292d9
add only to scrontab
AntonMFernando-NOAA Jan 26, 2026
49df8de
add scrontab read option
AntonMFernando-NOAA Jan 26, 2026
2ecdaac
update generate_workflow.sh
AntonMFernando-NOAA Jan 26, 2026
55d8149
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
f304a12
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
d13bcfe
update scripts
AntonMFernando-NOAA Jan 26, 2026
db30ff2
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
468f9e7
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
5b87973
updated replyto>mailto
AntonMFernando-NOAA Jan 26, 2026
0e7d161
Fix crontab MAILTO handling and conditional message display
AntonMFernando-NOAA Jan 26, 2026
158d7df
Add MAILTO to tests.cron regardless of -c flag usage
AntonMFernando-NOAA Jan 26, 2026
becde3e
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
a518d80
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
5577920
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
9dddf3a
Suppress MAILTO message when email already exists in crontab
AntonMFernando-NOAA Jan 26, 2026
0ed8c56
Fix MAILTO detection to match any MAILTO format
AntonMFernando-NOAA Jan 26, 2026
3b2ab13
update generate_workflows
AntonMFernando-NOAA Jan 26, 2026
9466f15
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 26, 2026
11992ab
typo
AntonMFernando-NOAA Jan 26, 2026
efcc25d
Format MAILTO line with consistent 65-character width
AntonMFernando-NOAA Jan 26, 2026
e7eb283
typo
AntonMFernando-NOAA Jan 27, 2026
5851901
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
7f8948a
typo
AntonMFernando-NOAA Jan 27, 2026
016b3c4
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
ecf4324
typo
AntonMFernando-NOAA Jan 27, 2026
e2dfd99
typos
AntonMFernando-NOAA Jan 27, 2026
a262543
shellcheck
AntonMFernando-NOAA Jan 27, 2026
ce44ca4
dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
dbf9d88
typo
AntonMFernando-NOAA Jan 27, 2026
84f56fe
update dev/workflow/rocoto/rocoto_xml.py
AntonMFernando-NOAA Jan 27, 2026
cd59295
typo
AntonMFernando-NOAA Jan 27, 2026
caa4b4e
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
17d038e
delete extra file
AntonMFernando-NOAA Jan 27, 2026
2835d6b
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
ad32fd2
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
85440a9
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 27, 2026
9c08d00
remove comment
AntonMFernando-NOAA Jan 27, 2026
7b5d085
remove unncessary functions
AntonMFernando-NOAA Jan 27, 2026
2bb573c
update with cron check
AntonMFernando-NOAA Jan 27, 2026
d84ae51
warning conditions changes
AntonMFernando-NOAA Jan 27, 2026
bf49fb5
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
ecb5bd6
Update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 27, 2026
d6aba95
update warning
AntonMFernando-NOAA Jan 27, 2026
777ae9c
remove all the additional stuff. back to normal.
AntonMFernando-NOAA Jan 28, 2026
8975476
update dev/workflow/generate_workflows.sh
AntonMFernando-NOAA Jan 28, 2026
daf15b4
update generate_workflow
AntonMFernando-NOAA Jan 28, 2026
d82b122
update parm/globus/init_xfer.sh.j2
AntonMFernando-NOAA Jan 28, 2026
3fdafff
Merge branch 'develop' into feature/scrontab
AntonMFernando-NOAA Jan 28, 2026
d10615d
Merge branch 'NOAA-EMC:develop' into feature/scrontab
AntonMFernando-NOAA Jan 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 68 additions & 16 deletions dev/workflow/generate_workflows.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ EOF
}

set -eu
shopt -s inherit_errexit

# Set default options
HOMEgfs=""
Expand Down Expand Up @@ -187,6 +188,11 @@ function send_email() {
echo "${_body}" | mail -s "${_subject}" "${_email}"
}

# Publish the address supplied with -e to child processes as MAILTO.
# Nothing is exported when -e was not used or the address is empty.
if [[ "${_set_email}" == "true" ]]; then
    if [[ -n "${_email}" ]]; then
        export MAILTO="${_email}"
    fi
fi

function delete_dir() {
local dir_to_rm="${1:-}"
if [[ -z "${dir_to_rm}" ]]; then
Expand Down Expand Up @@ -531,9 +537,9 @@ for _case in "${_yaml_list[@]}"; do
_pslot="${_case}${_tag}"
_create_exp_cmd="./create_experiment.py -y ${_yaml_dir}/${_case}.yaml --overwrite"
if [[ "${_verbose}" == true ]]; then
pslot=${_pslot} RUNTESTS=${_runtests} ${_create_exp_cmd}
pslot=${_pslot} RUNTESTS=${_runtests} MAILTO="${MAILTO:-}" ${_create_exp_cmd}
else
if ! pslot=${_pslot} RUNTESTS=${_runtests} ${_create_exp_cmd} 2> stderr 1> stdout; then
if ! pslot=${_pslot} RUNTESTS=${_runtests} MAILTO="${MAILTO:-}" ${_create_exp_cmd} 2> stderr 1> stdout; then
_output=$(cat stdout stderr)
_message="The create_experiment command (${_create_exp_cmd}) failed with a non-zero status. Output:"
_message="${_message}"$'\n'"${_output}"
Expand Down Expand Up @@ -584,43 +590,82 @@ for _case in "${_yaml_list[@]}"; do

if [[ "${_use_scron}" == true ]]; then
{
grep "^#.*${_pslot}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab"
# Skip MAILTO lines (will be added once at the top later)
grep "^####" "${cron_file}" | grep -v "MAILTO="
grep "^#SCRON" "${cron_file}"
grep "${scron_sh_file}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab"
grep "${scron_sh_file}" "${cron_file}"
} >> tests.cron
else
grep "${_pslot}" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab" >> tests.cron
fi
done
echo

#######################################
# Print a fixed-width crontab comment banner.
# With no text, a solid row of '#' is printed. With text, the text is
# centred between two runs of '#' and separated from them by one space,
# e.g. "##### MAILTO=user@host #####". Only the filler is made of '#';
# the text itself (including any spaces it contains) is printed verbatim.
# Arguments: $1 - text to centre (optional)
# Outputs:   one line on stdout. It is 65 characters wide unless the text
#            is too long to fit; then the line grows, but it still begins
#            and ends with '#' so it remains a comment line.
# Note:      shell options are not touched here; error handling is left to
#            the caller's settings.
#######################################
_format_crontab_comment_line() {
    local text="${1:-}"
    local -r total_length=65
    local left_fill right_fill

    if [[ -z "${text}" ]]; then
        # Build a run of spaces of the requested width, then turn it into '#'
        printf -v left_fill '%*s' "${total_length}" ''
        printf '%s\n' "${left_fill// /#}"
        return 0
    fi

    local text_with_spaces=" ${text} "
    local remaining=$((total_length - ${#text_with_spaces}))
    local left_padding=$((remaining / 2))
    local right_padding=$((remaining - left_padding))

    # Over-long text would give zero or negative widths; always keep one '#'
    # on each side so the result is still a valid comment line.
    if ((left_padding < 1)); then
        left_padding=1
    fi
    if ((right_padding < 1)); then
        right_padding=1
    fi

    printf -v left_fill '%*s' "${left_padding}" ''
    printf -v right_fill '%*s' "${right_padding}" ''
    printf '%s%s%s\n' "${left_fill// /#}" "${text_with_spaces}" "${right_fill// /#}"
}

# Put the address given with -e at the top of tests.cron
if [[ "${_set_email}" == "true" ]]; then
    # Drop any MAILTO entry already in tests.cron so only one remains;
    # sed's error output is silenced and its failure is ignored
    sed -i '/MAILTO=/d' tests.cron 2> /dev/null || true

    # Insert the new entry as line 1; its form depends on the scheduler
    case "${_use_scron}" in
        true)
            # scrontab: emit the address inside a formatted comment line
            mailto_formatted=$(_format_crontab_comment_line "MAILTO=${_email}")
            sed -i "1i ${mailto_formatted}" tests.cron
            ;;
        *)
            # regular crontab: emit a plain MAILTO directive
            sed -i "1i MAILTO=${_email}" tests.cron
            ;;
    esac
fi

# Update the cron
if [[ "${_update_cron}" == "true" ]]; then
printf "Updating the existing crontab\n\n"
echo
rm -f existing.cron final.cron "${_verbose_flag}"
touch existing.cron final.cron

echo
${_crontab_cmd} -l | grep -v "no crontab for" > existing.cron || true

# Show warning only if MAILTO is not set as env variable and not present in existing.cron
if [[ "${_set_email}" == "false" && -z "${MAILTO:-}" ]]; then
if ! grep -q "MAILTO=" existing.cron 2> /dev/null; then
echo -e "\033[0;33mWARNING:\033[0m Set \033[0;32mexport MAILTO=\"your_email\"\033[0m in your .bashrc or use generate_workflows.sh with \033[0;32m-e \"your_email\"\033[0m to receive job failure notifications."
fi
fi

if [[ "${_debug}" == "true" ]]; then
echo "Existing crontab: "
echo "#######################"
cat existing.cron
echo "#######################"
fi

if [[ "${_set_email}" == "true" ]]; then
# Replace the existing email in the crontab
if [[ "${_verbose}" == "true" ]]; then
printf "Updating crontab/scrontab email to %s\n\n" "${_email}"
fi
# Remove MAILTO lines from both existing.cron and tests.cron to prevent duplicates
sed -i '/MAILTO=/d' existing.cron 2> /dev/null || true
sed -i '/MAILTO=/d' tests.cron 2> /dev/null || true

if [[ "${_use_scron}" == true ]]; then
sed -i "s/.*--mail-user.*/#SCRON --mail-user=\"${_email}\"/" tests.cron
else
sed -i "s/^MAILTO.*/MAILTO=\"${_email}\"/" existing.cron
fi
# Extract MAILTO line from the crontab file and put at top of final.cron
mailto_line=$(grep "MAILTO=" "${_runtests}/EXPDIR/${_pslot}/${_pslot}.crontab" 2> /dev/null | head -1 || echo "")
if [[ -n "${mailto_line}" ]]; then
echo "${mailto_line}" > final.cron
fi

cat existing.cron tests.cron >> final.cron
Expand All @@ -634,10 +679,17 @@ if [[ "${_update_cron}" == "true" ]]; then

${_crontab_cmd} final.cron
else
# Show warning only if MAILTO is not set as env variable and not present in tests.cron
if [[ "${_set_email}" == "false" && -z "${MAILTO:-}" ]]; then
if ! grep -q "MAILTO=" tests.cron 2> /dev/null; then
echo -e "\033[0;33mWARNING:\033[0m Set \033[0;32mexport MAILTO=\"your_email\"\033[0m in your .bashrc or use generate_workflows.sh with \033[0;32m-e \"your_email\"\033[0m to receive job failure notifications."
fi
fi
_message="Add the following to your crontab or scrontab to start running:"
_cron_tests=$(cat tests.cron)
_message="${_message}"$'\n'"${_cron_tests}"
echo "${_message}"
echo
if [[ "${_set_email}" == true ]]; then
final_message="${final_message:-}"$'\n'"${_message}"
fi
Expand Down
73 changes: 73 additions & 0 deletions dev/workflow/rocoto/rocoto_scron.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#! /usr/bin/env bash
# Scron driver for one Rocoto experiment (rendered from a Jinja2 template).
#
# Each invocation:
#   1. sources gw_setup.sh and runs the rendered rocotorun command;
#   2. asks rocotostat for every task line that contains DEAD;
#   3. e-mails a report covering only the DEAD lines that were not already
#      recorded by the previous invocation;
#   4. stores the current DEAD lines in a lock file, or removes the lock file
#      when nothing is DEAD, so the next invocation can tell old failures
#      from new ones.
source "{{ HOMEgfs }}/dev/ush/gw_setup.sh"

# Run rocotorun
bash -c "{{ rocotorunstr }}"

# Monitor for failed jobs using rocotostat
LOCKFILE="{{ expdir }}/.failed_jobs.lock"
# command -v exits non-zero when rocotostat is not on PATH; treat that as
# "monitoring unavailable" rather than as an error.
ROCOTOSTAT=$(command -v rocotostat) || ROCOTOSTAT=""

if [[ -n "${ROCOTOSTAT}" ]]; then
    # grep exits 1 when no line contains DEAD, which is the normal healthy
    # case, so its status is deliberately discarded here.
    FAILED_JOBS=$("${ROCOTOSTAT}" -d "{{ expdir }}/{{ pslot }}.db" -w "{{ expdir }}/{{ pslot }}.xml" -c all 2> /dev/null | grep -E 'DEAD' || true)

    if [[ -n "${FAILED_JOBS}" ]]; then
        # Read previously reported failures
        PREV_FAILED=""
        if [[ -f "${LOCKFILE}" ]]; then
            PREV_FAILED=$(< "${LOCKFILE}")
        fi

        # Check for NEW failures only (not just changes): a line is new when
        # no identical line was recorded by the previous run.
        NEW_FAILURES=""
        while IFS= read -r job; do
            if [[ -n "${job}" ]] && ! grep -qxF -- "${job}" <<< "${PREV_FAILED}"; then
                NEW_FAILURES+="${job}"$'\n'
            fi
        done <<< "${FAILED_JOBS}"

        # Send email only if there are NEW failures
        if [[ -n "${NEW_FAILURES}" ]]; then
            # Unpredictable name instead of a PID-based path in a shared directory
            MSGFILE=$(mktemp "${TMPDIR:-/tmp}/rocoto_fail_msg_XXXXXX") || {
                echo "ERROR: unable to create a temporary file for the failure report" >&2
                exit 1
            }
            # One timestamp for the whole report
            timestamp=$(date -u '+%m/%d/%y %H:%M:%S UTC')
            {
                # NOTE(review): MACHINE_ID is not set in this script; presumably
                # gw_setup.sh provides it - confirm.
                echo "The following jobs have failed in experiment {{ pslot }} on ${MACHINE_ID}:"
                echo ""

                # Format each failed job with detailed information
                while IFS= read -r line; do
                    if [[ -n "${line}" ]]; then
                        # Parse rocotostat output: Cycle Task JobID State Try MaxTries Duration
                        read -r cycle task jobid state try maxtries duration <<< "${line}"
                        # Extract YYYYMMDDHH from cycle (first 10 characters)
                        cycle_short=${cycle:0:10}

                        # Format similar to user's example
                        echo "${timestamp} :: {{ pslot }}.xml :: Cycle ${cycle}, Task ${task}, jobid=${jobid}, in state ${state}, ran for ${duration} seconds, try=${try} (of ${maxtries})"
                        echo "Check log: {{ comroot }}/{{ pslot }}/logs/${cycle_short}/${task}.log"
                        echo ""
                    fi
                done <<< "${NEW_FAILURES}"
            } > "${MSGFILE}"

            # Try to send email
            EMAIL="{{ mailto }}"
            # Fall back to the plain host name when no DNS domain is reported,
            # so the sender address never ends in a bare "@".
            hostname_domain=$(hostname -d 2> /dev/null) || hostname_domain=""
            if [[ -z "${hostname_domain}" ]]; then
                hostname_domain=$(hostname)
            fi
            FROM_EMAIL="no-reply@${hostname_domain}"
            if [[ -n "${EMAIL}" && "${EMAIL}" != "None" ]] && command -v mail &> /dev/null; then
                # On Gaea, the mail utility requires the -v (verbose) flag to ensure delivery.
                # To avoid receiving verbose output as an actual email, a spoofed 'from' address is used for notifications.
                mail -r "${FROM_EMAIL}" -v -s "[{{ pslot }}] Workflow Job Failures Detected" "${EMAIL}" < "${MSGFILE}" 2>&1
            fi

            rm -f -- "${MSGFILE}"
        fi

        # Always update lockfile to reflect current failures
        echo "${FAILED_JOBS}" > "${LOCKFILE}"
    else
        # No failures: drop the lock file. rm -f succeeds even when the file is
        # absent, so the script ends with status 0 in the healthy case.
        rm -f -- "${LOCKFILE}"
    fi
fi
Loading
Loading