From 8ea47711743d33ee998672b6b59eb3d98f697877 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Tue, 23 Dec 2025 13:07:40 -0500 Subject: [PATCH 01/19] add bit of code for building on head node --- sorc/build_compute.sh | 104 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index dc70c3f2c6b..3fa5b05b200 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -94,6 +94,110 @@ if [[ "${rc}" -ne 0 ]]; then exit 1 fi +# grep for tags in the build.xml and collect the commands in an array +mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") +# get the corresponding log file names from the build.xml in an array +mapfile -t logs < <(grep -oP '(?<=).*(?=)' "${build_xml}") +# get the number of build jobs each command corresponds to in an array. The build jobs are the strings -j N in each command. +mapfile -t cores < <(echo "${commands[@]}" | grep -oP '(?<=-j )\d+') +# create an array of build names from the log file names (by obtaining the basename and stripping the .log extension) +mapfile -t names < <(printf "%s\n" "${logs[@]}" | xargs -n1 basename | sed 's/\.log$//') + +echo "The following build commands will be executed with the corresponding logs:" +declare -A build_names build_status build_commands build_logs build_cores build_pids +for i in "${!names[@]}"; do + + name="${names[i]}" + + build_names["${name}"]="${name}" + build_status["${name}"]="pending" + build_commands["${name}"]="${commands[i]}" + build_logs["${name}"]="${logs[i]}" + build_cores["${name}"]="${cores[i]}" + build_pids["${name}"]="" + + echo + echo "Name: ${build_names[${name}]}" + echo "Command: ${build_commands[${name}]}" + echo "Log: ${build_logs[${name}]}" + echo "Cores: ${build_cores[${name}]}" + echo "Status: ${build_status[${name}]}" +done + +max_cores=40 # Set the maximum number of cores to use for building + +# copy build_names into a new array to iterate over +builds_to_process=("${!build_names[@]}") + +builds_in_progress=true +current_cores=0 +while [[ ${builds_in_progress} == true ]]; do + + for name in "${!builds_to_process[@]}"; do + + if [[ ${build_status[${name}]} == "completed" ]]; then + continue + fi + + # Check if the build is still running + pid="${build_pids[${name}]}" + if [[ -z "${pid}" ]]; then # No pid means build not started yet + cores_needed="${build_cores[${name}]}" + if (( current_cores + cores_needed <= max_cores )); then + # Launch the build command in the background and redirect output to log file + command="${build_commands[${name}]}" + log_file="${build_logs[${name}]}" + echo "Launching build command: ${command} > ${log_file} 2>&1" + bash -c "${command} > ${log_file} 2>&1 &" + pid=$! + build_pids["${name}"]="${pid}" + build_status["${name}"]="building" + current_cores=$((current_cores + cores_needed)) + else + # Not enough cores available, skip to next build + continue + fi + + else + + if ! ps -p "${pid}" > /dev/null 2>&1; then + # Build has finished + wait "${pid}" + rc=$? + if [[ "${rc}" -ne 0 ]]; then + echo "BUILD ERROR: Build command '${build_commands[${name}]}' failed with exit code ${rc}." + echo "See log file: ${build_logs[${name}]}.log" + build_status["${name}"]="failed" + else + echo "BUILD SUCCESS: Build command '${build_commands[${name}]}' completed successfully." + build_status["${name}"]="completed" + current_cores=$((current_cores - build_cores[${name}])) + fi + fi + + fi + + # If the build failed, exit immediately + if [[ ${build_status[${name}]} == "failed" ]]; then + exit 1 + fi + + done + + # Remove completed builds from the list to process + builds_to_process=() + builds_in_progress=false + for name in "${!build_names[@]}"; do + if [[ ${build_status[${name}]} != "completed" ]]; then + builds_to_process+=("${name}") + builds_in_progress=true + fi + done + + sleep 10s + +done + echo "Launching builds in parallel on compute nodes ..." runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" From 7f4f6662b864f80d1c16ef4ed3225f4468f1e603 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Tue, 23 Dec 2025 14:35:05 -0500 Subject: [PATCH 02/19] make building on compute an option --- sorc/build_compute.sh | 288 +++++++++++++++++++++++------------------- 1 file changed, 157 insertions(+), 131 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 3fa5b05b200..a8a7e2bc70f 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -11,6 +11,7 @@ Usage: ${BASH_SOURCE[0]} [-h][-v] -A HPC_ACCOUNT [gfs gefs sfs gcafs gsi gdas al Verbose mode -A: HPC account to use for the compute-node builds [REQUIRED] + -c Build on compute nodes (default is NO) Input arguments are the system(s) to build. Valid options are @@ -30,12 +31,14 @@ build_xml="build.xml" build_db="build.db" build_lock_db="build_lock.db" HPC_ACCOUNT="UNDEFINED" +build_on_compute="NO" OPTIND=1 -while getopts ":hA:v" option; do +while getopts ":hA:vc" option; do case "${option}" in h) _usage ;; A) HPC_ACCOUNT="${OPTARG}" ;; + c) build_on_compute="YES" ;; v) verbose="YES" && rocoto_verbose_opt="-v10" ;; :) echo "[${BASH_SOURCE[0]}]: ${option} requires an argument" @@ -94,164 +97,187 @@ if [[ "${rc}" -ne 0 ]]; then exit 1 fi -# grep for tags in the build.xml and collect the commands in an array -mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") -# get the corresponding log file names from the build.xml in an array -mapfile -t logs < <(grep -oP '(?<=).*(?=)' "${build_xml}") -# get the number of build jobs each command corresponds to in an array. The build jobs are the strings -j N in each command. -mapfile -t cores < <(echo "${commands[@]}" | grep -oP '(?<=-j )\d+') -# create an array of build names from the log file names (by obtaining the basename and stripping the .log extension) -mapfile -t names < <(printf "%s\n" "${logs[@]}" | xargs -n1 basename | sed 's/\.log$//') - -echo "The following build commands will be executed with the corresponding logs:" -declare -A build_names build_status build_commands build_logs build_cores build_pids -for i in "${!names[@]}"; do - - name="${names[i]}" - - build_names["${name}"]="${name}" - build_status["${name}"]="pending" - build_commands["${name}"]="${commands[i]}" - build_logs["${name}"]="${logs[i]}" - build_cores["${name}"]="${cores[i]}" - build_pids["${name}"]="" - - echo - echo "Name: ${build_names[${name}]}" - echo "Command: ${build_commands[${name}]}" - echo "Log: ${build_logs[${name}]}" - echo "Cores: ${build_cores[${name}]}" - echo "Status: ${build_status[${name}]}" -done +if [[ "${build_on_compute}" != "YES" ]]; then -max_cores=40 # Set the maximum number of cores to use for building + echo "Building on head node as requested ..." -# copy build_names into a new array to iterate over -builds_to_process=("${!build_names[@]}") + # grep for tags in the build.xml and collect the commands in an array + mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") + # get the corresponding log file names from the build.xml in an array + mapfile -t logs < <(grep -oP '(?<=).*(?=)' "${build_xml}") + # get the number of build jobs each command corresponds to in an array. The build jobs are the strings -j N in each command. + mapfile -t cores < <(echo "${commands[@]}" | grep -oP '(?<=-j )\d+') + # create an array of build names from the log file names (by obtaining the basename and stripping the .log extension) + mapfile -t names < <(printf "%s\n" "${logs[@]}" | xargs -n1 basename | sed 's/\.log$//') -builds_in_progress=true -current_cores=0 -while [[ ${builds_in_progress} == true ]]; do + # Initialize associative arrays to track build status + declare -A build_names build_status build_commands build_logs build_cores build_pids + for i in "${!names[@]}"; do - for name in "${!builds_to_process[@]}"; do + name="${names[i]}" - if [[ ${build_status[${name}]} == "completed" ]]; then - continue - fi + build_names["${name}"]="${name}" + build_commands["${name}"]="${commands[i]}" + build_logs["${name}"]="${logs[i]}" + build_cores["${name}"]="${cores[i]}" + build_status["${name}"]="pending" + build_pids["${name}"]="" - # Check if the build is still running - pid="${build_pids[${name}]}" - if [[ -z "${pid}" ]]; then # No pid means build not started yet - cores_needed="${build_cores[${name}]}" - if (( current_cores + cores_needed <= max_cores )); then - # Launch the build command in the background and redirect output to log file - command="${build_commands[${name}]}" - log_file="${build_logs[${name}]}" - echo "Launching build command: ${command} > ${log_file} 2>&1" - bash -c "${command} > ${log_file} 2>&1 &" - pid=$! - build_pids["${name}"]="${pid}" - build_status["${name}"]="building" - current_cores=$((current_cores + cores_needed)) - else - # Not enough cores available, skip to next build + done + unset commands logs cores names + + # copy build_names into a new array to iterate over + builds_to_process=("${!build_names[@]}") + + declare -r max_cores=40 + current_cores=0 + builds_in_progress=true + while [[ ${builds_in_progress} == true ]]; do + + for name in "${!builds_to_process[@]}"; do + + # If the build is already completed, skip it + if [[ ${build_status[${name}]} == "completed" ]]; then continue fi - else - - if ! ps -p "${pid}" > /dev/null 2>&1; then - # Build has finished - wait "${pid}" - rc=$? - if [[ "${rc}" -ne 0 ]]; then - echo "BUILD ERROR: Build command '${build_commands[${name}]}' failed with exit code ${rc}." - echo "See log file: ${build_logs[${name}]}.log" - build_status["${name}"]="failed" + # Check if the build is still running + pid="${build_pids[${name}]}" + if [[ -z "${pid}" ]]; then # No pid means build not started yet + cores_needed="${build_cores[${name}]}" + if (( current_cores + cores_needed <= max_cores )); then + # Launch the build command in the background and redirect output to log file + command="${build_commands[${name}]}" + log_file="${build_logs[${name}]}" + echo "Launching build command: ${command} > ${log_file} 2>&1" + bash -c "${command} > ${log_file} 2>&1 &" + pid=$! + build_pids["${name}"]="${pid}" + build_status["${name}"]="building" + current_cores=$((current_cores + cores_needed)) else - echo "BUILD SUCCESS: Build command '${build_commands[${name}]}' completed successfully." - build_status["${name}"]="completed" + # Not enough cores available, skip to next build + continue + fi + + else + + if ! ps -p "${pid}" > /dev/null 2>&1; then + # Build has finished + wait "${pid}" + rc=$? + if [[ "${rc}" -ne 0 ]]; then + echo "BUILD ERROR: Build command '${build_commands[${name}]}' failed with exit code ${rc}." + echo "See log file: ${build_logs[${name}]}.log" + build_status["${name}"]="failed" + else + echo "BUILD SUCCESS: Build command '${build_commands[${name}]}' completed successfully." + build_status["${name}"]="completed" + fi + # Free up the cores used by this build (regardless of success or failure) current_cores=$((current_cores - build_cores[${name}])) fi + fi - fi + # If the build failed, exit immediately + if [[ ${build_status[${name}]} == "failed" ]]; then + exit 1 + fi + + done - # If the build failed, exit immediately - if [[ ${build_status[${name}]} == "failed" ]]; then + # Check for any failed builds, and abort all if any found + abort_all_builds=false + for name in "${!build_names[@]}"; do + if [[ ${build_status[${name}]} == "failed" ]]; then + echo "Detected failed build: ${name}" + abort_all_builds=true + fi + done + if [[ ${abort_all_builds} == true ]]; then + echo "FATAL ERROR: One or more builds failed. Aborting remaining builds." + # Terminate all running build processes + for pid in "${build_pids[@]}"; do + if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists + kill "${pid}" + fi + done exit 1 fi - done + # Remove completed builds from the list to process during the next iteration + builds_to_process=() + builds_in_progress=false + for name in "${!build_names[@]}"; do + if [[ ${build_status[${name}]} != "completed" ]]; then + builds_to_process+=("${name}") + builds_in_progress=true + fi + done + + sleep 30s - # Remove completed builds from the list to process - builds_to_process=() - builds_in_progress=false - for name in "${!build_names[@]}"; do - if [[ ${build_status[${name}]} != "completed" ]]; then - builds_to_process+=("${name}") - builds_in_progress=true - fi done - sleep 10s +else -done + echo "Building on compute nodes as requested ..." + runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" -echo "Launching builds in parallel on compute nodes ..." -runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" + finished=false + ${runcmd} + rc=$? + if [[ "${rc}" -ne 0 ]]; then + echo "FATAL ERROR: ${BASH_SOURCE[0]} failed to run rocoto on the first attempt!" + exit 1 + fi -finished=false -${runcmd} -rc=$? -if [[ "${rc}" -ne 0 ]]; then - echo "FATAL ERROR: ${BASH_SOURCE[0]} failed to run rocoto on the first attempt!" - exit 1 -fi + echo "Monitoring builds on compute nodes" + while [[ "${finished}" == "false" ]]; do + sleep 1m + ${runcmd} -echo "Monitoring builds on compute nodes" -while [[ "${finished}" == "false" ]]; do - sleep 1m - ${runcmd} + state="$("${HOMEgfs}/dev/ci/scripts/utils/rocotostat.py" -w "${build_xml}" -d "${build_db}")" || true + if [[ "${verbose_opt}" == "true" ]]; then + echo "Rocoto is in state ${state}" + else + echo -n "." + fi - state="$("${HOMEgfs}/dev/ci/scripts/utils/rocotostat.py" -w "${build_xml}" -d "${build_db}")" || true - if [[ "${verbose_opt}" == "true" ]]; then - echo "Rocoto is in state ${state}" - else - echo -n "." - fi + if [[ "${state}" == "DONE" ]]; then + finished=true + elif [[ "${state}" == "RUNNING" ]]; then + finished=false + else + msg="FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" + echo "${msg}" + err_file="${PWD}/logs/error.logs" + rm -f "${err_file}" + # Determine which build(s) failed + stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" + echo "${stat_out}" > rocotostat.out + line_number=0 + while read -r line; do + ((line_number += 1)) + # Skip the first two lines (header) + if [[ ${line_number} -lt 3 ]]; then + continue + fi - if [[ "${state}" == "DONE" ]]; then - finished=true - elif [[ "${state}" == "RUNNING" ]]; then - finished=false - else - msg="FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" - echo "${msg}" - err_file="${PWD}/logs/error.logs" - rm -f "${err_file}" - # Determine which build(s) failed - stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" - echo "${stat_out}" > rocotostat.out - line_number=0 - while read -r line; do - ((line_number += 1)) - # Skip the first two lines (header) - if [[ ${line_number} -lt 3 ]]; then - continue - fi + if [[ "${line}" =~ "DEAD" || "${line}" =~ "UNKNOWN" || + "${line}" =~ "UNAVAILABLE" || "${line}" =~ "FAIL" ]]; then + job=$(echo "${line}" | awk '{ print $2 }') + log_file="${PWD}/logs/${job}.log" + echo "${log_file}" >> "${err_file}" + echo "Rocoto reported that the build failed for ${job}" + fi + done < rocotostat.out + exit 1 + fi + done - if [[ "${line}" =~ "DEAD" || "${line}" =~ "UNKNOWN" || - "${line}" =~ "UNAVAILABLE" || "${line}" =~ "FAIL" ]]; then - job=$(echo "${line}" | awk '{ print $2 }') - log_file="${PWD}/logs/${job}.log" - echo "${log_file}" >> "${err_file}" - echo "Rocoto reported that the build failed for ${job}" - fi - done < rocotostat.out - exit 1 - fi -done +fi echo "All builds completed successfully!" From 250ac155e46f6b443a973ddc661ab23509b9693e Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Tue, 23 Dec 2025 15:37:59 -0500 Subject: [PATCH 03/19] keep default behavior of build_compute.sh --- sorc/build_compute.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index a8a7e2bc70f..6bef9aab12e 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -4,14 +4,14 @@ function _usage() { cat << EOF Builds all of the global-workflow components on compute nodes. -Usage: ${BASH_SOURCE[0]} [-h][-v] -A HPC_ACCOUNT [gfs gefs sfs gcafs gsi gdas all] +Usage: ${BASH_SOURCE[0]} [-h][-v] -A HPC_ACCOUNT -b [gfs gefs sfs gcafs gsi gdas all] -h: Print this help message and exit -v: Verbose mode -A: - HPC account to use for the compute-node builds [REQUIRED] - -c Build on compute nodes (default is NO) + HPC account to use for the compute-node builds [REQUIRED when building on compute nodes] + -b Build on login nodes (DEFAULT: NO) Input arguments are the system(s) to build. Valid options are @@ -31,14 +31,14 @@ build_xml="build.xml" build_db="build.db" build_lock_db="build_lock.db" HPC_ACCOUNT="UNDEFINED" -build_on_compute="NO" +build_on_compute="YES" OPTIND=1 -while getopts ":hA:vc" option; do +while getopts ":hA:vb" option; do case "${option}" in h) _usage ;; A) HPC_ACCOUNT="${OPTARG}" ;; - c) build_on_compute="YES" ;; + b) build_on_compute="NO" ;; v) verbose="YES" && rocoto_verbose_opt="-v10" ;; :) echo "[${BASH_SOURCE[0]}]: ${option} requires an argument" @@ -59,8 +59,8 @@ else systems=$* fi -if [[ "${HPC_ACCOUNT}" == "UNDEFINED" ]]; then - echo "FATAL ERROR: -A is required, ABORT!" +if [[ "${build_on_compute}" == "YES" && "${HPC_ACCOUNT}" == "UNDEFINED" ]]; then + echo "FATAL ERROR: -A is required when building on compute nodes, ABORT!" _usage fi From 733337a5e548854f7ca870db7ca0cb34184f93c4 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Tue, 6 Jan 2026 14:44:19 -0500 Subject: [PATCH 04/19] update build_compute.sh to fix errors --- sorc/build_compute.sh | 66 ++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index 6bef9aab12e..c1af1e4bb2e 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -85,11 +85,8 @@ mkdir -p "${HOMEgfs}/sorc/logs" || exit 1 # Delete the rocoto XML and database if they exist rm -f "${build_xml}" "${build_db}" "${build_lock_db}" +echo "Generating build.xml for building global-workflow programs ..." yaml="${HOMEgfs}/dev/workflow/build_opts.yaml" -echo "Generating build.xml for building global-workflow programs on compute nodes ..." -# Catch errors manually from here out -set +e - "${HOMEgfs}/dev/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" rc=$? if [[ "${rc}" -ne 0 ]]; then @@ -97,6 +94,9 @@ if [[ "${rc}" -ne 0 ]]; then exit 1 fi +# Catch errors manually from here out +set +e + if [[ "${build_on_compute}" != "YES" ]]; then echo "Building on head node as requested ..." @@ -105,36 +105,36 @@ if [[ "${build_on_compute}" != "YES" ]]; then mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") # get the corresponding log file names from the build.xml in an array mapfile -t logs < <(grep -oP '(?<=).*(?=)' "${build_xml}") - # get the number of build jobs each command corresponds to in an array. The build jobs are the strings -j N in each command. - mapfile -t cores < <(echo "${commands[@]}" | grep -oP '(?<=-j )\d+') - # create an array of build names from the log file names (by obtaining the basename and stripping the .log extension) - mapfile -t names < <(printf "%s\n" "${logs[@]}" | xargs -n1 basename | sed 's/\.log$//') # Initialize associative arrays to track build status - declare -A build_names build_status build_commands build_logs build_cores build_pids - for i in "${!names[@]}"; do + declare -A build_names build_status build_dirs build_commands build_logs build_cores build_pids + for i in "${!logs[@]}"; do - name="${names[i]}" + cmd="${commands[i]}" + log="${logs[i]}" + name=$(echo "${log}" | xargs -n1 basename | sed 's/\.log$//') build_names["${name}"]="${name}" - build_commands["${name}"]="${commands[i]}" - build_logs["${name}"]="${logs[i]}" - build_cores["${name}"]="${cores[i]}" + build_dirs["${name}"]="$(echo "${cmd}" | awk -F';' '{ print $1 }' | sed 's/cd //')" + build_commands["${name}"]="$(echo "${cmd}" | awk -F';' '{ $1=""; print $0 }' | sed 's/^[[:space:]]*//')" + build_logs["${name}"]="${log}" + build_cores["${name}"]="$(echo "${cmd}" | grep -oP '(?<=-j )\d+')" build_status["${name}"]="pending" build_pids["${name}"]="" done - unset commands logs cores names + unset commands logs # copy build_names into a new array to iterate over builds_to_process=("${!build_names[@]}") - declare -r max_cores=40 + # Maximum number of cores to use for builds on head node + declare -r max_cores=20 current_cores=0 builds_in_progress=true while [[ ${builds_in_progress} == true ]]; do - for name in "${!builds_to_process[@]}"; do + for name in "${builds_to_process[@]}"; do # If the build is already completed, skip it if [[ ${build_status[${name}]} == "completed" ]]; then @@ -145,15 +145,18 @@ if [[ "${build_on_compute}" != "YES" ]]; then pid="${build_pids[${name}]}" if [[ -z "${pid}" ]]; then # No pid means build not started yet cores_needed="${build_cores[${name}]}" - if (( current_cores + cores_needed <= max_cores )); then + if ((current_cores + cores_needed <= max_cores)); then # Launch the build command in the background and redirect output to log file + dir="${build_dirs[${name}]}" command="${build_commands[${name}]}" log_file="${build_logs[${name}]}" - echo "Launching build command: ${command} > ${log_file} 2>&1" - bash -c "${command} > ${log_file} 2>&1 &" + cd "${dir}" || exit 1 + ${command} > "${log_file}" 2>&1 & pid=$! + echo "Build for ${name} started with PID ${pid}, using ${cores_needed} cores." build_pids["${name}"]="${pid}" build_status["${name}"]="building" + # Update the current cores in use current_cores=$((current_cores + cores_needed)) else # Not enough cores available, skip to next build @@ -162,16 +165,17 @@ if [[ "${build_on_compute}" != "YES" ]]; then else + #echo "Checking status of build for ${name} with PID ${pid} ..." if ! ps -p "${pid}" > /dev/null 2>&1; then # Build has finished wait "${pid}" rc=$? if [[ "${rc}" -ne 0 ]]; then - echo "BUILD ERROR: Build command '${build_commands[${name}]}' failed with exit code ${rc}." - echo "See log file: ${build_logs[${name}]}.log" + echo "BUILD ERROR: Build for ${name} failed with exit code ${rc}." + echo "See log file: ${build_logs[${name}]}" build_status["${name}"]="failed" else - echo "BUILD SUCCESS: Build command '${build_commands[${name}]}' completed successfully." + echo "BUILD SUCCESS: Build for ${name} completed successfully." build_status["${name}"]="completed" fi # Free up the cores used by this build (regardless of success or failure) @@ -180,9 +184,9 @@ if [[ "${build_on_compute}" != "YES" ]]; then fi - # If the build failed, exit immediately + # If the build failed, do not submit any more builds if [[ ${build_status[${name}]} == "failed" ]]; then - exit 1 + break fi done @@ -196,11 +200,14 @@ if [[ "${build_on_compute}" != "YES" ]]; then fi done if [[ ${abort_all_builds} == true ]]; then - echo "FATAL ERROR: One or more builds failed. Aborting remaining builds." + echo "FATAL ERROR: One or more builds failed. Aborting all builds." # Terminate all running build processes - for pid in "${build_pids[@]}"; do + for i in "${!build_pids[@]}"; do + pid="${build_pids[${i}]}" + name="${build_names[${i}]}" if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists - kill "${pid}" + echo "Terminating build for ${name} with PID ${pid} ..." + pkill -P "${pid}" # Kill any child processes fi done exit 1 @@ -216,7 +223,8 @@ if [[ "${build_on_compute}" != "YES" ]]; then fi done - sleep 30s + echo "Waiting for builds to complete. Current cores in use: ${current_cores}/${max_cores}" + sleep 1m done From 62f8719a93a4c2395e546b2dfbebc4f82eab1aa4 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Tue, 6 Jan 2026 15:07:10 -0500 Subject: [PATCH 05/19] loop over names --- sorc/build_compute.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sorc/build_compute.sh b/sorc/build_compute.sh index c1af1e4bb2e..51c59d9bc5e 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_compute.sh @@ -193,7 +193,7 @@ if [[ "${build_on_compute}" != "YES" ]]; then # Check for any failed builds, and abort all if any found abort_all_builds=false - for name in "${!build_names[@]}"; do + for name in "${build_names[@]}"; do if [[ ${build_status[${name}]} == "failed" ]]; then echo "Detected failed build: ${name}" abort_all_builds=true From d077349b538d1e0d83c3051f9c3f39ab598cf050 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 7 Jan 2026 13:09:09 -0500 Subject: [PATCH 06/19] remove build_all.sh --- sorc/build_all.sh | 338 ---------------------------------------------- 1 file changed, 338 deletions(-) delete mode 100755 sorc/build_all.sh diff --git a/sorc/build_all.sh b/sorc/build_all.sh deleted file mode 100755 index e7c52810513..00000000000 --- a/sorc/build_all.sh +++ /dev/null @@ -1,338 +0,0 @@ -#! /usr/bin/env bash -#shellcheck disable=SC2317 - -set +x -#------------------------------------ -# Exception handling is now included. -# -# USER DEFINED STUFF: -# -#------------------------------------ - -#------------------------------------ -# END USER DEFINED STUFF -#------------------------------------ -function _usage() { - cat << EOF -Builds all of the global-workflow components by calling the individual build scripts in parallel. - -Usage: ${BASH_SOURCE[0]} [-a UFS_app][-c build_config][-d][-f][-h][-v] [gfs] [gefs] [sfs] [gcafs] [gsi] [gdas] [all] - -a UFS_app: - Build a specific UFS app instead of the default. This will be applied to all UFS (GFS, GEFS, SFS, GCAFS) builds. - -d: - Build in debug mode - -f: - Build the UFS model(s) using the -DFASTER=ON option. - -h: - Print this help message and exit - -k: - Kill all builds if any build fails - -v: - Execute all build scripts with -v option to turn on verbose where supported - -p: - Valid only for WCOSS2; enable parallel restart I/O when compiling the UFS - - Specified systems (gfs, gefs, sfs, gcafs, gsi, gdas) are non-exclusive, so they can be built together. -EOF - exit 1 -} - -# shellcheck disable=SC2155 -readonly HOMEgfs=$(cd "$(dirname "$(readlink -f -n "${BASH_SOURCE[0]}")")" && git rev-parse --show-toplevel) -cd "${HOMEgfs}/sorc" || exit 1 - -_build_ufs_opt="" -_build_debug="" -_verbose_opt="" -_build_job_max=20 -_quick_kill="NO" -_ufs_exec="-e gfs_model.x" -# Reset option counter in case this script is sourced -OPTIND=1 -while getopts ":a:dfhkpv" option; do - case "${option}" in - a) _build_ufs_opt+="-a ${OPTARG} " ;; - f) _build_ufs_opt+="-f " ;; - d) _build_debug="-d" ;; - h) _usage ;; - k) _quick_kill="YES" ;; - # TODO: Remove this option when UFS#2716 is fixed - p) _build_ufs_opt+="-p " ;; - v) _verbose_opt="-v" ;; - :) - echo "[${BASH_SOURCE[0]}]: ${option} requires an argument" - _usage - ;; - *) - echo "[${BASH_SOURCE[0]}]: Unrecognized option: ${option}" - _usage - ;; - esac -done -shift $((OPTIND - 1)) - -# If no build system was specified, build for gfs forecast-only -if [[ $# -eq 0 ]]; then - selected_systems="gfs" -else - selected_systems="$*" -fi - -supported_systems=("gfs" "gefs" "sfs" "gcafs" "gsi" "gdas" "all") - -declare -A system_builds -system_builds=( - ["gfs"]="ufs_gfs gfs_utils ufs_utils upp ww3_gfs" - ["gefs"]="ufs_gefs gfs_utils ufs_utils upp ww3_gefs" - ["sfs"]="ufs_sfs gfs_utils ufs_utils upp ww3_gefs" - ["gcafs"]="ufs_gcafs gfs_utils ufs_utils upp nexus gsi_utils" - ["gsi"]="gsi_enkf gsi_monitor gsi_utils" - ["gdas"]="gdas gsi_monitor gsi_utils" - ["all"]="ufs_gfs gfs_utils ufs_utils upp ww3_gfs ufs_gefs ufs_sfs ufs_gcafs ww3_gefs gdas gsi_enkf gsi_monitor gsi_utils nexus" -) - -logs_dir="${HOMEgfs}/sorc/logs" -if [[ ! -d "${logs_dir}" ]]; then - echo "Creating logs folder" - mkdir -p "${logs_dir}" || exit 1 -fi - -# Jobs per build ("min max") -declare -A build_jobs build_opts build_scripts -build_jobs=( - ["ufs_gfs"]=8 ["ufs_gefs"]=8 ["ufs_sfs"]=8 ["ufs_gcafs"]=8 ["gdas"]=8 ["gsi_enkf"]=2 ["gfs_utils"]=1 ["ufs_utils"]=1 - ["ww3_gfs"]=1 ["ww3_gefs"]=1 ["gsi_utils"]=1 ["gsi_monitor"]=1 ["gfs_utils"]=1 ["upp"]=1 ["nexus"]=1 -) - -# Establish build options for each job -_gfs_exec="gfs_model.x" -_gefs_exec="gefs_model.x" -_sfs_exec="sfs_model.x" -_gcafs_exec="gcafs_model.x" -build_opts=( - ["ufs_gfs"]="${wave_opt} ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -e ${_gfs_exec}" - ["ufs_gefs"]="${wave_opt} ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -w -e ${_gefs_exec}" - ["ufs_sfs"]="${wave_opt} ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -y -e ${_sfs_exec}" - ["ufs_gcafs"]="-a ATMAERO ${_build_ufs_opt} ${_verbose_opt} ${_build_debug} -e ${_gcafs_exec}" - ["upp"]="${_build_debug}" - ["ww3_gfs"]="${_verbose_opt} ${_build_debug}" - ["ww3_gefs"]="-w ${_verbose_opt} ${_build_debug}" - ["gdas"]="${_verbose_opt} ${_build_debug}" - ["ufs_utils"]="${_verbose_opt} ${_build_debug}" - ["gfs_utils"]="${_verbose_opt} ${_build_debug}" - ["gsi_utils"]="${_verbose_opt} ${_build_debug}" - ["gsi_enkf"]="${_verbose_opt} ${_build_debug}" - ["gsi_monitor"]="${_verbose_opt} ${_build_debug}" - ["nexus"]="${_verbose_opt} ${_build_debug}" -) - -# Set the build script name for each build -build_scripts=( - ["ufs_gfs"]="build_ufs.sh" - ["ufs_gefs"]="build_ufs.sh" - ["ufs_sfs"]="build_ufs.sh" - ["ufs_gcafs"]="build_ufs.sh" - ["gdas"]="build_gdas.sh" - ["gsi_enkf"]="build_gsi_enkf.sh" - ["gfs_utils"]="build_gfs_utils.sh" - ["ufs_utils"]="build_ufs_utils.sh" - ["ww3_gfs"]="build_ww3prepost.sh" - ["ww3_gefs"]="build_ww3prepost.sh" - ["gsi_utils"]="build_gsi_utils.sh" - ["gsi_monitor"]="build_gsi_monitor.sh" - ["gfs_utils"]="build_gfs_utils.sh" - ["upp"]="build_upp.sh" - ["nexus"]="build_nexus.sh" -) - -# Check the requested systems to make sure we can build them -declare -A builds -system_count=0 -for system in ${selected_systems}; do - # shellcheck disable=SC2076 - if [[ " ${supported_systems[*]} " =~ " ${system} " ]]; then - ((system_count += 1)) - for build in ${system_builds["${system}"]}; do - builds["${build}"]="yes" - done - else - echo "Unsupported build system: ${system}" - _usage - fi -done - -#------------------------------------ -# GET MACHINE -#------------------------------------ -export COMPILER="intel" -source "${HOMEgfs}/ush/detect_machine.sh" -source "${HOMEgfs}/ush/module-setup.sh" -if [[ -z "${MACHINE_ID}" ]]; then - echo "FATAL: Unable to determine target machine" - exit 1 -fi - -# Create the log directory -mkdir -p "${HOMEgfs}/sorc/logs" - -#------------------------------------ -# SOURCE BUILD VERSION FILES -#------------------------------------ -# TODO: Commented out until components aligned for build -#source ../versions/build.ver - -#------------------------------------ -# Exception Handling Init -#------------------------------------ -# Disable shellcheck warning about single quotes not being substituted. -# shellcheck disable=SC2016 -ERRSCRIPT=${ERRSCRIPT:-'eval [[ $errs = 0 ]]'} -# shellcheck disable= -errs=0 - -#------------------------------------ -# Check which builds to do and assign # of build jobs -#------------------------------------ - -echo "Building ${build_list}" - -procs_in_use=0 -declare -A build_ids - -check_builds() { - for chk_build in "${!builds[@]}"; do - # Check if the build is complete and if so what the status was - if [[ -n "${build_ids[${chk_build}]+0}" ]]; then - if ! ps -p "${build_ids[${chk_build}]}" > /dev/null; then - wait "${build_ids[${chk_build}]}" - build_stat=$? - if [[ ${build_stat} != 0 ]]; then - echo "build_${chk_build}.sh failed! Exiting!" - echo "Check logs/build_${chk_build}.log for details." - echo "logs/build_${chk_build}.log" > "${HOMEgfs}/sorc/logs/error.logs" - for kill_build in "${!builds[@]}"; do - if [[ -n "${build_ids[${kill_build}]+0}" ]]; then - pkill -P "${build_ids[${kill_build}]}" - fi - done - return "${build_stat}" - fi - fi - fi - done - return 0 -} - -# Cleanup function to kill the GDASApp build on ctrl-c or non-clean exit -# shellcheck disable=SC2329 -function cleanup() { - echo "Exiting build script. Terminating subprocesses..." - for pid in "${build_ids[@]}"; do - if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists - kill "${pid}" - fi - done - exit 0 -} - -trap cleanup ERR -trap cleanup INT -trap cleanup TERM - -builds_started=0 -# Now start looping through all of the jobs until everything is done -while [[ ${builds_started} -lt ${#builds[@]} ]]; do - for build in "${!builds[@]}"; do - # Has the job started? - if [[ -n "${build_jobs[${build}]+0}" && -z "${build_ids[${build}]+0}" ]]; then - # Do we have enough processors to run it? - if [[ ${_build_job_max} -ge $((build_jobs[build] + procs_in_use)) ]]; then - # double-quoting build_opts here will not work since it is a string of options - #shellcheck disable=SC2086 - "./${build_scripts[${build}]}" ${build_opts[${build}]:-} -j "${build_jobs[${build}]}" > \ - "${logs_dir}/build_${build}.log" 2>&1 & - build_ids["${build}"]=$! - echo "Starting build_${build}.sh" - procs_in_use=$((procs_in_use + build_jobs[${build}])) - fi - fi - done - - # Check if all builds have completed - # Also recalculate how many processors are in use to account for completed builds - builds_started=0 - procs_in_use=0 - for build in "${!builds[@]}"; do - # Has the build started? - if [[ -n "${build_ids[${build}]+0}" ]]; then - builds_started=$((builds_started + 1)) - # Calculate how many processors are in use - # Is the build still running? - if ps -p "${build_ids[${build}]}" > /dev/null; then - procs_in_use=$((procs_in_use + build_jobs["${build}"])) - fi - fi - done - - # If requested, check if any build has failed and exit if so - if [[ "${_quick_kill}" == "YES" ]]; then - check_builds - build_stat=$? - if ((build_stat != 0)); then - exit "${build_stat}" - fi - fi - - sleep 5s -done - -# Wait for all jobs to complete and check return statuses -while [[ "${#builds[@]}" -gt 0 ]]; do - - # If requested, check if any build has failed and exit if so - if [[ "${_quick_kill}" == "YES" ]]; then - check_builds - build_stat=$? - if [[ ${build_stat} != 0 ]]; then - exit "${build_stat}" - fi - fi - - for build in "${!builds[@]}"; do - # Test if each job is complete and if so, notify and remove from the array - if [[ -n "${build_ids[${build}]+0}" ]]; then - if ! ps -p "${build_ids[${build}]}" > /dev/null; then - wait "${build_ids[${build}]}" - build_stat=$? - errs=$((errs + build_stat)) - if [[ ${build_stat} == 0 ]]; then - echo "${build_scripts[${build}]} completed successfully!" - else - echo "${build_scripts[${build}]} failed with status ${build_stat}!" - fi - - # Remove the completed build from the list of PIDs - unset 'build_ids[${build}]' - unset 'builds[${build}]' - fi - fi - done - - sleep 5s -done - -#------------------------------------ -# Exception Handling -#------------------------------------ -if ((errs != 0)); then - cat << EOF -BUILD ERROR: One or more components failed to build - Check the associated build log(s) for details. -EOF - ${ERRSCRIPT} || exit "${errs}" -fi - -echo -echo " .... Build system finished .... " - -exit 0 From 81b64817ecadccf4cdf4a26bff0f77dc2426fd83 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 7 Jan 2026 13:31:09 -0500 Subject: [PATCH 07/19] update build scripts and add an option in generate_workflows.sh --- dev/workflow/generate_workflows.sh | 14 +++++++++++--- .../{build_compute.py => setup_buildxml.py} | 6 +++--- sorc/{build_compute.sh => build_all.sh} | 16 ++++++++-------- 3 files changed, 22 insertions(+), 14 deletions(-) rename dev/workflow/{build_compute.py => setup_buildxml.py} (97%) rename sorc/{build_compute.sh => build_all.sh} (95%) diff --git a/dev/workflow/generate_workflows.sh b/dev/workflow/generate_workflows.sh index 6efe46f80b0..4a13c30ecb7 100755 --- a/dev/workflow/generate_workflows.sh +++ b/dev/workflow/generate_workflows.sh @@ -17,7 +17,10 @@ function _usage() { directory up from this script's residing directory. -b Run build_all.sh with default flags - (build the UFS, UPP, UFS_Utils, and GFS-utils only) + (build the UFS, UPP, UFS_Utils, and GFS-utils only on login nodes) + + -B Run build_all.sh -c with default flags [-c triggers build on compute nodes] + (build the UFS, UPP, UFS_Utils, and GFS-utils only on compute nodes) -u Update submodules before building and/or generating experiments. @@ -84,6 +87,7 @@ set -eu HOMEgfs="" _specified_home=false _build=false +_compute_build=false _build_flags="" _update_submods=false declare -a _yaml_list=("C48_ATM") @@ -110,7 +114,7 @@ _auto_del=false _nonflag_option_count=0 while [[ $# -gt 0 && "$1" != "--" ]]; do - while getopts ":H:bDuy:Y:GESCA:ce:t:vVdh" option; do + while getopts ":H:bBDuy:Y:GESCA:ce:t:vVdh" option; do case "${option}" in H) HOMEgfs="${OPTARG}" @@ -121,6 +125,7 @@ while [[ $# -gt 0 && "$1" != "--" ]]; do fi ;; b) _build=true ;; + B) _build=true && _compute_build=true ;; D) _auto_del=true ;; u) _update_submods=true ;; y) # Start over with an empty _yaml_list @@ -442,8 +447,11 @@ fi if [[ "${_build}" == "true" ]]; then printf "Building via build_all.sh %s\n\n" "${_build_flags}" # Let the output of build_all.sh go to stdout regardless of verbose options + if [[ "${_compute_build}" == true ]]; then + _compute_build_flag="-c -A ${HPC_ACCOUNT}" + fi #shellcheck disable=SC2086,SC2248 - ${HOMEgfs}/sorc/build_all.sh ${_verbose_flag} ${_build_flags} + ${HOMEgfs}/sorc/build_all.sh ${_compute_build_flag:-} ${_verbose_flag} ${_build_flags} fi # Link the workflow silently unless there's an error diff --git a/dev/workflow/build_compute.py b/dev/workflow/setup_buildxml.py similarity index 97% rename from dev/workflow/build_compute.py rename to dev/workflow/setup_buildxml.py index 0436f9cdb8b..d5a94537b97 100755 --- a/dev/workflow/build_compute.py +++ b/dev/workflow/setup_buildxml.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ -Entry point for setting up a compute-node build +Entry point for setting up a builds of global-workflow programs """ import os @@ -20,11 +20,11 @@ def input_args(*argv): """ - Method to collect user arguments for `compute_build.py` + Method to collect user arguments for `setup_buildxml.py` """ description = """ - Setup files and directories to start a compute build. + Setup buildXML to compile global-workflow programs. """ parser = ArgumentParser(description=description, diff --git a/sorc/build_compute.sh b/sorc/build_all.sh similarity index 95% rename from sorc/build_compute.sh rename to sorc/build_all.sh index 51c59d9bc5e..0d109893042 100755 --- a/sorc/build_compute.sh +++ b/sorc/build_all.sh @@ -4,14 +4,14 @@ function _usage() { cat << EOF Builds all of the global-workflow components on compute nodes. -Usage: ${BASH_SOURCE[0]} [-h][-v] -A HPC_ACCOUNT -b [gfs gefs sfs gcafs gsi gdas all] +Usage: ${BASH_SOURCE[0]} [-h][-v] -A HPC_ACCOUNT -c [gfs gefs sfs gcafs gsi gdas all] -h: Print this help message and exit -v: Verbose mode -A: HPC account to use for the compute-node builds [REQUIRED when building on compute nodes] - -b Build on login nodes (DEFAULT: NO) + -c Build on compute nodes (DEFAULT: NO) Input arguments are the system(s) to build. Valid options are @@ -31,14 +31,14 @@ build_xml="build.xml" build_db="build.db" build_lock_db="build_lock.db" HPC_ACCOUNT="UNDEFINED" -build_on_compute="YES" +compute_build="NO" OPTIND=1 -while getopts ":hA:vb" option; do +while getopts ":hA:vc" option; do case "${option}" in h) _usage ;; A) HPC_ACCOUNT="${OPTARG}" ;; - b) build_on_compute="NO" ;; + c) compute_build="YES" ;; v) verbose="YES" && rocoto_verbose_opt="-v10" ;; :) echo "[${BASH_SOURCE[0]}]: ${option} requires an argument" @@ -59,7 +59,7 @@ else systems=$* fi -if [[ "${build_on_compute}" == "YES" && "${HPC_ACCOUNT}" == "UNDEFINED" ]]; then +if [[ "${compute_build}" == "YES" && "${HPC_ACCOUNT}" == "UNDEFINED" ]]; then echo "FATAL ERROR: -A is required when building on compute nodes, ABORT!" _usage fi @@ -87,7 +87,7 @@ rm -f "${build_xml}" "${build_db}" "${build_lock_db}" echo "Generating build.xml for building global-workflow programs ..." yaml="${HOMEgfs}/dev/workflow/build_opts.yaml" -"${HOMEgfs}/dev/workflow/build_compute.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" +"${HOMEgfs}/dev/workflow/setup_buildxml.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" rc=$? if [[ "${rc}" -ne 0 ]]; then echo "FATAL ERROR: ${BASH_SOURCE[0]} failed to create 'build.xml' with error code ${rc}" @@ -97,7 +97,7 @@ fi # Catch errors manually from here out set +e -if [[ "${build_on_compute}" != "YES" ]]; then +if [[ "${compute_build}" != "YES" ]]; then echo "Building on head node as requested ..." From 8084876adf80adae22f9341103dcc5a2097fea09 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 7 Jan 2026 14:14:50 -0500 Subject: [PATCH 08/19] respect max_cores when on head node --- sorc/build_all.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 0d109893042..f0e41ceadfd 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -101,6 +101,9 @@ if [[ "${compute_build}" != "YES" ]]; then echo "Building on head node as requested ..." + # Maximum number of cores to use for builds on head node + declare -r max_cores=20 + # grep for tags in the build.xml and collect the commands in an array mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") # get the corresponding log file names from the build.xml in an array @@ -114,11 +117,19 @@ if [[ "${compute_build}" != "YES" ]]; then log="${logs[i]}" name=$(echo "${log}" | xargs -n1 basename | sed 's/\.log$//') + # Get the number of cores from the command (-j N). + # If N is greater than max_cores, set it to max_cores and update the command accordingly. + cores=$(echo "${cmd}" | grep -oP '(?<=-j )\d+') + if [[ ${cores} -gt ${max_cores} ]]; then + cores=${max_cores} + cmd="$(echo "${cmd}" | sed -E "s/-j [0-9]+/-j ${cores}/")" + fi + build_names["${name}"]="${name}" build_dirs["${name}"]="$(echo "${cmd}" | awk -F';' '{ print $1 }' | sed 's/cd //')" build_commands["${name}"]="$(echo "${cmd}" | awk -F';' '{ $1=""; print $0 }' | sed 's/^[[:space:]]*//')" build_logs["${name}"]="${log}" - build_cores["${name}"]="$(echo "${cmd}" | grep -oP '(?<=-j )\d+')" + build_cores["${name}"]="${cores}" build_status["${name}"]="pending" build_pids["${name}"]="" @@ -128,8 +139,6 @@ if [[ "${compute_build}" != "YES" ]]; then # copy build_names into a new array to iterate over builds_to_process=("${!build_names[@]}") - # Maximum number of cores to use for builds on head node - declare -r max_cores=20 current_cores=0 builds_in_progress=true while [[ ${builds_in_progress} == true ]]; do From 4fb8148389ee1e275162f72e26c1de8cdc4a79fa Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 7 Jan 2026 14:56:07 -0500 Subject: [PATCH 09/19] update dox --- docs/source/clone.rst | 24 +++++++++++++++++------- docs/source/development.rst | 2 +- sorc/build_all.sh | 2 +- {dev/workflow => sorc}/build_opts.yaml | 0 4 files changed, 19 insertions(+), 9 deletions(-) rename {dev/workflow => sorc}/build_opts.yaml (100%) diff --git a/docs/source/clone.rst b/docs/source/clone.rst index 4a01fe7a7fe..ddcb7b43e56 100644 --- a/docs/source/clone.rst +++ b/docs/source/clone.rst @@ -20,7 +20,15 @@ Clone the `global-workflow` and `cd` into the `sorc` directory: .. _build_examples: -The build_all.sh script can be used to build all required components of the global workflow. The accepted arguments is a list of systems to be built. This includes builds for GFS, GEFS, and SFS forecast-only experiments, GSI and GDASApp-based DA for cycled GFS experiments. See `feature availability `__ to see which system(s) are available on each supported system. +The `build_all.sh` script can be used to build all required components of the global workflow. +`build_all.sh` allows for optional flags to modify the build behavior: + + - ``-c``: Build on compute nodes. The default behaviour is to build on the head node. + - ``-A HPC_ACCOUNT``: Specify the HPC account to be used when building on compute nodes. + - ``-v``: Execute all build scripts with -v option to turn on verbose where supported + - ``-h``: Print help message and exit + +The accepted arguments is a list of systems to be built. This includes builds for GFS, GEFS, and SFS forecast-only experiments, GSI and GDASApp-based DA for cycled GFS experiments. See `feature availability `__ to see which system(s) are available on each supported system. :: @@ -125,17 +133,19 @@ Under the ``/sorc`` folder is a script to build all components called ``build_al :: - ./build_all.sh [-a UFS_app][-k][-h][-v] [list of system(s) to build] - -a UFS_app: - Build a specific UFS app instead of the default - -k: - Kill all builds immediately if one fails + ./build_all.sh [-c][-A HPC_ACCOUNT][-h][-v] [list of system(s) to build] + -c: + Build on compute nodes. The default behaviour is to build on the head node. + -A HPC_ACCOUNT: + Specify the HPC account to be used when building on compute nodes. -h: Print this help message and exit -v: Execute all build scripts with -v option to turn on verbose where supported - Lastly, pass to build_all.sh a list of systems to build. This includes `gfs`, `gefs`, `sfs`, `gcafs`, `gsi`, `gdas`, and `all`. +Lastly, pass to `build_all.sh` a list of systems to build. This includes `gfs`, `gefs`, `sfs`, `gcafs`, `gsi`, `gdas`, and `all`. + +To configure the build with specific flags or options for the various components, you can update the respective build command in the `build_opts.yaml` file. For examples of how to use this script, see :ref:`build examples `. diff --git a/docs/source/development.rst b/docs/source/development.rst index 2dd558f48d3..4775ca15aa1 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -88,7 +88,7 @@ The commonly run tests are written in YAML format and can be found in the ``dev/ where: * ``-A`` is used to specify the HPC (slurm or PBS) account to use - * ``-b`` indicates that the workflow should be built fresh + * ``-b|B`` indicates that the workflow should be built fresh (`-B` uses compute nodes for the build) * ``-GESC`` specifies that all of the GFS, GEFS, SFS, GCAFS cases should be run (this also influences the build flags to use) * ``-c`` tells the tool to append the rocotorun commands for each experiment to your crontab diff --git a/sorc/build_all.sh b/sorc/build_all.sh index f0e41ceadfd..4a0b65f6ae8 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -86,7 +86,7 @@ mkdir -p "${HOMEgfs}/sorc/logs" || exit 1 rm -f "${build_xml}" "${build_db}" "${build_lock_db}" echo "Generating build.xml for building global-workflow programs ..." -yaml="${HOMEgfs}/dev/workflow/build_opts.yaml" +yaml="${HOMEgfs}/sorc/build_opts.yaml" "${HOMEgfs}/dev/workflow/setup_buildxml.py" --account "${HPC_ACCOUNT}" --yaml "${yaml}" --systems "${systems}" rc=$? if [[ "${rc}" -ne 0 ]]; then diff --git a/dev/workflow/build_opts.yaml b/sorc/build_opts.yaml similarity index 100% rename from dev/workflow/build_opts.yaml rename to sorc/build_opts.yaml From 8f926a4b010038b30dc9f5a1d96455c2fb532c83 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 7 Jan 2026 15:17:51 -0500 Subject: [PATCH 10/19] Empty push since gh is having issues From 7a182e822c5d4cf2c6b2e0e0a5184802098afdc6 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Thu, 8 Jan 2026 15:33:43 -0500 Subject: [PATCH 11/19] print build status consistently between head node and compute node options in a table format --- sorc/build_all.sh | 210 +++++++++++++++++++++++++++++----------------- 1 file changed, 132 insertions(+), 78 deletions(-) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 4a0b65f6ae8..6d54bdad707 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -32,6 +32,7 @@ build_db="build.db" build_lock_db="build_lock.db" HPC_ACCOUNT="UNDEFINED" compute_build="NO" +max_cores=20 # Maximum number of cores to use for builds on head node OPTIND=1 while getopts ":hA:vc" option; do @@ -94,29 +95,21 @@ if [[ "${rc}" -ne 0 ]]; then exit 1 fi -# Catch errors manually from here out -set +e - -if [[ "${compute_build}" != "YES" ]]; then - - echo "Building on head node as requested ..." +# grep for tags in the build.xml and collect the commands in an array +mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") +# get the corresponding log file names from the build.xml in an array +mapfile -t logs < <(grep -oP '(?<=).*(?=)' "${build_xml}") - # Maximum number of cores to use for builds on head node - declare -r max_cores=20 +# Initialize associative arrays to track build status +declare -A build_names build_status build_dirs build_commands build_logs build_cores build_pids +for i in "${!logs[@]}"; do - # grep for tags in the build.xml and collect the commands in an array - mapfile -t commands < <(grep -oP '(?<=).*(?=)' "${build_xml}") - # get the corresponding log file names from the build.xml in an array - mapfile -t logs < <(grep -oP '(?<=).*(?=)' "${build_xml}") - - # Initialize associative arrays to track build status - declare -A build_names build_status build_dirs build_commands build_logs build_cores build_pids - for i in "${!logs[@]}"; do - - cmd="${commands[i]}" - log="${logs[i]}" - name=$(echo "${log}" | xargs -n1 basename | sed 's/\.log$//') + cmd="${commands[i]}" + log="${logs[i]}" + name=$(echo "${log}" | xargs -n1 basename | sed 's/\.log$//') + # If building on head node, limit the number of cores used per build + if [[ ${compute_build} != "YES" ]]; then # Get the number of cores from the command (-j N). # If N is greater than max_cores, set it to max_cores and update the command accordingly. cores=$(echo "${cmd}" | grep -oP '(?<=-j )\d+') @@ -124,23 +117,63 @@ if [[ "${compute_build}" != "YES" ]]; then cores=${max_cores} cmd="$(echo "${cmd}" | sed -E "s/-j [0-9]+/-j ${cores}/")" fi + fi + + build_names["${name}"]="${name}" + build_dirs["${name}"]="$(echo "${cmd}" | awk -F';' '{ print $1 }' | sed 's/cd //')" + build_commands["${name}"]="$(echo "${cmd}" | awk -F';' '{ $1=""; print $0 }' | sed 's/^[[:space:]]*//')" + build_logs["${name}"]="${log}" + build_cores["${name}"]="${cores}" + build_status["${name}"]="PENDING" + build_pids["${name}"]="" + +done +unset commands logs - build_names["${name}"]="${name}" - build_dirs["${name}"]="$(echo "${cmd}" | awk -F';' '{ print $1 }' | sed 's/cd //')" - build_commands["${name}"]="$(echo "${cmd}" | awk -F';' '{ $1=""; print $0 }' | sed 's/^[[:space:]]*//')" - build_logs["${name}"]="${log}" - build_cores["${name}"]="${cores}" - build_status["${name}"]="pending" - build_pids["${name}"]="" +nbuilds=${#build_names[@]} +nback=$((nbuilds + 4)) +print_build_status() { + echo "------------------------------------------------------------------------" + printf "| %-12s | %-30s | %-10s | %-9s |\n" "System" "Build Command" "PID" "Status" + echo "------------------------------------------------------------------------" + for name in "${build_names[@]}"; do + printf "| %-12s | %-30s | %-10s | %-9s |\n" "${name}" "${build_commands[${name}]}" "${build_pids[${name}]}" "${build_status[${name}]}" done - unset commands logs + echo "------------------------------------------------------------------------" +} + +# Catch errors manually from here out +set +e + +if [[ "${compute_build}" != "YES" ]]; then + + echo "Building on head node as requested ..." + + cleanup() { + for i in "${!build_pids[@]}"; do + pid="${build_pids[${i}]}" + name="${build_names[${i}]}" + if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists + pkill -P "${pid}" # Kill any child processes + fi + done + } + + trap cleanup EXIT + trap cleanup SIGINT + trap cleanup SIGTERM + trap cleanup SIGHUP + trap cleanup ERR # copy build_names into a new array to iterate over builds_to_process=("${!build_names[@]}") current_cores=0 builds_in_progress=true + + print_build_status + while [[ ${builds_in_progress} == true ]]; do for name in "${builds_to_process[@]}"; do @@ -162,9 +195,8 @@ if [[ "${compute_build}" != "YES" ]]; then cd "${dir}" || exit 1 ${command} > "${log_file}" 2>&1 & pid=$! - echo "Build for ${name} started with PID ${pid}, using ${cores_needed} cores." build_pids["${name}"]="${pid}" - build_status["${name}"]="building" + build_status["${name}"]="RUNNING" # Update the current cores in use current_cores=$((current_cores + cores_needed)) else @@ -174,18 +206,14 @@ if [[ "${compute_build}" != "YES" ]]; then else - #echo "Checking status of build for ${name} with PID ${pid} ..." if ! ps -p "${pid}" > /dev/null 2>&1; then # Build has finished wait "${pid}" rc=$? if [[ "${rc}" -ne 0 ]]; then - echo "BUILD ERROR: Build for ${name} failed with exit code ${rc}." - echo "See log file: ${build_logs[${name}]}" - build_status["${name}"]="failed" + build_status["${name}"]="FAILED" else - echo "BUILD SUCCESS: Build for ${name} completed successfully." - build_status["${name}"]="completed" + build_status["${name}"]="SUCCEEDED" fi # Free up the cores used by this build (regardless of success or failure) current_cores=$((current_cores - build_cores[${name}])) @@ -194,31 +222,43 @@ if [[ "${compute_build}" != "YES" ]]; then fi # If the build failed, do not submit any more builds - if [[ ${build_status[${name}]} == "failed" ]]; then + if [[ ${build_status[${name}]} == "FAILED" ]]; then break fi done + echo -ne "\033[${nback}A" + print_build_status + # Check for any failed builds, and abort all if any found abort_all_builds=false for name in "${build_names[@]}"; do - if [[ ${build_status[${name}]} == "failed" ]]; then - echo "Detected failed build: ${name}" + if [[ ${build_status[${name}]} == "FAILED" ]]; then + #echo "Detected failed build: ${name}" abort_all_builds=true fi done if [[ ${abort_all_builds} == true ]]; then - echo "FATAL ERROR: One or more builds failed. Aborting all builds." # Terminate all running build processes for i in "${!build_pids[@]}"; do pid="${build_pids[${i}]}" name="${build_names[${i}]}" if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists - echo "Terminating build for ${name} with PID ${pid} ..." pkill -P "${pid}" # Kill any child processes + build_status["${name}"]="ABORTED" # Mark as aborted + current_cores=$((current_cores - build_cores[${name}])) # Free up cores fi done + echo -ne "\033[${nback}A" + print_build_status + echo "FATAL ERROR: The following builds failed, see log files for details:" + for name in "${build_names[@]}"; do + if [[ ${build_status[${name}]} == "FAILED" ]]; then + echo "${name}: ${build_logs[${name}]}" + fi + done + cleanup exit 1 fi @@ -226,13 +266,12 @@ if [[ "${compute_build}" != "YES" ]]; then builds_to_process=() builds_in_progress=false for name in "${!build_names[@]}"; do - if [[ ${build_status[${name}]} != "completed" ]]; then + if [[ ${build_status[${name}]} != "SUCCEEDED" ]]; then builds_to_process+=("${name}") builds_in_progress=true fi done - echo "Waiting for builds to complete. Current cores in use: ${current_cores}/${max_cores}" sleep 1m done @@ -242,7 +281,8 @@ else echo "Building on compute nodes as requested ..." runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" - finished=false + print_build_status + ${runcmd} rc=$? if [[ "${rc}" -ne 0 ]]; then @@ -250,48 +290,62 @@ else exit 1 fi - echo "Monitoring builds on compute nodes" - while [[ "${finished}" == "false" ]]; do + builds_in_progress=true + while [[ ${builds_in_progress} == true ]]; do sleep 1m ${runcmd} + sleep 10s + stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" + echo "${stat_out}" > rocotostat.out + # Ignore 1st 2 lines and store each row of rocotostat output in an array + mapfile -t stat_lines < <(tail -n +3 rocotostat.out) + + for line in "${stat_lines[@]}"; do + # Read each line into an array using read + IFS=' ' read -r -a columns <<< "${line}" + + # Get the name of the build in this row + name=${columns[1]} + + # Update build_pids and build_status arrays + build_pids["${name}"]="${columns[2]}" + build_status["${name}"]="${columns[3]}" + done - state="$("${HOMEgfs}/dev/ci/scripts/utils/rocotostat.py" -w "${build_xml}" -d "${build_db}")" || true - if [[ "${verbose_opt}" == "true" ]]; then - echo "Rocoto is in state ${state}" - else - echo -n "." - fi + echo -ne "\033[${nback}A" + print_build_status - if [[ "${state}" == "DONE" ]]; then - finished=true - elif [[ "${state}" == "RUNNING" ]]; then - finished=false - else - msg="FATAL ERROR: ${BASH_SOURCE[0]} rocoto failed with state '${state}'" - echo "${msg}" - err_file="${PWD}/logs/error.logs" - rm -f "${err_file}" - # Determine which build(s) failed - stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" - echo "${stat_out}" > rocotostat.out - line_number=0 - while read -r line; do - ((line_number += 1)) - # Skip the first two lines (header) - if [[ ${line_number} -lt 3 ]]; then - continue - fi + # Count number of builds still in progress and check for failures + nsuccess=0 + nfailed=0 + for name in "${build_names[@]}"; do + job_state="${build_status[${name}]}" + if [[ "${job_state}" =~ "DEAD" || "${job_state}" =~ "UNKNOWN" || + "${job_state}" =~ "UNAVAILABLE" || "${job_state}" =~ "FAIL" ]]; then + nfailed=$((nfailed + 1)) + elif [[ "${job_state}" == "SUCCEEDED" ]]; then + nsuccess=$((nsuccess + 1)) + fi + done - if [[ "${line}" =~ "DEAD" || "${line}" =~ "UNKNOWN" || - "${line}" =~ "UNAVAILABLE" || "${line}" =~ "FAIL" ]]; then - job=$(echo "${line}" | awk '{ print $2 }') - log_file="${PWD}/logs/${job}.log" - echo "${log_file}" >> "${err_file}" - echo "Rocoto reported that the build failed for ${job}" + # If any builds failed, exit with error + if [[ ${nfailed} -gt 0 ]]; then + echo "FATAL ERROR: The following builds failed, see log files for details:" + for name in "${build_names[@]}"; do + job_state="${build_status[${name}]}" + if [[ "${job_state}" =~ "DEAD" || "${job_state}" =~ "UNKNOWN" || + "${job_state}" =~ "UNAVAILABLE" || "${job_state}" =~ "FAIL" ]]; then + echo "${name}: ${build_logs[${name}]}" fi - done < rocotostat.out + done exit 1 fi + + # If all builds succeeded, exit the loop + if [[ ${nsuccess} -eq ${nbuilds} ]]; then + builds_in_progress=false + fi + done fi From 57f71daec54877d2631bbc1812ddad3b424f21d3 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Thu, 8 Jan 2026 15:54:56 -0500 Subject: [PATCH 12/19] update ci_utils.sh for build_all.sh update --- .github/copilot-instructions.md | 23 +++++++++++------------ dev/ci/scripts/utils/ci_utils.sh | 2 +- sorc/build_all.sh | 19 +++++++++++++------ 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 5abba1de845..c5ba2dbec15 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -15,15 +15,14 @@ This document provides comprehensive guidance for AI agents working on the NOAA ### Build System Commands ```bash # Build all components (from sorc/) -./build_all.sh # Default build -./build_all.sh -d # Debug mode -./build_all.sh -f # Fast build with -DFASTER=ON -./build_all.sh -v # Verbose output -./build_all.sh -k # Kill all builds if any fails +./build_all.sh # Default build on current login node +./build_all.sh -d # Debug mode +./build_all.sh -v # Verbose output +./build_all.sh -c -A # Compute node build with HPC account # Build specific systems ./build_all.sh gfs # GFS forecast system -./build_all.sh gefs # GEFS ensemble system +./build_all.sh gefs # GEFS ensemble system ./build_all.sh sfs # Seasonal forecast system ./build_all.sh gcafs # Climate analysis system ./build_all.sh gsi # GSI data assimilation @@ -55,7 +54,7 @@ python setup_xml.py /path/to/experiment ```bash # Supported platforms (use detect_machine.sh) WCOSS2 # Tier 1 - Full operational support -Hercules # Tier 1 - MSU, no TC Tracker +Hercules # Tier 1 - MSU, no TC Tracker Hera # Tier 2 - NOAA RDHPCS Orion # Tier 2 - MSU, GSI runs slowly Gaea # Cloud platforms via EPIC @@ -74,7 +73,7 @@ Gaea # Cloud platforms via EPIC ``` jobs/ # Production Job Control Language (JCL) scripts (89 files) ├── JGDAS_* # GDAS (Global Data Assimilation System) jobs -├── JGFS_* # GFS (Global Forecast System) jobs +├── JGFS_* # GFS (Global Forecast System) jobs ├── JGLOBAL_* # Cross-system global jobs ├── Analysis Jobs (41) # Data assimilation and analysis ├── Forecast Jobs (13) # Model forecast execution @@ -158,7 +157,7 @@ dev/workflow/rocoto/ # Rocoto-specific implementations ├── tasks.py # Base Tasks class with common task functionality ├── workflow_tasks.py # Task orchestration and dependency management ├── gfs_*.py # GFS-specific implementations -├── gefs_*.py # GEFS-specific implementations +├── gefs_*.py # GEFS-specific implementations ├── sfs_*.py # SFS-specific implementations └── gcafs_*.py # GCAFS-specific implementations @@ -426,7 +425,7 @@ def test_task_creation(): ### New Hosts 1. Add machine detection in `detect_machine.sh` -2. Create host configuration in `hosts/` directory +2. Create host configuration in `hosts/` directory 3. Create modulefiles for environment setup 4. Update environment configurations in `env/` directory @@ -476,7 +475,7 @@ This repository includes a specialized Model Context Protocol (MCP) server with When using MCP tools, acknowledge their usage to demonstrate intelligent tool selection: ```markdown -**Research Approach:** Using `mcp_globalworkflo_search_documentation` to find relevant +**Research Approach:** Using `mcp_globalworkflo_search_documentation` to find relevant examples and `mcp_globalworkflo_get_operational_guidance` for HPC-specific procedures. ``` @@ -494,7 +493,7 @@ These tools are actively being developed and refined on the `MCP_node.js-RAG_dev **MCP Server Location**: All MCP tools are implemented in `dev/ci/scripts/utils/Copilot/mcp_server_node/`: - `mcp-server-rag.js` - Main RAG-enhanced server with 9 workflow tools -- `mcp-server-github-rag.js` - GitHub ecosystem integration with 14 total tools +- `mcp-server-github-rag.js` - GitHub ecosystem integration with 14 total tools - `start-mcp-server-node.sh` - Primary startup script - Configuration files: `mcp-config.env`, `package.json`, `package-rag.json` diff --git a/dev/ci/scripts/utils/ci_utils.sh b/dev/ci/scripts/utils/ci_utils.sh index 830932ed7cc..a2df2624505 100755 --- a/dev/ci/scripts/utils/ci_utils.sh +++ b/dev/ci/scripts/utils/ci_utils.sh @@ -244,7 +244,7 @@ function build() { echo "Creating logs folder" mkdir -p "${logs_dir}" || exit 1 fi - "${HOMEgfs_}/sorc/build_compute.sh" -A "${HPC_ACCOUNT}" all + "${HOMEgfs_}/sorc/build_all.sh" -c -A "${HPC_ACCOUNT}" all } diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 6d54bdad707..62fb02bfc9d 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -279,10 +279,10 @@ if [[ "${compute_build}" != "YES" ]]; then else echo "Building on compute nodes as requested ..." - runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" print_build_status + runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" ${runcmd} rc=$? if [[ "${rc}" -ne 0 ]]; then @@ -292,24 +292,31 @@ else builds_in_progress=true while [[ ${builds_in_progress} == true ]]; do + sleep 1m + ${runcmd} - sleep 10s + + sleep 15s + stat_out="$(rocotostat -w "${build_xml}" -d "${build_db}")" echo "${stat_out}" > rocotostat.out # Ignore 1st 2 lines and store each row of rocotostat output in an array mapfile -t stat_lines < <(tail -n +3 rocotostat.out) + # Loop through each line of the rocotostat output and update build_pids and build_status arrays for line in "${stat_lines[@]}"; do # Read each line into an array using read IFS=' ' read -r -a columns <<< "${line}" - # Get the name of the build in this row + # Get the name, jobid and jobstatus of the build in this row name=${columns[1]} + jobid=${columns[2]} + jobstatus=${columns[3]} - # Update build_pids and build_status arrays - build_pids["${name}"]="${columns[2]}" - build_status["${name}"]="${columns[3]}" + # Update build_pids and build_status arrays for the build_name + build_pids["${name}"]="${jobid}" + build_status["${name}"]="${jobstatus}" done echo -ne "\033[${nback}A" From 0e5250e8f7428ce659db3c08cd7318e29157c84e Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Thu, 8 Jan 2026 16:07:30 -0500 Subject: [PATCH 13/19] build_command was getting long on the stdout --- sorc/build_all.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 62fb02bfc9d..3413b44694b 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -134,13 +134,13 @@ nbuilds=${#build_names[@]} nback=$((nbuilds + 4)) print_build_status() { - echo "------------------------------------------------------------------------" - printf "| %-12s | %-30s | %-10s | %-9s |\n" "System" "Build Command" "PID" "Status" - echo "------------------------------------------------------------------------" + echo "------------------------------------------------" + printf "| %-18s | %-10s | %-10s |\n" "System" "PID" "Status" + echo "------------------------------------------------" for name in "${build_names[@]}"; do - printf "| %-12s | %-30s | %-10s | %-9s |\n" "${name}" "${build_commands[${name}]}" "${build_pids[${name}]}" "${build_status[${name}]}" + printf "| %-18s | %-10s | %-10s |\n" "${name}" "${build_pids[${name}]}" "${build_status[${name}]}" done - echo "------------------------------------------------------------------------" + echo "------------------------------------------------" } # Catch errors manually from here out @@ -303,6 +303,7 @@ else echo "${stat_out}" > rocotostat.out # Ignore 1st 2 lines and store each row of rocotostat output in an array mapfile -t stat_lines < <(tail -n +3 rocotostat.out) + rm -f rocotostat.out # Loop through each line of the rocotostat output and update build_pids and build_status arrays for line in "${stat_lines[@]}"; do From ff63a727f77e97ee7d6ad86bfb5f974ca59bc4b6 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Fri, 9 Jan 2026 13:43:42 -0500 Subject: [PATCH 14/19] Update docs/source/clone.rst Co-authored-by: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> --- docs/source/clone.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/clone.rst b/docs/source/clone.rst index ddcb7b43e56..7f47e35f40a 100644 --- a/docs/source/clone.rst +++ b/docs/source/clone.rst @@ -23,7 +23,7 @@ Clone the `global-workflow` and `cd` into the `sorc` directory: The `build_all.sh` script can be used to build all required components of the global workflow. `build_all.sh` allows for optional flags to modify the build behavior: - - ``-c``: Build on compute nodes. The default behaviour is to build on the head node. + - ``-c``: Build on compute nodes. The default behavior is to build on the head node. - ``-A HPC_ACCOUNT``: Specify the HPC account to be used when building on compute nodes. - ``-v``: Execute all build scripts with -v option to turn on verbose where supported - ``-h``: Print help message and exit From 2371682a2845dde60c33cf5329fe3257f9f3a7e7 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Fri, 9 Jan 2026 13:43:50 -0500 Subject: [PATCH 15/19] Update sorc/build_all.sh Co-authored-by: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> --- sorc/build_all.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 3413b44694b..d6026ae4efd 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -160,6 +160,7 @@ if [[ "${compute_build}" != "YES" ]]; then done } + # Call the cleanup function when exiting (normally, on error, or by interruption) trap cleanup EXIT trap cleanup SIGINT trap cleanup SIGTERM From 4dbaa1b5697431e9728c3cb108a277d925292083 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Fri, 9 Jan 2026 15:46:30 -0500 Subject: [PATCH 16/19] Update sorc/build_all.sh Co-authored-by: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> --- sorc/build_all.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index d6026ae4efd..b51f35652d0 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -229,6 +229,7 @@ if [[ "${compute_build}" != "YES" ]]; then done + # Move the cursor up nback lines before printing the build status again echo -ne "\033[${nback}A" print_build_status From d45be6d6a9c9c4b2af6ef03a9fb7d82e494e2750 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Tue, 13 Jan 2026 16:12:28 -0500 Subject: [PATCH 17/19] fix bug --- sorc/build_all.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index b51f35652d0..d4880b09423 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -155,7 +155,7 @@ if [[ "${compute_build}" != "YES" ]]; then pid="${build_pids[${i}]}" name="${build_names[${i}]}" if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists - pkill -P "${pid}" # Kill any child processes + pkill -P "${pid}" # Kill any child processes fi done } @@ -179,8 +179,8 @@ if [[ "${compute_build}" != "YES" ]]; then for name in "${builds_to_process[@]}"; do - # If the build is already completed, skip it - if [[ ${build_status[${name}]} == "completed" ]]; then + # If the build is already SUCCEEDED, skip it + if [[ ${build_status[${name}]} == "SUCCEEDED" ]]; then continue fi @@ -246,9 +246,9 @@ if [[ "${compute_build}" != "YES" ]]; then for i in "${!build_pids[@]}"; do pid="${build_pids[${i}]}" name="${build_names[${i}]}" - if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists - pkill -P "${pid}" # Kill any child processes - build_status["${name}"]="ABORTED" # Mark as aborted + if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists + pkill -P "${pid}" # Kill any child processes + build_status["${name}"]="ABORTED" # Mark as aborted current_cores=$((current_cores - build_cores[${name}])) # Free up cores fi done From 1071af906e50216b4e3ed4f74665e33eccaeb7a6 Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 14 Jan 2026 16:02:28 -0500 Subject: [PATCH 18/19] squash bugs --- sorc/build_all.sh | 116 +++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index d4880b09423..c961ef85618 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -134,26 +134,33 @@ nbuilds=${#build_names[@]} nback=$((nbuilds + 4)) print_build_status() { - echo "------------------------------------------------" - printf "| %-18s | %-10s | %-10s |\n" "System" "PID" "Status" - echo "------------------------------------------------" + local name + echo "-----------------------------------" + printf "| %-18s | %-10s |\n" "System" "Status" + echo "-----------------------------------" for name in "${build_names[@]}"; do - printf "| %-18s | %-10s | %-10s |\n" "${name}" "${build_pids[${name}]}" "${build_status[${name}]}" + printf "| %-18s | %-10s |\n" "${name}" "${build_status[${name}]}" done - echo "------------------------------------------------" + echo "-----------------------------------" } +if [[ "${compute_build}" == "YES" ]]; then + echo "Building on compute nodes using account: ${HPC_ACCOUNT}" +else + echo "Building on head node using up to ${max_cores} cores ..." +fi + +print_build_status + # Catch errors manually from here out set +e if [[ "${compute_build}" != "YES" ]]; then - echo "Building on head node as requested ..." - cleanup() { - for i in "${!build_pids[@]}"; do - pid="${build_pids[${i}]}" - name="${build_names[${i}]}" + local pid name + for name in "${!build_pids[@]}"; do + pid=${build_pids[${name}]} if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists pkill -P "${pid}" # Kill any child processes fi @@ -167,17 +174,18 @@ if [[ "${compute_build}" != "YES" ]]; then trap cleanup SIGHUP trap cleanup ERR - # copy build_names into a new array to iterate over - builds_to_process=("${!build_names[@]}") - current_cores=0 builds_in_progress=true - print_build_status - while [[ ${builds_in_progress} == true ]]; do - for name in "${builds_to_process[@]}"; do + abort_all_builds=false + + for name in "${build_names[@]}"; do + + if [[ ${abort_all_builds} == true ]]; then + continue + fi # If the build is already SUCCEEDED, skip it if [[ ${build_status[${name}]} == "SUCCEEDED" ]]; then @@ -185,7 +193,7 @@ if [[ "${compute_build}" != "YES" ]]; then fi # Check if the build is still running - pid="${build_pids[${name}]}" + pid=${build_pids[${name}]} if [[ -z "${pid}" ]]; then # No pid means build not started yet cores_needed="${build_cores[${name}]}" if ((current_cores + cores_needed <= max_cores)); then @@ -195,8 +203,8 @@ if [[ "${compute_build}" != "YES" ]]; then log_file="${build_logs[${name}]}" cd "${dir}" || exit 1 ${command} > "${log_file}" 2>&1 & - pid=$! - build_pids["${name}"]="${pid}" + _pid=$! + build_pids["${name}"]="${_pid}" build_status["${name}"]="RUNNING" # Update the current cores in use current_cores=$((current_cores + cores_needed)) @@ -209,9 +217,9 @@ if [[ "${compute_build}" != "YES" ]]; then if ! ps -p "${pid}" > /dev/null 2>&1; then # Build has finished - wait "${pid}" + wait ${pid} rc=$? - if [[ "${rc}" -ne 0 ]]; then + if [[ ${rc} -ne 0 ]]; then build_status["${name}"]="FAILED" else build_status["${name}"]="SUCCEEDED" @@ -224,65 +232,45 @@ if [[ "${compute_build}" != "YES" ]]; then # If the build failed, do not submit any more builds if [[ ${build_status[${name}]} == "FAILED" ]]; then - break + abort_all_builds=true fi done - # Move the cursor up nback lines before printing the build status again - echo -ne "\033[${nback}A" - print_build_status - - # Check for any failed builds, and abort all if any found - abort_all_builds=false - for name in "${build_names[@]}"; do - if [[ ${build_status[${name}]} == "FAILED" ]]; then - #echo "Detected failed build: ${name}" - abort_all_builds=true - fi - done if [[ ${abort_all_builds} == true ]]; then # Terminate all running build processes - for i in "${!build_pids[@]}"; do - pid="${build_pids[${i}]}" - name="${build_names[${i}]}" - if kill -0 "${pid}" 2> /dev/null; then # Check if process still exists - pkill -P "${pid}" # Kill any child processes - build_status["${name}"]="ABORTED" # Mark as aborted - current_cores=$((current_cores - build_cores[${name}])) # Free up cores - fi - done - echo -ne "\033[${nback}A" - print_build_status - echo "FATAL ERROR: The following builds failed, see log files for details:" + cleanup + # Mark all running builds as aborted and free up their cores for name in "${build_names[@]}"; do - if [[ ${build_status[${name}]} == "FAILED" ]]; then - echo "${name}: ${build_logs[${name}]}" + if [[ ${build_status[${name}]} == "RUNNING" ]]; then + build_status["${name}"]="ABORTED" + current_cores=$((current_cores - build_cores[${name}])) fi done - cleanup - exit 1 + builds_in_progress=false + else + builds_in_progress=true fi - # Remove completed builds from the list to process during the next iteration - builds_to_process=() - builds_in_progress=false - for name in "${!build_names[@]}"; do - if [[ ${build_status[${name}]} != "SUCCEEDED" ]]; then - builds_to_process+=("${name}") - builds_in_progress=true - fi - done + # Move the cursor up nback lines before printing the build status again + echo -ne "\033[${nback}A" + print_build_status sleep 1m done -else - - echo "Building on compute nodes as requested ..." + if [[ ${abort_all_builds} == true ]]; then + echo "FATAL ERROR: The following builds failed, see log files for details:" + for name in "${build_names[@]}"; do + if [[ ${build_status[${name}]} == "FAILED" ]]; then + echo -e "\t${name}: ${build_logs[${name}]}" + fi + done + exit 1 + fi - print_build_status +else runcmd="rocotorun -w ${build_xml} -d ${build_db} ${rocoto_verbose_opt}" ${runcmd} @@ -345,7 +333,7 @@ else job_state="${build_status[${name}]}" if [[ "${job_state}" =~ "DEAD" || "${job_state}" =~ "UNKNOWN" || "${job_state}" =~ "UNAVAILABLE" || "${job_state}" =~ "FAIL" ]]; then - echo "${name}: ${build_logs[${name}]}" + echo -e "\t${name}: ${build_logs[${name}]}" fi done exit 1 From dd1c1448cb839be56b59a2cb698da4de62a70a0e Mon Sep 17 00:00:00 2001 From: Rahul Mahajan Date: Wed, 14 Jan 2026 16:14:57 -0500 Subject: [PATCH 19/19] apply corrections from shellcheck; --- sorc/build_all.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sorc/build_all.sh b/sorc/build_all.sh index c961ef85618..5606e5e1816 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -216,8 +216,8 @@ if [[ "${compute_build}" != "YES" ]]; then else if ! ps -p "${pid}" > /dev/null 2>&1; then - # Build has finished - wait ${pid} + # Build has finished, check its exit status + wait "${pid}" rc=$? if [[ ${rc} -ne 0 ]]; then build_status["${name}"]="FAILED"