From dc71406f4688462c49f77097c134b193678f3037 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Wed, 20 May 2026 15:39:22 -0400
Subject: [PATCH] [Klaud Cold] mi300x runner: switch from --nodelist pin to --exclude -049

The previous --nodelist pin only allowed allocations on a hand-picked
subset (034-036, 054, 057-058) and blocked nodes that have since
recovered (033, 035, 037). Switch to --exclude=chi-mi300x-049 so Slurm
can pick any healthy node; -049 stays banned (persistent /nvme_home
disk-full).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 runners/launch_mi300x-amds.sh | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh
index 4f085d0ad..47ad12698 100644
--- a/runners/launch_mi300x-amds.sh
+++ b/runners/launch_mi300x-amds.sh
@@ -9,10 +9,9 @@ LOCK_FILE="${SQUASH_FILE}.lock"
 
 set -x
 
-# Pin to the known-good mi300x nodes; others are unavailable:
-# chi-mi300x-033, chi-mi300x-037: down (Not responding)
-# chi-mi300x-049: drained (persistent /nvme_home disk-full)
-JOB_ID=$(salloc --partition=$PARTITION --nodelist=chi-mi300x-[034-036,054,057-058].ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+# Exclude known-bad nodes; let Slurm pick from anything else:
+# chi-mi300x-049: drained (persistent /nvme_home disk-full)
+JOB_ID=$(salloc --partition=$PARTITION --exclude=chi-mi300x-049.ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job"