diff --git a/runners/launch_mi300x-amds.sh b/runners/launch_mi300x-amds.sh
index 4f085d0ad..47ad12698 100644
--- a/runners/launch_mi300x-amds.sh
+++ b/runners/launch_mi300x-amds.sh
@@ -9,10 +9,12 @@ LOCK_FILE="${SQUASH_FILE}.lock"
 
 set -x
 
-# Pin to the known-good mi300x nodes; others are unavailable:
-# chi-mi300x-033, chi-mi300x-037: down (Not responding)
-# chi-mi300x-049: drained (persistent /nvme_home disk-full)
-JOB_ID=$(salloc --partition=$PARTITION --nodelist=chi-mi300x-[034-036,054,057-058].ord.vultr.cpe.ice.amd.com --gres=gpu:$TP --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+# Exclude known-bad nodes; let Slurm pick from anything else:
+# chi-mi300x-049: drained (persistent /nvme_home disk-full)
+# salloc's stdout and stderr are merged and copied to stderr so they show up in
+# the log; JOB_ID is the number parsed from the "Granted job allocation <id>"
+# line and stays empty when that line never appears.
+JOB_ID=$(salloc --partition="$PARTITION" --exclude=chi-mi300x-049.ord.vultr.cpe.ice.amd.com --gres="gpu:$TP" --cpus-per-task=256 --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
 
 if [ -z "$JOB_ID" ]; then
     echo "ERROR: salloc failed to allocate a job"