From 4e2f7cf355e8d76f380ab23bd656c4a6c00b840c Mon Sep 17 00:00:00 2001 From: Anthony Ramirez Date: Thu, 21 May 2026 17:28:53 -0400 Subject: [PATCH 1/3] Add base changes for getting vLLM multinode working again on Lustre. Add multiple prompts for controlled experiment. --- frontier/sample_apps/vllm/README.md | 12 ++++----- .../vllm/launchmultinode_lustre.sbatch | 4 +-- .../sample_apps/vllm/start_head_lustre.sh | 3 ++- frontier/sample_apps/vllm/start_worker.sh | 2 ++ .../sample_apps/vllm/testprompt_lustre.py | 25 ++++++++++++++++--- frontier/sample_apps/vllm/vllm_rocm.def | 8 ++++++ 6 files changed, 41 insertions(+), 13 deletions(-) create mode 100644 frontier/sample_apps/vllm/vllm_rocm.def diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md index 9046d97..c51d7ef 100644 --- a/frontier/sample_apps/vllm/README.md +++ b/frontier/sample_apps/vllm/README.md @@ -1,22 +1,22 @@ -# vLLM example running astrollama-2-7b-base_abstract +# vLLM example running gemma-4-31B-it Example showing how to get download AMD's vLLM container release and run it on Frontier -Download the vLLM container from Dockerhub: +Download the vLLM container from DockerHub using the included build spec: ``` -apptainer pull --disable-cache vllm_rocm.sif docker://docker.io/rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513 +apptainer build vllm_rocm.sif vllm_rocm.def ``` -Download the astrollama model +Download the gemma-4 model ``` module load git-lfs git lfs install -git clone https://huggingface.co/AstroMLab/astrollama-2-7b-base_abstract +git clone https://huggingface.co/google/gemma-4-31B-it ``` If you plan on moving the model to the burst buffer first, then tar the model directory ``` -tar --use-compress-program="pigz -p 16" -cf astrollama-2-7b-base_abstract.tar.gz ./astrollama-2-7b-base_abstract/ +tar --use-compress-program="pigz -p 16" -cf gemma-4-31B-it.tar.gz ./gemma-4-31B-it/ ``` diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch 
b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch index 9924d0b..5e50f18 100644 --- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch +++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch @@ -18,10 +18,10 @@ APPTAINER_CMD="apptainer exec --fakeroot --writable-tmpfs ./vllm_rocm.sif " export APPTAINER_BINDPATH=/mnt/bb # starting head node (head node start also starts a worker) -srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog 2>&1 & +srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog_$SLURM_JOB_ID 2>&1 & # starting workers on other nodes -srun -N3 -n3 --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog 2>&1 & +srun -N3 -n3 --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog_$SLURM_JOB_ID 2>&1 & $APPTAINER_CMD python3 ./testprompt_lustre.py duration=$(( SECONDS - start )) diff --git a/frontier/sample_apps/vllm/start_head_lustre.sh b/frontier/sample_apps/vllm/start_head_lustre.sh index 573cd39..9a57dd1 100755 --- a/frontier/sample_apps/vllm/start_head_lustre.sh +++ b/frontier/sample_apps/vllm/start_head_lustre.sh @@ -1,6 +1,7 @@ #!/bin/bash NNODES=$@ +export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS export VLLM_HOST_IP=$(hostname -I | awk '{print $2}') echo "VLLM_HOST_IP: $VLLM_HOST_IP" ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 @@ -10,4 +11,4 @@ echo "head node: slurm nnodes - $NNODES" ray status -vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./astrollama-2-7b-base_abstract" --host 0.0.0.0 --port 8000 +vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES 
--distributed-executor-backend ray "./gemma-4-31B-it" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75 --enforce-eager \ No newline at end of file diff --git a/frontier/sample_apps/vllm/start_worker.sh b/frontier/sample_apps/vllm/start_worker.sh index b2f65c0..3da43ee 100755 --- a/frontier/sample_apps/vllm/start_worker.sh +++ b/frontier/sample_apps/vllm/start_worker.sh @@ -1,6 +1,8 @@ #!/bin/bash +export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS export VLLM_HOST_IP=$(hostname -I | awk '{print $2}') echo "VLLM_HOST_IP: $VLLM_HOST_IP" HEAD_NODE_ADDR=$@ + ray start --node-ip-address $VLLM_HOST_IP --address=$HEAD_NODE_ADDR:6379 --block diff --git a/frontier/sample_apps/vllm/testprompt_lustre.py b/frontier/sample_apps/vllm/testprompt_lustre.py index 2239e04..fc0e41b 100644 --- a/frontier/sample_apps/vllm/testprompt_lustre.py +++ b/frontier/sample_apps/vllm/testprompt_lustre.py @@ -1,5 +1,5 @@ from openai import OpenAI -import os +import os import time import httpcore import httpx @@ -13,10 +13,27 @@ base_url=openai_api_base, ) +test_prompts = [ + "In a few sentences, describe the theory of relativity.", + "Write a small Python script for calling a model named `./gemma-4-31B-it` using the `openai` Python library.", + "Describe the difference between GPT and LLAMA AI models.", +] +completions = [] + while True: try: - completion = client.completions.create(model=f"./astrollama-2-7b-base_abstract", - prompt="The Magellanic Cloud is a") + start = time.time() + for test_prompt in test_prompts: + completion = client.chat.completions.create( + model=f"./gemma-4-31B-it", + messages=[ + {"role": "user", "content": test_prompt }, + ], + stream=False + ) + print("Completion result:", completion, flush=True) + print("Time since beginning:", time.time() - start, flush=True) + completions.append(completion) except httpcore.ConnectError: print("vllm server is not ready. 
Waiting 10 seconds...", flush=True) time.sleep(10) @@ -28,4 +45,4 @@ time.sleep(10) else: break -print("Completion result:", completion, flush=True) +print("Completion result:", [completion for completion in completions], flush=True) \ No newline at end of file diff --git a/frontier/sample_apps/vllm/vllm_rocm.def b/frontier/sample_apps/vllm/vllm_rocm.def new file mode 100644 index 0000000..bea701b --- /dev/null +++ b/frontier/sample_apps/vllm/vllm_rocm.def @@ -0,0 +1,8 @@ +Bootstrap: docker +From: vllm/vllm-openai-rocm:v0.21.0 + +%post + +set -e + +pip3 install "ray[default]" \ No newline at end of file From 3535fb402a74cb9f16c03a96670158acd2d2c3a4 Mon Sep 17 00:00:00 2001 From: Anthony Ramirez Date: Fri, 22 May 2026 12:28:06 -0400 Subject: [PATCH 2/3] README.md Add instructions for setting up gpt-oss-120b. launchmultinode_lustre.sbatch: add required env variable for gpt-oss-120b --- frontier/sample_apps/vllm/README.md | 31 +++++++++++++++++-- .../vllm/launchmultinode_lustre.sbatch | 3 ++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md index c51d7ef..1fb07ef 100644 --- a/frontier/sample_apps/vllm/README.md +++ b/frontier/sample_apps/vllm/README.md @@ -1,12 +1,17 @@ -# vLLM example running gemma-4-31B-it +# vLLM multi-node example running gemma-4-31B-it and/or gpt-oss-120b -Example showing how to get download AMD's vLLM container release and run it on Frontier +Example showing how to build from vLLM's ROCm container release and run it on Frontier -Download the vLLM container from DockerHub using the included build spec: +## Build the image + +Build the vLLM container from DockerHub using the included build spec, which includes `ray`: ``` apptainer build vllm_rocm.sif vllm_rocm.def ``` +## Download the model + +### `gemma-4-31B-it` Download the gemma-4 model ``` module load git-lfs @@ -19,8 +24,28 @@ If you plan on moving the model to the burst buffer first, then tar the model di 
tar --use-compress-program="pigz -p 16" -cf gemma-4-31B-it.tar.gz ./gemma-4-31B-it/ ``` +### `gpt-oss-120b` +Download the GPT-OSS model +``` +module load git-lfs +git lfs install +git clone https://huggingface.co/openai/gpt-oss-120b +``` +If you plan on moving the model to the burst buffer first, then tar the model directory +``` +tar --use-compress-program="pigz -p 16" -cf gpt-oss-120b.tar.gz ./gpt-oss-120b/ +``` + +> [!NOTE] +> `gpt-oss-120b` requires an additional step from login nodes. +> Please additionally run the following commands to fetch the vocab file: +```bash +mkdir vocab_cache +TIKTOKEN_RS_CACHE_DIR=./vocab_cache apptainer exec vllm_rocm.sif python -c 'from openai_harmony import load_harmony_encoding; load_harmony_encoding("HarmonyGptOss")' +``` +## Run inference Submit the job with ``` # running the model directly from Lustre diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch index 5e50f18..c9502c2 100644 --- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch +++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch @@ -17,6 +17,9 @@ export APPTAINERENV_TRITON_CACHE_DIR="/tmp/triton/cache/" # change write/build APPTAINER_CMD="apptainer exec --fakeroot --writable-tmpfs ./vllm_rocm.sif " export APPTAINER_BINDPATH=/mnt/bb +# Needed for gpt-oss-120b +export TIKTOKEN_RS_CACHE_DIR=./vocab_cache + # starting head node (head node start also starts a worker) srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog_$SLURM_JOB_ID 2>&1 & From c55ece312f2224c49856a899f4f71201c71c2208 Mon Sep 17 00:00:00 2001 From: Anthony Ramirez Date: Fri, 22 May 2026 16:55:47 -0400 Subject: [PATCH 3/3] launchmultinode_lustre.sbatch: Add checks and configuration to allow running either gemma or gpt-oss at submit time. start_head_lustre.sh: Add capability to run model passed by batch config. testprompt_lustre.py: Add model detection. 
README.md: Add latest instructions --- frontier/sample_apps/vllm/README.md | 2 +- .../vllm/launchmultinode_lustre.sbatch | 19 ++++++++++++++++++- .../sample_apps/vllm/start_head_lustre.sh | 8 +++++--- .../sample_apps/vllm/testprompt_lustre.py | 5 +++-- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md index 1fb07ef..9517793 100644 --- a/frontier/sample_apps/vllm/README.md +++ b/frontier/sample_apps/vllm/README.md @@ -49,7 +49,7 @@ TIKTOKEN_RS_CACHE_DIR=./vocab_cache apptainer exec vllm_rocm.sif python -c 'from Submit the job with ``` # running the model directly from Lustre -sbatch launchmultinode_lustre.sbatch +sbatch launchmultinode_lustre.sbatch [gpt-oss-120b | gemma-4-31B-it] # copying the model to burst buffer first before running sbatch launchmultinode_bb.sbatch diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch index c9502c2..e5479a6 100644 --- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch +++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch @@ -5,6 +5,23 @@ #SBATCH -o logs/vllmmult_%j.out #SBATCH -C nvme +MODEL_NAME="$1" +SELECTED_MODEL="gemma-4-31B-it" + +if [ "$#" -ne 1 ]; then + echo "No model provided. Defaulting to gemma." + MODEL_NAME="gemma" +elif echo "$MODEL_NAME" | grep -iqE "gpt"; then + echo "Detected GPT keyword. Using 'gpt-oss-120b'." + SELECTED_MODEL="gpt-oss-120b" +elif echo "$MODEL_NAME" | grep -iqE "gemma"; then + echo "Detected Gemma keyword. Using 'gemma-4-31B-it'." +fi + +if [ ! -d "$SELECTED_MODEL" ]; then + echo "Model not downloaded. Please download it and try again." + exit 1 +fi export HF_HOME=/mnt/bb/$USER # Make sure nvme is in use with . Change this location if you want. 
@@ -21,7 +38,7 @@ export APPTAINER_BINDPATH=/mnt/bb export TIKTOKEN_RS_CACHE_DIR=./vocab_cache # starting head node (head node start also starts a worker) -srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog_$SLURM_JOB_ID 2>&1 & +srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES $SELECTED_MODEL > logs/headnodelog_$SLURM_JOB_ID 2>&1 & # starting workers on other nodes srun -N3 -n3 --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog_$SLURM_JOB_ID 2>&1 & diff --git a/frontier/sample_apps/vllm/start_head_lustre.sh b/frontier/sample_apps/vllm/start_head_lustre.sh index 9a57dd1..aa1c144 100755 --- a/frontier/sample_apps/vllm/start_head_lustre.sh +++ b/frontier/sample_apps/vllm/start_head_lustre.sh @@ -1,14 +1,16 @@ #!/bin/bash -NNODES=$@ +NNODES=$1 +RUN_MODEL="$2" export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS export VLLM_HOST_IP=$(hostname -I | awk '{print $2}') + echo "VLLM_HOST_IP: $VLLM_HOST_IP" -ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 +ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 sleep 10 echo "head node: slurm nnodes - $NNODES" ray status -vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./gemma-4-31B-it" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75 --enforce-eager \ No newline at end of file +vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./$RUN_MODEL" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75 \ No newline at end of file diff --git a/frontier/sample_apps/vllm/testprompt_lustre.py b/frontier/sample_apps/vllm/testprompt_lustre.py index fc0e41b..5a2f285 100644 --- a/frontier/sample_apps/vllm/testprompt_lustre.py +++ 
b/frontier/sample_apps/vllm/testprompt_lustre.py @@ -22,10 +22,11 @@ while True: try: + model = client.models.list().data[0].id start = time.time() for test_prompt in test_prompts: completion = client.chat.completions.create( - model=f"./gemma-4-31B-it", + model=f"{model}", messages=[ {"role": "user", "content": test_prompt }, ], @@ -45,4 +46,4 @@ time.sleep(10) else: break -print("Completion result:", [completion for completion in completions], flush=True) \ No newline at end of file +print("Completion result:", [completion for completion in completions], flush=True)