From 4e2f7cf355e8d76f380ab23bd656c4a6c00b840c Mon Sep 17 00:00:00 2001
From: Anthony Ramirez <ramirezat@ornl.gov>
Date: Thu, 21 May 2026 17:28:53 -0400
Subject: [PATCH 1/3] Add base changes for getting vLLM multinode working again
 on Lustre. Add multiple prompts for controlled experiment.

---
 frontier/sample_apps/vllm/README.md           | 12 ++++-----
 .../vllm/launchmultinode_lustre.sbatch        |  4 +--
 .../sample_apps/vllm/start_head_lustre.sh     |  3 ++-
 frontier/sample_apps/vllm/start_worker.sh     |  2 ++
 .../sample_apps/vllm/testprompt_lustre.py     | 25 ++++++++++++++++---
 frontier/sample_apps/vllm/vllm_rocm.def       |  8 ++++++
 6 files changed, 41 insertions(+), 13 deletions(-)
 create mode 100644 frontier/sample_apps/vllm/vllm_rocm.def

diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md
index 9046d97..c51d7ef 100644
--- a/frontier/sample_apps/vllm/README.md
+++ b/frontier/sample_apps/vllm/README.md
@@ -1,22 +1,22 @@
-# vLLM example running astrollama-2-7b-base_abstract 
+# vLLM example running gemma-4-31B-it 
 
 Example showing how to get download AMD's vLLM container release and run it on Frontier
 
-Download the vLLM container from Dockerhub:
+Download the vLLM container from DockerHub using the included build spec:
 ```
-apptainer pull --disable-cache vllm_rocm.sif docker://docker.io/rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513
+apptainer build vllm_rocm.sif vllm_rocm.def
 ```
 
-Download the astrollama model
+Download the gemma-4 model
 ```
 module load git-lfs
 git lfs install
-git clone https://huggingface.co/AstroMLab/astrollama-2-7b-base_abstract
+git clone https://huggingface.co/google/gemma-4-31B-it
 ```
 
 If you plan on moving the model to the burst buffer first, then tar the model directory
 ```
-tar --use-compress-program="pigz -p 16" -cf astrollama-2-7b-base_abstract.tar.gz ./astrollama-2-7b-base_abstract/
+tar --use-compress-program="pigz -p 16" -cf gemma-4-31B-it.tar.gz ./gemma-4-31B-it/
 ```
 
 
diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
index 9924d0b..5e50f18 100644
--- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
+++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
@@ -18,10 +18,10 @@ APPTAINER_CMD="apptainer exec --fakeroot --writable-tmpfs ./vllm_rocm.sif "
 export APPTAINER_BINDPATH=/mnt/bb
 
 # starting head node (head node start also starts a worker)
-srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog 2>&1 &
+srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog_$SLURM_JOB_ID 2>&1 &
 
 # starting workers on other nodes 
-srun -N3 -n3  --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog 2>&1 &
+srun -N3 -n3  --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog_$SLURM_JOB_ID 2>&1 &
 
 $APPTAINER_CMD python3 ./testprompt_lustre.py
 duration=$(( SECONDS - start ))
diff --git a/frontier/sample_apps/vllm/start_head_lustre.sh b/frontier/sample_apps/vllm/start_head_lustre.sh
index 573cd39..9a57dd1 100755
--- a/frontier/sample_apps/vllm/start_head_lustre.sh
+++ b/frontier/sample_apps/vllm/start_head_lustre.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 NNODES=$@
+export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS
 export VLLM_HOST_IP=$(hostname -I | awk '{print $2}')
 echo "VLLM_HOST_IP: $VLLM_HOST_IP"
 ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 
@@ -10,4 +11,4 @@ echo "head node: slurm nnodes - $NNODES"
 
 ray status
 
-vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./astrollama-2-7b-base_abstract" --host 0.0.0.0 --port 8000 
+vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./gemma-4-31B-it" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75 --enforce-eager
\ No newline at end of file
diff --git a/frontier/sample_apps/vllm/start_worker.sh b/frontier/sample_apps/vllm/start_worker.sh
index b2f65c0..3da43ee 100755
--- a/frontier/sample_apps/vllm/start_worker.sh
+++ b/frontier/sample_apps/vllm/start_worker.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
+export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS
 export VLLM_HOST_IP=$(hostname -I | awk '{print $2}')
 echo "VLLM_HOST_IP: $VLLM_HOST_IP"
 HEAD_NODE_ADDR=$@
+
 ray start --node-ip-address $VLLM_HOST_IP --address=$HEAD_NODE_ADDR:6379 --block
diff --git a/frontier/sample_apps/vllm/testprompt_lustre.py b/frontier/sample_apps/vllm/testprompt_lustre.py
index 2239e04..fc0e41b 100644
--- a/frontier/sample_apps/vllm/testprompt_lustre.py
+++ b/frontier/sample_apps/vllm/testprompt_lustre.py
@@ -1,5 +1,5 @@
 from openai import OpenAI
-import os 
+import os
 import time
 import httpcore
 import httpx
@@ -13,10 +13,27 @@
     base_url=openai_api_base,
 )
 
+test_prompts = [
+    "In a few sentences, describe the theory of relativity.",
+    "Write a small Python script for calling a model named `./gemma-4-31B-it` using the `openai` Python library.",
+    "Describe the difference between GPT and LLAMA AI models.",
+]
+completions = []
+
 while True:
     try:
-        completion = client.completions.create(model=f"./astrollama-2-7b-base_abstract",
-                                      prompt="The Magellanic Cloud is a")
+        start = time.time()
+        for test_prompt in test_prompts:
+            completion = client.chat.completions.create(
+                model=f"./gemma-4-31B-it",
+                messages=[
+                    {"role": "user", "content": test_prompt },
+                ],
+                stream=False
+            )
+            print("Completion result:", completion, flush=True)
+            print("Time since beginning:", time.time() - start, flush=True)
+            completions.append(completion)
     except httpcore.ConnectError:
         print("vllm server is not ready. Waiting 10 seconds...", flush=True)
         time.sleep(10)
@@ -28,4 +45,4 @@
         time.sleep(10)
     else:
         break
-print("Completion result:", completion, flush=True)
+print("Completion result:", [completion for completion in completions], flush=True)
\ No newline at end of file
diff --git a/frontier/sample_apps/vllm/vllm_rocm.def b/frontier/sample_apps/vllm/vllm_rocm.def
new file mode 100644
index 0000000..bea701b
--- /dev/null
+++ b/frontier/sample_apps/vllm/vllm_rocm.def
@@ -0,0 +1,8 @@
+Bootstrap: docker
+From: vllm/vllm-openai-rocm:v0.21.0
+
+%post
+
+set -e
+
+pip3 install "ray[default]"
\ No newline at end of file

From 3535fb402a74cb9f16c03a96670158acd2d2c3a4 Mon Sep 17 00:00:00 2001
From: Anthony Ramirez <ramirezat@ornl.gov>
Date: Fri, 22 May 2026 12:28:06 -0400
Subject: [PATCH 2/3] README.md: Add instructions for setting up gpt-oss-120b.
 launchmultinode_lustre.sbatch: add required env variable for gpt-oss-120b

---
 frontier/sample_apps/vllm/README.md           | 31 +++++++++++++++++--
 .../vllm/launchmultinode_lustre.sbatch        |  3 ++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md
index c51d7ef..1fb07ef 100644
--- a/frontier/sample_apps/vllm/README.md
+++ b/frontier/sample_apps/vllm/README.md
@@ -1,12 +1,17 @@
-# vLLM example running gemma-4-31B-it 
+# vLLM multi-node example running gemma-4-31B-it and/or gpt-oss-120b
 
-Example showing how to get download AMD's vLLM container release and run it on Frontier
+Example showing how to build an Apptainer image from vLLM's ROCm container release and run it on Frontier
 
-Download the vLLM container from DockerHub using the included build spec:
+## Build the image
+
+Build the vLLM container from DockerHub using the included build spec, which includes `ray`:
 ```
 apptainer build vllm_rocm.sif vllm_rocm.def
 ```
 
+## Download the model
+
+### `gemma-4-31B-it`
 Download the gemma-4 model
 ```
 module load git-lfs
@@ -19,8 +24,28 @@ If you plan on moving the model to the burst buffer first, then tar the model di
 tar --use-compress-program="pigz -p 16" -cf gemma-4-31B-it.tar.gz ./gemma-4-31B-it/
 ```
 
+### `gpt-oss-120b`
+Download the GPT-OSS model
+```
+module load git-lfs
+git lfs install
+git clone https://huggingface.co/openai/gpt-oss-120b
+```
 
+If you plan on moving the model to the burst buffer first, then tar the model directory
+```
+tar --use-compress-program="pigz -p 16" -cf gpt-oss-120b.tar.gz ./gpt-oss-120b/
+```
+
+> [!NOTE]
+> `gpt-oss-120b` requires an additional step, run from a login node.
+> Run the following commands to fetch the vocab file:
+```bash
+mkdir vocab_cache
+TIKTOKEN_RS_CACHE_DIR=./vocab_cache apptainer exec vllm_rocm.sif python -c 'from openai_harmony import load_harmony_encoding; load_harmony_encoding("HarmonyGptOss")'
+```
 
+## Run inference
 Submit the job with
 ```
 # running the model directly from Lustre
diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
index 5e50f18..c9502c2 100644
--- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
+++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
@@ -17,6 +17,9 @@ export APPTAINERENV_TRITON_CACHE_DIR="/tmp/triton/cache/"  # change write/build
 APPTAINER_CMD="apptainer exec --fakeroot --writable-tmpfs ./vllm_rocm.sif "
 export APPTAINER_BINDPATH=/mnt/bb
 
+# Needed for gpt-oss-120b: directory holding the vocab file fetched in the README step
+export TIKTOKEN_RS_CACHE_DIR=./vocab_cache
+
 # starting head node (head node start also starts a worker)
 srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog_$SLURM_JOB_ID 2>&1 &
 

From c55ece312f2224c49856a899f4f71201c71c2208 Mon Sep 17 00:00:00 2001
From: Anthony Ramirez <ramirezat@ornl.gov>
Date: Fri, 22 May 2026 16:55:47 -0400
Subject: [PATCH 3/3] launchmultinode_lustre.sbatch: Add checks and
 configuration to allow running either gemma or gpt-oss at submit time.
 start_head_lustre.sh: Add capability to run model passed by batch config.
 testprompt_lustre.py: Add model detection. README.md: Add latest
 instructions

---
 frontier/sample_apps/vllm/README.md           |  2 +-
 .../vllm/launchmultinode_lustre.sbatch        | 19 ++++++++++++++++++-
 .../sample_apps/vllm/start_head_lustre.sh     |  8 +++++---
 .../sample_apps/vllm/testprompt_lustre.py     |  5 +++--
 4 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md
index 1fb07ef..9517793 100644
--- a/frontier/sample_apps/vllm/README.md
+++ b/frontier/sample_apps/vllm/README.md
@@ -49,7 +49,7 @@ TIKTOKEN_RS_CACHE_DIR=./vocab_cache apptainer exec vllm_rocm.sif python -c 'from
 Submit the job with
 ```
 # running the model directly from Lustre
-sbatch launchmultinode_lustre.sbatch
+sbatch launchmultinode_lustre.sbatch [gpt-oss-120b | gemma-4-31B-it]
 
 # copying the model to burst buffer first before running
 sbatch launchmultinode_bb.sbatch
diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
index c9502c2..e5479a6 100644
--- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
+++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
@@ -5,6 +5,23 @@
 #SBATCH -o logs/vllmmult_%j.out
 #SBATCH -C nvme
 
+MODEL_NAME="$1"
+SELECTED_MODEL="gemma-4-31B-it"
+
+if [ "$#" -ne 1 ]; then
+	echo "No model provided. Defaulting to gemma."
+	MODEL_NAME="gemma"
+elif echo "$MODEL_NAME" | grep -iqE "gpt"; then
+    echo "Detected GPT keyword. Using 'gpt-oss-120b'."
+    SELECTED_MODEL="gpt-oss-120b"
+elif echo "$MODEL_NAME" | grep -iqE "gemma"; then
+	echo "Detected Gemma keyword. Using 'gemma-4-31B-it'."
+fi
+
+if [ ! -d "$SELECTED_MODEL" ]; then
+	echo "Model not downloaded. Please download it and try again."
+	exit 1
+fi
 
 export HF_HOME=/mnt/bb/$USER # Make sure nvme is in use with . Change this location if you want.
 
@@ -21,7 +38,7 @@ export APPTAINER_BINDPATH=/mnt/bb
 export TIKTOKEN_RS_CACHE_DIR=./vocab_cache
 
 # starting head node (head node start also starts a worker)
-srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog_$SLURM_JOB_ID 2>&1 &
+srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES $SELECTED_MODEL > logs/headnodelog_$SLURM_JOB_ID 2>&1 &
 
 # starting workers on other nodes 
 srun -N3 -n3  --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog_$SLURM_JOB_ID 2>&1 &
diff --git a/frontier/sample_apps/vllm/start_head_lustre.sh b/frontier/sample_apps/vllm/start_head_lustre.sh
index 9a57dd1..aa1c144 100755
--- a/frontier/sample_apps/vllm/start_head_lustre.sh
+++ b/frontier/sample_apps/vllm/start_head_lustre.sh
@@ -1,14 +1,16 @@
 #!/bin/bash
 
-NNODES=$@
+NNODES=$1
+RUN_MODEL="$2"
 export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS
 export VLLM_HOST_IP=$(hostname -I | awk '{print $2}')
+
 echo "VLLM_HOST_IP: $VLLM_HOST_IP"
-ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 
+ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379
 
 sleep 10
 echo "head node: slurm nnodes - $NNODES"
 
 ray status
 
-vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./gemma-4-31B-it" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75 --enforce-eager
\ No newline at end of file
+vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./$RUN_MODEL" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75
\ No newline at end of file
diff --git a/frontier/sample_apps/vllm/testprompt_lustre.py b/frontier/sample_apps/vllm/testprompt_lustre.py
index fc0e41b..5a2f285 100644
--- a/frontier/sample_apps/vllm/testprompt_lustre.py
+++ b/frontier/sample_apps/vllm/testprompt_lustre.py
@@ -22,10 +22,11 @@
 
 while True:
     try:
+        model = client.models.list().data[0].id
         start = time.time()
         for test_prompt in test_prompts:
             completion = client.chat.completions.create(
-                model=f"./gemma-4-31B-it",
+                model=f"{model}",
                 messages=[
                     {"role": "user", "content": test_prompt },
                 ],
@@ -45,4 +46,4 @@
         time.sleep(10)
     else:
         break
-print("Completion result:", [completion for completion in completions], flush=True)
\ No newline at end of file
+print("Completion result:", [completion for completion in completions], flush=True)