olcf · t-ramz · May 21, 2026 · May 22, 2026 · May 22, 2026
diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md
@@ -1,30 +1,55 @@
-# vLLM example running astrollama-2-7b-base_abstract 
+# vLLM multi-node example running gemma-4-31B-it and/or gpt-oss-120b
 
-Example showing how to get download AMD's vLLM container release and run it on Frontier
+Example showing how to build an Apptainer image from vLLM's ROCm container release and run it on Frontier.
 
-Download the vLLM container from Dockerhub:
+## Build the image
+
+Build the vLLM container image from the DockerHub release using the included definition file (`vllm_rocm.def`), which also installs `ray`:
 ```
-apptainer pull --disable-cache vllm_rocm.sif docker://docker.io/rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513
+apptainer build vllm_rocm.sif vllm_rocm.def
 ```
 
-Download the astrollama model
+## Download the model
+
+### `gemma-4-31B-it`
+Download the gemma-4 model
 ```
 module load git-lfs
 git lfs install
-git clone https://huggingface.co/AstroMLab/astrollama-2-7b-base_abstract
+git clone https://huggingface.co/google/gemma-4-31B-it
 ```
 
 If you plan on moving the model to the burst buffer first, then tar the model directory
 ```
-tar --use-compress-program="pigz -p 16" -cf astrollama-2-7b-base_abstract.tar.gz ./astrollama-2-7b-base_abstract/
+tar --use-compress-program="pigz -p 16" -cf gemma-4-31B-it.tar.gz ./gemma-4-31B-it/
 ```
 
+### `gpt-oss-120b`
+Download the GPT-OSS model
+```
+module load git-lfs
+git lfs install
+git clone https://huggingface.co/openai/gpt-oss-120b
+```
 
+If you plan on moving the model to the burst buffer first, then tar the model directory
+```
+tar --use-compress-program="pigz -p 16" -cf gpt-oss-120b.tar.gz ./gpt-oss-120b/
+```
+
+> [!NOTE]
+> `gpt-oss-120b` requires an additional step, which must be run from a login node.
+> Before submitting the job, run the following commands to fetch the vocab file:
+```bash
+mkdir vocab_cache
+TIKTOKEN_RS_CACHE_DIR=./vocab_cache apptainer exec vllm_rocm.sif python -c 'from openai_harmony import load_harmony_encoding; load_harmony_encoding("HarmonyGptOss")'
+```
 
+## Run inference
 Submit the job with
 ```
 # running the model directly from Lustre
-sbatch launchmultinode_lustre.sbatch
+sbatch launchmultinode_lustre.sbatch [gpt-oss-120b | gemma-4-31B-it]
 
 # copying the model to burst buffer first before running
 sbatch launchmultinode_bb.sbatch

diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch
@@ -5,6 +5,28 @@
 #SBATCH -o logs/vllmmult_%j.out
 #SBATCH -C nvme
 
+# The optional first argument selects the model. It is matched as a
+# case-insensitive keyword, so "GPT" and "gpt-oss-120b" both pick gpt-oss-120b.
+MODEL_NAME="$1"
+SELECTED_MODEL="gemma-4-31B-it"
+
+if [ "$#" -ne 1 ]; then
+    echo "No model provided. Defaulting to gemma."
+elif echo "$MODEL_NAME" | grep -iqE "gpt"; then
+    echo "Detected GPT keyword. Using 'gpt-oss-120b'."
+    SELECTED_MODEL="gpt-oss-120b"
+elif echo "$MODEL_NAME" | grep -iqE "gemma"; then
+    echo "Detected Gemma keyword. Using 'gemma-4-31B-it'."
+else
+    # Say so explicitly instead of falling back to the default without a trace.
+    echo "Unrecognized model '$MODEL_NAME'. Defaulting to 'gemma-4-31B-it'."
+fi
+
+# The model directory is looked up relative to the current working directory.
+if [ ! -d "$SELECTED_MODEL" ]; then
+    echo "Model '$SELECTED_MODEL' not downloaded. Please download it and try again."
+    exit 1
+fi
 
 export HF_HOME=/mnt/bb/$USER # Make sure nvme is in use with . Change this location if you want.
 
@@ -17,11 +39,14 @@ export APPTAINERENV_TRITON_CACHE_DIR="/tmp/triton/cache/"  # change write/build
 APPTAINER_CMD="apptainer exec --fakeroot --writable-tmpfs ./vllm_rocm.sif "
 export APPTAINER_BINDPATH=/mnt/bb
 
+# Needed for gpt-oss-120b
+export TIKTOKEN_RS_CACHE_DIR=./vocab_cache
+
 # starting head node (head node start also starts a worker)
-srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog 2>&1 &
+srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES "$SELECTED_MODEL" > logs/headnodelog_$SLURM_JOB_ID 2>&1 &
 
 # starting workers on other nodes 
-srun -N3 -n3  --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog 2>&1 &
+srun -N3 -n3  --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog_$SLURM_JOB_ID 2>&1 &
 
 $APPTAINER_CMD python3 ./testprompt_lustre.py
 duration=$(( SECONDS - start ))

diff --git a/frontier/sample_apps/vllm/start_head_lustre.sh b/frontier/sample_apps/vllm/start_head_lustre.sh
@@ -1,13 +1,22 @@
 #!/bin/bash
 
-NNODES=$@
+NNODES=$1
+# Fall back to the launcher's default model when no second argument is given,
+# so the serve path below never collapses to "./".
+RUN_MODEL="${2:-gemma-4-31B-it}"
+# Only pin HIP to the step's GPUs when Slurm actually reports them; exporting
+# an empty value could otherwise leave no GPU visible.
+if [ -n "${SLURM_STEP_GPUS:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$SLURM_STEP_GPUS"
+fi
 export VLLM_HOST_IP=$(hostname -I | awk '{print $2}')
+
 echo "VLLM_HOST_IP: $VLLM_HOST_IP"
-ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 
+ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379
 
 sleep 10
 echo "head node: slurm nnodes - $NNODES"
 
 ray status
 
-vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./astrollama-2-7b-base_abstract" --host 0.0.0.0 --port 8000 
+vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./$RUN_MODEL" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75
diff --git a/frontier/sample_apps/vllm/start_worker.sh b/frontier/sample_apps/vllm/start_worker.sh
@@ -1,6 +1,12 @@
 #!/bin/bash
 
+# Only pin HIP to the step's GPUs when Slurm actually reports them; exporting
+# an empty value could otherwise leave no GPU visible.
+if [ -n "${SLURM_STEP_GPUS:-}" ]; then
+    export HIP_VISIBLE_DEVICES="$SLURM_STEP_GPUS"
+fi
 export VLLM_HOST_IP=$(hostname -I | awk '{print $2}')
 echo "VLLM_HOST_IP: $VLLM_HOST_IP"
 HEAD_NODE_ADDR=$@
+
 ray start --node-ip-address $VLLM_HOST_IP --address=$HEAD_NODE_ADDR:6379 --block
diff --git a/frontier/sample_apps/vllm/testprompt_lustre.py b/frontier/sample_apps/vllm/testprompt_lustre.py
@@ -1,5 +1,5 @@
 from openai import OpenAI
-import os 
+import os
 import time
 import httpcore
 import httpx
@@ -13,10 +13,31 @@
     base_url=openai_api_base,
 )
 
+# Prompts sent one at a time to the chat completions endpoint.
+test_prompts = [
+    "In a few sentences, describe the theory of relativity.",
+    "Write a small Python script for calling a model named `./gemma-4-31B-it` using the `openai` Python library.",
+    "Describe the difference between GPT and LLAMA AI models.",
+]
+completions = []
+
 while True:
     try:
+        # Use the first model the server reports rather than a hard-coded path.
+        model = client.models.list().data[0].id
+        start = time.time()
+        # Skip prompts already answered so a retry does not duplicate results.
+        for test_prompt in test_prompts[len(completions):]:
+            completion = client.chat.completions.create(
+                model=model,
+                messages=[
+                    {"role": "user", "content": test_prompt},
+                ],
+                stream=False,
+            )
+            print("Completion result:", completion, flush=True)
+            print("Time since beginning:", time.time() - start, flush=True)
+            completions.append(completion)
-        completion = client.completions.create(model=f"./astrollama-2-7b-base_abstract",
-                                      prompt="The Magellanic Cloud is a")
     except httpcore.ConnectError:
         print("vllm server is not ready. Waiting 10 seconds...", flush=True)
         time.sleep(10)
@@ -28,4 +49,4 @@
         time.sleep(10)
     else:
         break
-print("Completion result:", completion, flush=True)
+print("Completion result:", completions, flush=True)
diff --git a/frontier/sample_apps/vllm/vllm_rocm.def b/frontier/sample_apps/vllm/vllm_rocm.def
@@ -0,0 +1,8 @@
+Bootstrap: docker
+From: vllm/vllm-openai-rocm:v0.21.0
+
+%post
+
+set -e  # abort the image build as soon as any command below fails
+
+pip3 install "ray[default]"  # add Ray (with its "default" extras) on top of the base image