diff --git a/frontier/sample_apps/vllm/README.md b/frontier/sample_apps/vllm/README.md index 9046d97..9517793 100644 --- a/frontier/sample_apps/vllm/README.md +++ b/frontier/sample_apps/vllm/README.md @@ -1,30 +1,55 @@ -# vLLM example running astrollama-2-7b-base_abstract +# vLLM multi-node example running gemma-4-31B-it and/or gpt-oss-120b -Example showing how to get download AMD's vLLM container release and run it on Frontier +Example showing how to build from vLLM's ROCm container release and run it on Frontier -Download the vLLM container from Dockerhub: +## Build the image + +Build the vLLM container from DockerHub using the included build spec, which includes `ray`: ``` -apptainer pull --disable-cache vllm_rocm.sif docker://docker.io/rocm/vllm:rocm6.3.1_vllm_0.8.5_20250513 +apptainer build vllm_rocm.sif vllm_rocm.def ``` -Download the astrollama model +## Download the model + +### `gemma-4-31B-it` +Download the gemma-4 model ``` module load git-lfs git lfs install -git clone https://huggingface.co/AstroMLab/astrollama-2-7b-base_abstract +git clone https://huggingface.co/google/gemma-4-31B-it ``` If you plan on moving the model to the burst buffer first, then tar the model directory ``` -tar --use-compress-program="pigz -p 16" -cf astrollama-2-7b-base_abstract.tar.gz ./astrollama-2-7b-base_abstract/ +tar --use-compress-program="pigz -p 16" -cf gemma-4-31B-it.tar.gz ./gemma-4-31B-it/ ``` +### `gpt-oss-120b` +Download the GPT-OSS model +``` +module load git-lfs +git lfs install +git clone https://huggingface.co/openai/gpt-oss-120b +``` +If you plan on moving the model to the burst buffer first, then tar the model directory +``` +tar --use-compress-program="pigz -p 16" -cf gpt-oss-120b.tar.gz ./gpt-oss-120b/ +``` + +> [!NOTE] +> `gpt-oss-120b` requires an additional step from login nodes. 
+Please additionally run the following commands to fetch the vocab file: +```bash +mkdir -p vocab_cache +TIKTOKEN_RS_CACHE_DIR=./vocab_cache apptainer exec vllm_rocm.sif python -c 'from openai_harmony import load_harmony_encoding; load_harmony_encoding("HarmonyGptOss")' +``` +## Run inference Submit the job with ``` # running the model directly from Lustre -sbatch launchmultinode_lustre.sbatch +sbatch launchmultinode_lustre.sbatch [gpt-oss-120b | gemma-4-31B-it] # copying the model to burst buffer first before running sbatch launchmultinode_bb.sbatch diff --git a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch index 9924d0b..e5479a6 100644 --- a/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch +++ b/frontier/sample_apps/vllm/launchmultinode_lustre.sbatch @@ -5,6 +5,23 @@ #SBATCH -o logs/vllmmult_%j.out #SBATCH -C nvme +MODEL_NAME="$1" # optional model keyword, matched case-insensitively below +SELECTED_MODEL="gemma-4-31B-it" # default model directory + +if ! echo "$MODEL_NAME" | grep -iqE "gpt|gemma"; then + echo "No recognized model provided ('$MODEL_NAME'). Defaulting to gemma." + MODEL_NAME="gemma" +elif echo "$MODEL_NAME" | grep -iqE "gpt"; then + echo "Detected GPT keyword. Using 'gpt-oss-120b'." + SELECTED_MODEL="gpt-oss-120b" +else + echo "Detected Gemma keyword. Using 'gemma-4-31B-it'." +fi + +if [ ! -d "$SELECTED_MODEL" ]; then + echo "Model directory './$SELECTED_MODEL' not found. Please download it and try again." + exit 1 +fi export HF_HOME=/mnt/bb/$USER # Make sure nvme is in use with . Change this location if you want. 
@@ -17,11 +34,14 @@ export APPTAINERENV_TRITON_CACHE_DIR="/tmp/triton/cache/" # change write/build APPTAINER_CMD="apptainer exec --fakeroot --writable-tmpfs ./vllm_rocm.sif " export APPTAINER_BINDPATH=/mnt/bb +# Needed for gpt-oss-120b: vocab cache pre-fetched on a login node (see README) +export TIKTOKEN_RS_CACHE_DIR=./vocab_cache + # starting head node (head node start also starts a worker) -srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES > logs/headnodelog 2>&1 & +srun -N1 -n1 -c56 -G8 -w $HEAD_NODE_ADDR $APPTAINER_CMD ./start_head_lustre.sh $SLURM_NNODES "$SELECTED_MODEL" > logs/headnodelog_$SLURM_JOB_ID 2>&1 & # starting workers on other nodes -srun -N3 -n3 --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog 2>&1 & +srun -N$((SLURM_NNODES - 1)) -n$((SLURM_NNODES - 1)) --tasks-per-node=1 --cpus-per-task=56 --gpus-per-task=8 -x $HEAD_NODE_ADDR $APPTAINER_CMD ./start_worker.sh $HEAD_NODE_ADDR > logs/workernodeslog_$SLURM_JOB_ID 2>&1 & $APPTAINER_CMD python3 ./testprompt_lustre.py duration=$(( SECONDS - start )) diff --git a/frontier/sample_apps/vllm/start_head_lustre.sh b/frontier/sample_apps/vllm/start_head_lustre.sh index 573cd39..aa1c144 100755 --- a/frontier/sample_apps/vllm/start_head_lustre.sh +++ b/frontier/sample_apps/vllm/start_head_lustre.sh @@ -1,13 +1,16 @@ #!/bin/bash -NNODES=$@ +NNODES=$1 +RUN_MODEL="$2" +export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS export VLLM_HOST_IP=$(hostname -I | awk '{print $2}') + echo "VLLM_HOST_IP: $VLLM_HOST_IP" -ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 +ray start --node-ip-address=$VLLM_HOST_IP --head --port=6379 sleep 10 echo "head node: slurm nnodes - $NNODES" ray status -vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 --pipeline-parallel-size $NNODES --distributed-executor-backend ray "./astrollama-2-7b-base_abstract" --host 0.0.0.0 --port 8000 +vllm serve --chat-template "./chattemplate.jinja" --tensor-parallel-size 8 
--pipeline-parallel-size "$NNODES" --distributed-executor-backend ray "./$RUN_MODEL" --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.75 diff --git a/frontier/sample_apps/vllm/start_worker.sh b/frontier/sample_apps/vllm/start_worker.sh index b2f65c0..3da43ee 100755 --- a/frontier/sample_apps/vllm/start_worker.sh +++ b/frontier/sample_apps/vllm/start_worker.sh @@ -1,6 +1,8 @@ #!/bin/bash +export HIP_VISIBLE_DEVICES=$SLURM_STEP_GPUS export VLLM_HOST_IP=$(hostname -I | awk '{print $2}') echo "VLLM_HOST_IP: $VLLM_HOST_IP" HEAD_NODE_ADDR=$@ + ray start --node-ip-address $VLLM_HOST_IP --address=$HEAD_NODE_ADDR:6379 --block diff --git a/frontier/sample_apps/vllm/testprompt_lustre.py b/frontier/sample_apps/vllm/testprompt_lustre.py index 2239e04..5a2f285 100644 --- a/frontier/sample_apps/vllm/testprompt_lustre.py +++ b/frontier/sample_apps/vllm/testprompt_lustre.py @@ -1,5 +1,5 @@ from openai import OpenAI -import os +import os import time import httpcore import httpx @@ -13,10 +13,28 @@ base_url=openai_api_base, ) +test_prompts = [ + "In a few sentences, describe the theory of relativity.", + "Write a small Python script for calling a model named `./gemma-4-31B-it` using the `openai` Python library.", + "Describe the difference between GPT and LLAMA AI models.", +] + while True: try: - completion = client.completions.create(model=f"./astrollama-2-7b-base_abstract", - prompt="The Magellanic Cloud is a") + model = client.models.list().data[0].id # use the first model the server reports + start = time.time() + completions = [] # reset on every attempt so a retry cannot duplicate results + for test_prompt in test_prompts: + completion = client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": test_prompt}, + ], + stream=False, + ) + print("Completion result:", completion, flush=True) + print("Time since beginning:", time.time() - start, flush=True) + completions.append(completion) except httpcore.ConnectError: print("vllm server is not ready. 
Waiting 10 seconds...", flush=True) time.sleep(10) @@ -28,4 +46,4 @@ time.sleep(10) else: break -print("Completion result:", completion, flush=True) +print("Completion result:", completions, flush=True) diff --git a/frontier/sample_apps/vllm/vllm_rocm.def b/frontier/sample_apps/vllm/vllm_rocm.def new file mode 100644 index 0000000..bea701b --- /dev/null +++ b/frontier/sample_apps/vllm/vllm_rocm.def @@ -0,0 +1,8 @@ +Bootstrap: docker +From: vllm/vllm-openai-rocm:v0.21.0 + +%post + +set -e + +pip3 install "ray[default]"