diff --git a/experiments/inference-scheduling-guidellm.yaml b/experiments/inference-scheduling-guidellm.yaml
new file mode 100644
index 00000000..8ba23d28
--- /dev/null
+++ b/experiments/inference-scheduling-guidellm.yaml
@@ -0,0 +1,37 @@
+setup:
+  factors:
+    - LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE
+  levels:
+    LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: "inf-sche-none.yaml, inf-sche-prefix.yaml, inf-sche-kv.yaml, inf-sche-queue.yaml"
+    # LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE: "inf-sche-prefix.yaml, inf-sche-kv.yaml, inf-sche-queue.yaml"
+  treatments:
+    - "inf-sche-none.yaml"
+    - "inf-sche-prefix.yaml"
+    - "inf-sche-kv.yaml"
+    - "inf-sche-queue.yaml"
+run:
+  factors:
+    - prompt_tokens
+    - output_tokens
+  treatments:
+    - "100,100"
+    - "100,300"
+    - "100,1000"
+    - "300,100"
+    - "300,300"
+    - "300,1000"
+    - "1000,100"
+    - "1000,300"
+    - "1000,1000"
+  # levels:
+  #   data: "prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048,prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=300REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=300REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048, prompt_tokens=300REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048, prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
+  # treatments:
+  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=1000REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=300REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=300REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=300REPLACE_COMMAoutput_tokens=1000REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=100REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=300REPLACE_COMMAprefix_tokens=2048"
+  #   - "prompt_tokens=100REPLACE_COMMAoutput_tokens=1000REPLACE_COMMAprefix_tokens=2048"
\ No newline at end of file
diff --git a/experiments/pd-disaggregation-guidellm.yaml b/experiments/pd-disaggregation-guidellm.yaml
new file mode 100644
index 00000000..f81496aa
--- /dev/null
+++ b/experiments/pd-disaggregation-guidellm.yaml
@@ -0,0 +1,40 @@
+setup:
+  factors:
+    - LLMDBENCH_DEPLOY_METHODS
+    - LLMDBENCH_VLLM_COMMON_REPLICAS
+    - LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM
+    - LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS
+    - LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM
+    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS
+    - LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM
+  levels:
+    LLMDBENCH_VLLM_COMMON_REPLICAS: "2,4"
+    LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM: "8"
+    LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS: "2,4,6,8"
+    LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM: "1,2"
+    LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS: "1,2,4"
+    LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM: "2,4,8"
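+  # Each treatment below is a comma-separated value tuple following the factor order above
+  # (my reading of the factor/level layout, not authoritative); e.g. "modelservice,NA,NA,1,8,1,8"
+  # selects DEPLOY_METHODS=modelservice, leaves the common REPLICAS/TENSOR_PARALLELISM unset (NA),
+  # and asks for 1 prefill replica at TP=8 plus 1 decode replica at TP=8.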
"modelservice,NA,NA,3,4,1,4" + - "standalone,1,2,NA,NA,NA,NA" + - "standalone,1,4,NA,NA,NA,NA" + - "standalone,1,8,NA,NA,NA,NA" +run: + factors: + - rate + - data_samples + levels: + rate: "1,8,32,64,128,256" + data_samples: "10,80,320,640,1280,2560" + treatments: + - "1,10" + - "8,80" + - "32,320" + - "64,640" + - "128,1280" + - "256,2560" + diff --git a/scenarios/guides/inference-scheduling-guidellm.sh b/scenarios/guides/inference-scheduling-guidellm.sh new file mode 100644 index 00000000..c54d59aa --- /dev/null +++ b/scenarios/guides/inference-scheduling-guidellm.sh @@ -0,0 +1,125 @@ +# INFERENCE SCHEDULING WELL LIT PATH +# Based on https://github.com/llm-d/llm-d/tree/main/guides/inference-scheduling +# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG +# Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware. +# Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed. + +# IMPORTANT NOTE +# All parameters not defined here or exported externally will be the default values found in setup/env.sh +# Many commonly defined values were left blank (default) so that this scenario is applicable to as many environments as possible. + +# Model parameters +export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B" +#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m" +# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct" +#export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct" + +# PVC parameters +# Storage class (leave uncommented to automatically detect the "default" storage class) +#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx +#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=shared-vast +#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs +export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti + +# Routing configuration (via gaie) +#export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="default-plugins.yaml" # already the default + +# Routing configuration (via modelservice) +export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false") + +# Affinity to select node with appropriate accelerator (leave uncommented to automatically detect GPU... 
+
+# Routing configuration (via modelservice)
+export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")
+
+# Affinity to select node with appropriate accelerator (leave commented out to automatically detect GPU... WILL WORK FOR OpenShift, Kubernetes and GKE)
+export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
+# export LLMDBENCH_VLLM_COMMON_AFFINITY=kubernetes.io/hostname:pokstg-b64r39s1 # OpenShift
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S # OpenShift
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB # OpenShift
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu # ANY GPU (useful for Minikube)
+
+# Uncomment to request specific network devices
+#####export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
+#######export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
+#export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4
+
+# Common parameters across standalone and llm-d (prefill and decode) pods
+export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
+export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=64
+
+export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
+cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
+- name: UCX_TLS
+  value: "rc,sm,cuda_ipc,cuda_copy,tcp"
+- name: UCX_SOCKADDR_TLS_PRIORITY
+  value: "tcp"
+###- name: UCX_NET_DEVICES
+###  value: mlx5_1:1
+###- name: NCCL_IB_HCA
+###  value: mlx5_1
+- name: VLLM_NIXL_SIDE_CHANNEL_PORT
+  value: "REPLACE_ENV_LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT"
+- name: VLLM_NIXL_SIDE_CHANNEL_HOST
+  valueFrom:
+    fieldRef:
+      fieldPath: status.podIP
+- name: VLLM_LOGGING_LEVEL
+  value: DEBUG
+- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+  value: "1"
+EOF
+
+export LLMDBENCH_VLLM_COMMON_EXTRA_CONTAINER_CONFIG=$(mktemp)
+cat << EOF > ${LLMDBENCH_VLLM_COMMON_EXTRA_CONTAINER_CONFIG}
+ports:
+  - containerPort: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT
+    protocol: TCP
+  - containerPort: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_METRICS_PORT
+    name: metrics
+    protocol: TCP
+EOF
+
+# Prefill parameters: 0 prefill pods
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=0
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_ACCELERATOR_NR=0
+
+# Decode parameters: 2 decode pods
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=16
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=64Gi
+# Uncomment (###) the following line to enable multi-nic
+###export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
+# Uncomment (#####) the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
+#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
+#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=1
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=vllmServe
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS="[\
+--enforce-eager____\
+--block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
+--kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
+--disable-log-requests____\
+--disable-uvicorn-access-log____\
+--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
+]"
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_ACCELERATOR_NR=2
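+# For reference, reading ____ as the argument separator and resolving the REPLACE_ENV tokens
+# (my interpretation of the template, not authoritative), each decode pod should launch roughly:
+#   vllm serve Qwen/Qwen3-0.6B --enforce-eager --block-size 64 \
+#     --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
+#     --disable-log-requests --disable-uvicorn-access-log --max-model-len 16000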
+
+# Workload parameters
+export LLMDBENCH_HARNESS_NAME=guidellm
+export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=shared_prefix_synthetic.yaml
+
+# Local directory to copy benchmark runtime files and results
+export LLMDBENCH_CONTROL_WORK_DIR=~/data/inference-scheduling/guidellm
+
+# My stuff
+export LLMDBENCH_IMAGE_REGISTRY=quay.io
+export LLMDBENCH_IMAGE_REPO=jgchen
+export LLMDBENCH_IMAGE_NAME=llm-d-benchmark
+export LLMDBENCH_IMAGE_TAG=0.0.25
+
+# export LLMDBENCH_IMAGE_REGISTRY="quay.io"
+# export LLMDBENCH_IMAGE_REPO="rh_ee_smonson"
+# export LLMDBENCH_IMAGE_NAME="llm-d-benchmark"
+# export LLMDBENCH_IMAGE_TAG="0.3.0-amd64"
+
+export LLMDBENCH_VLLM_COMMON_NAMESPACE=jchen
+export LLMDBENCH_HARNESS_NAMESPACE=jchen
diff --git a/scenarios/guides/inference-scheduling.sh b/scenarios/guides/inference-scheduling.sh
index 91c0fe42..fdea9116 100644
--- a/scenarios/guides/inference-scheduling.sh
+++ b/scenarios/guides/inference-scheduling.sh
@@ -9,9 +9,9 @@
 # Many commonly defined values were left blank (default) so that this scenario is applicable to as many environments as possible.
 
 # Model parameters
-#export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
+export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
 #export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
-export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
+# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
 #export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
 
 # PVC parameters
@@ -28,7 +28,7 @@ export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
 export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")
 
 # Affinity to select node with appropriate accelerator (leave uncommented to automatically detect GPU... WILL WORK FOR OpenShift, Kubernetes and GKE)
-#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
+export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
 #export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
 #export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
 #export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
@@ -108,3 +108,12 @@ export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=shared_prefix_synthetic.yaml
 
 # Local directory to copy benchmark runtime files and results
 export LLMDBENCH_CONTROL_WORK_DIR=~/data/inference-scheduling
+
+# My stuff
+export LLMDBENCH_IMAGE_REGISTRY=quay.io
+export LLMDBENCH_IMAGE_REPO=jgchen
+export LLMDBENCH_IMAGE_NAME=llm-d-benchmark
+export LLMDBENCH_IMAGE_TAG=0.0.25
+
+export LLMDBENCH_VLLM_COMMON_NAMESPACE=jchen
+export LLMDBENCH_HARNESS_NAMESPACE=jchen
diff --git a/scenarios/guides/pd-disaggregation-guidellm.sh b/scenarios/guides/pd-disaggregation-guidellm.sh
new file mode 100644
index 00000000..5dd7500c
--- /dev/null
+++ b/scenarios/guides/pd-disaggregation-guidellm.sh
@@ -0,0 +1,150 @@
+# P/D DISAGGREGATION WELL LIT PATH
+# Based on https://github.com/llm-d/llm-d/tree/main/guides/pd-disaggregation
+# Removed pod monitoring; can be added using LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG
+# Removed extra volumes metrics-volume and torch-compile-volume; they are not needed for this model and tested hardware.
+# Use LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUME_MOUNTS and LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES to add them if needed.
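+# A minimal sketch of that mechanism (hypothetical values; both variables point at files
+# holding YAML fragments, mirroring the heredoc pattern used further down in this scenario):
+#   export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES=$(mktemp)
+#   cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_VOLUMES}
+#   - name: metrics-volume
+#     emptyDir: {}
+#   EOF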
+
+# IMPORTANT NOTE
+# All parameters not defined here or exported externally will be the default values found in setup/env.sh
+# Many commonly defined values were left blank (default) so that this scenario is applicable to as many environments as possible.
+
+# Model parameters
+#export LLMDBENCH_DEPLOY_MODEL_LIST="Qwen/Qwen3-0.6B"
+#export LLMDBENCH_DEPLOY_MODEL_LIST="facebook/opt-125m"
+# export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-8B-Instruct"
+export LLMDBENCH_DEPLOY_MODEL_LIST="meta-llama/Llama-3.1-70B-Instruct"
+
+# PVC parameters
+# Storage class (leave commented out to automatically detect the "default" storage class)
+#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=standard-rwx
+#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=shared-vast
+#export LLMDBENCH_VLLM_COMMON_PVC_STORAGE_CLASS=ocs-storagecluster-cephfs
+export LLMDBENCH_VLLM_COMMON_PVC_MODEL_CACHE_SIZE=1Ti
+
+# Routing configuration (via gaie)
+#export LLMDBENCH_VLLM_MODELSERVICE_GAIE_PLUGINS_CONFIGFILE="default-plugins.yaml" # already the default
+
+# Routing configuration (via modelservice)
+export LLMDBENCH_VLLM_MODELSERVICE_INFERENCE_MODEL=true # (default is "false")
+# export LLMDBENCH_LLMD_ROUTINGSIDECAR_CONNECTOR=nixlv2 # already the default
+
+# Affinity to select node with appropriate accelerator (leave commented out to automatically detect GPU... WILL WORK FOR OpenShift, Kubernetes and GKE)
+export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3 # OpenShift
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=gpu.nvidia.com/model:H200 # Kubernetes
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-tesla-a100 # GKE
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=cloud.google.com/gke-accelerator:nvidia-h100-80gb # GKE
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-L40S # OpenShift
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu.product:NVIDIA-A100-SXM4-80GB # OpenShift
+#export LLMDBENCH_VLLM_COMMON_AFFINITY=nvidia.com/gpu # ANY GPU (useful for Minikube)
+
+# Uncomment to request specific network devices
+#####export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/roce_gdr
+#######export LLMDBENCH_VLLM_COMMON_NETWORK_RESOURCE=rdma/ib
+#export LLMDBENCH_VLLM_COMMON_NETWORK_NR=4
+
+# Uncomment to use hostNetwork (only ONE POD PER NODE)
+#export LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG=$(mktemp)
+#cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_EXTRA_POD_CONFIG}
+#  hostNetwork: true
+#  dnsPolicy: ClusterFirstWithHostNet
+#EOF
+
+# Common parameters across standalone and llm-d (prefill and decode) pods
+export LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN=16000
+export LLMDBENCH_VLLM_COMMON_BLOCK_SIZE=128
+
+# Uncomment (###) to select additional network devices (e.g., when multi-nic is enabled)
+export LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML=$(mktemp)
+cat << EOF > $LLMDBENCH_VLLM_COMMON_ENVVARS_TO_YAML
+- name: UCX_TLS
+  value: "rc,sm,cuda_ipc,cuda_copy,tcp"
+- name: UCX_SOCKADDR_TLS_PRIORITY
+  value: "tcp"
+###- name: UCX_NET_DEVICES
+###  value: mlx5_1:1
+###- name: NCCL_IB_HCA
+###  value: mlx5_1
+- name: VLLM_NIXL_SIDE_CHANNEL_PORT
+  value: "REPLACE_ENV_LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT"
+- name: VLLM_NIXL_SIDE_CHANNEL_HOST
+  valueFrom:
+    fieldRef:
+      fieldPath: status.podIP
+- name: VLLM_LOGGING_LEVEL
+  value: DEBUG
+- name: VLLM_ALLOW_LONG_MAX_MODEL_LEN
+  value: "1"
+EOF
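+# As I read the harness templating, each REPLACE_ENV_<NAME> token above is substituted with
+# the value of environment variable <NAME> when the fragment is rendered; e.g. with
+# LLMDBENCH_VLLM_COMMON_NIXL_SIDE_CHANNEL_PORT=5557 (illustrative value only) the port entry
+# would render as:
+#   - name: VLLM_NIXL_SIDE_CHANNEL_PORT
+#     value: "5557"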
+
+# Prefill parameters
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM=1
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS=2
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_NR=32
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_CPU_MEM=128Gi
+# Uncomment (###) the following line to enable multi-nic
+###export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_PODANNOTATIONS=deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
+# Uncomment (#####) the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
+#####export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_RESOURCE=rdma/roce_gdr
+#####export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_NETWORK_NR=1
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_MODEL_COMMAND=vllmServe
+export LLMDBENCH_VLLM_MODELSERVICE_PREFILL_EXTRA_ARGS="[\
+--block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
+--kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
+--disable-log-requests____\
+--disable-uvicorn-access-log____\
+--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
+]"
+
+# Decode parameters
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM=1
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS=2
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_NR=32
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_CPU_MEM=128Gi
+# Uncomment (###) the following line to enable multi-nic
+###export LLMDBENCH_VLLM_MODELSERVICE_DECODE_PODANNOTATIONS=deployed-by:$LLMDBENCH_CONTROL_USERNAME,modelservice:llm-d-benchmark,k8s.v1.cni.cncf.io/networks:multi-nic-compute
+# Uncomment (#####) the following two lines to enable roce/gdr (or switch to rdma/ib for infiniband)
+#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_RESOURCE=rdma/roce_gdr
+#####export LLMDBENCH_VLLM_MODELSERVICE_DECODE_NETWORK_NR=1
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_MODEL_COMMAND=vllmServe
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_ARGS="[\
+--block-size____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_BLOCK_SIZE____\
+--kv-transfer-config____'{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}'____\
+--disable-log-requests____\
+--disable-uvicorn-access-log____\
+--max-model-len____REPLACE_ENV_LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN\
+]"
+
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS=$(mktemp)
+cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUME_MOUNTS}
+- name: dshm
+  mountPath: /dev/shm
+EOF
+
+export LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES=$(mktemp)
+cat << EOF > ${LLMDBENCH_VLLM_MODELSERVICE_DECODE_EXTRA_VOLUMES}
+- name: dshm
+  emptyDir:
+    medium: Memory
+    sizeLimit: REPLACE_ENV_LLMDBENCH_VLLM_COMMON_SHM_MEM
+EOF
+
+# Timeout for benchmark operations
+export LLMDBENCH_CONTROL_WAIT_TIMEOUT=900000
+export LLMDBENCH_HARNESS_WAIT_TIMEOUT=900000
+
+# Workload parameters
+export LLMDBENCH_HARNESS_EXPERIMENT_PROFILE=random_concurrent.yaml
+export LLMDBENCH_HARNESS_NAME=guidellm
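+# Note: random_concurrent.yaml is rendered from workload/profiles/guidellm/random_concurrent.yaml.in
+# (added at the end of this change) and uses guidellm's "concurrent" profile; my expectation is
+# that the rate/data_samples treatments in experiments/pd-disaggregation-guidellm.yaml override
+# its defaults on each run.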
+
+# Local directory to copy benchmark runtime files and results
+export LLMDBENCH_CONTROL_WORK_DIR=~/data/pd-disaggregation/guidellm
+
+
+# My stuff
+export LLMDBENCH_IMAGE_REGISTRY=quay.io
+export LLMDBENCH_IMAGE_REPO=jgchen
+export LLMDBENCH_IMAGE_NAME=llm-d-benchmark
+export LLMDBENCH_IMAGE_TAG=0.0.24
+
+export LLMDBENCH_VLLM_COMMON_NAMESPACE=jchen2
+export LLMDBENCH_HARNESS_NAMESPACE=jchen2
\ No newline at end of file
diff --git a/setup/functions.sh b/setup/functions.sh
index a9886283..c0266f4a 100755
--- a/setup/functions.sh
+++ b/setup/functions.sh
@@ -1197,4 +1197,4 @@ function user_has_hf_model_access {
   case "$http_code" in 200) return 0 ;; 401|403) return 1 ;; *) return 2 ;; esac
 }
 
-export -f user_has_hf_model_access
+export -f user_has_hf_model_access
\ No newline at end of file
diff --git a/setup/presets/gaie/inf-sche-kv.yaml b/setup/presets/gaie/inf-sche-kv.yaml
index ab1c5cd8..d3e972a6 100644
--- a/setup/presets/gaie/inf-sche-kv.yaml
+++ b/setup/presets/gaie/inf-sche-kv.yaml
@@ -3,7 +3,7 @@
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
 plugins:
-- type: kv-cache-scorer
+- type: kv-cache-utilization-scorer
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
@@ -12,5 +12,5 @@ schedulingProfiles:
 - name: default
   plugins:
   - pluginRef: decode-filter
  - pluginRef: max-score-picker
-  - pluginRef: kv-cache-scorer
+  - pluginRef: kv-cache-utilization-scorer
     weight: 1
\ No newline at end of file
diff --git a/setup/presets/gaie/inf-sche-prefix-kv-queue.yaml b/setup/presets/gaie/inf-sche-prefix-kv-queue.yaml
index 37b70b5b..4876ff99 100644
--- a/setup/presets/gaie/inf-sche-prefix-kv-queue.yaml
+++ b/setup/presets/gaie/inf-sche-prefix-kv-queue.yaml
@@ -7,7 +7,7 @@ plugins:
 - type: decode-filter
 - type: max-score-picker
 - type: single-profile-handler
-- type: kv-cache-scorer
+- type: kv-cache-utilization-scorer
 - type: queue-cache-scorer
 schedulingProfiles:
 - name: default
@@ -16,7 +16,7 @@
   - pluginRef: max-score-picker
   - pluginRef: prefix-cache-scorer
     weight: 1
-  - pluginRef: kv-cache-scorer
+  - pluginRef: kv-cache-utilization-scorer
     weight: 1
   - pluginRef: queue-scorer
     weight: 1
\ No newline at end of file
diff --git a/workload/profiles/guidellm/random_concurrent.yaml.in b/workload/profiles/guidellm/random_concurrent.yaml.in
new file mode 100644
index 00000000..44fae7aa
--- /dev/null
+++ b/workload/profiles/guidellm/random_concurrent.yaml.in
@@ -0,0 +1,9 @@
+target: REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
+model: REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL
+request_type: text_completions
+profile: concurrent
+rate: 1
+data_samples: 10
+data:
+  prompt_tokens: 10000
+  output_tokens: 1000
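+# For illustration only (endpoint and model values assumed, not part of the template): after
+# REPLACE_ENV substitution a rendered profile for the pd-disaggregation scenario might read:
+#   target: http://<stack-endpoint>/v1
+#   model: meta-llama/Llama-3.1-70B-Instruct
+#   request_type: text_completions
+#   profile: concurrent
+#   rate: 1
+#   data_samples: 10
+#   data:
+#     prompt_tokens: 10000
+#     output_tokens: 1000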