diff --git a/.github/workflows/build-mock-images.yml b/.github/workflows/build-mock-images.yml
new file mode 100644
index 000000000..99f6c49d1
--- /dev/null
+++ b/.github/workflows/build-mock-images.yml
@@ -0,0 +1,69 @@
+name: Build and Push Mock Images
+
+# Rebuilds the mock inference-engine images whenever their sources (or this
+# workflow) change on main; it can also be started manually.
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - hack/mock-images/**
+      - .github/workflows/build-mock-images.yml
+  workflow_dispatch:
+
+jobs:
+  build-vllm-mock:
+    # The images are pushed to ghcr.io/volcano-sh, so skip the job on forks.
+    if: github.repository_owner == 'volcano-sh'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push vllm-mock
+        uses: docker/build-push-action@v5
+        with:
+          context: hack/mock-images/vllm-mock
+          push: true
+          # Publish an immutable per-commit tag next to the moving `latest` tag.
+          tags: |
+            ghcr.io/volcano-sh/vllm-mock:latest
+            ghcr.io/volcano-sh/vllm-mock:${{ github.sha }}
+
+  build-sglang-mock:
+    # The images are pushed to ghcr.io/volcano-sh, so skip the job on forks.
+    if: github.repository_owner == 'volcano-sh'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push sglang-mock
+        uses: docker/build-push-action@v5
+        with:
+          context: hack/mock-images/sglang-mock
+          push: true
+          # Publish an immutable per-commit tag next to the moving `latest` tag.
+          tags: |
+            ghcr.io/volcano-sh/sglang-mock:latest
+            ghcr.io/volcano-sh/sglang-mock:${{ github.sha }}
diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml index ff30fb59a..5f8fd5cee 100644 --- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml +++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -49,7 +49,7 @@ spec: spec: containers: - name: llm-engine - image: 
ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml index 03b9e9fa5..ae7c77cda 100644 --- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml +++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml index faa8c76fd..9951505d7 100644 --- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml +++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml index 3a4d8407b..8741b17af 100644 --- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml +++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml @@ -2,7 +2,7 @@ # The mock server will return a fixed response for any input. # You can use this mock server to test the inference router without deploying a real LLM server. # -# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`. +# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`. 
# Move the image to kthena registry once it's public. apiVersion: apps/v1 @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml index ff30fb59a..5f8fd5cee 100644 --- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml +++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -49,7 +49,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml index 03b9e9fa5..ae7c77cda 100644 --- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml +++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml 
b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml index faa8c76fd..9951505d7 100644 --- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml +++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml index 3a4d8407b..8741b17af 100644 --- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml +++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml @@ -2,7 +2,7 @@ # The mock server will return a fixed response for any input. # You can use this mock server to test the inference router without deploying a real LLM server. # -# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`. +# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`. # Move the image to kthena registry once it's public. 
apiVersion: apps/v1 @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml index ff30fb59a..5f8fd5cee 100644 --- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml +++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -49,7 +49,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml index 03b9e9fa5..ae7c77cda 100644 --- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml +++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml index 
faa8c76fd..9951505d7 100644 --- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml +++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml index 3a4d8407b..8741b17af 100644 --- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml +++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml @@ -2,7 +2,7 @@ # The mock server will return a fixed response for any input. # You can use this mock server to test the inference router without deploying a real LLM server. # -# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`. +# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`. # Move the image to kthena registry once it's public. 
apiVersion: apps/v1 @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml index ff30fb59a..5f8fd5cee 100644 --- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml +++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -49,7 +49,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml index 03b9e9fa5..ae7c77cda 100644 --- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml +++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml index 
faa8c76fd..9951505d7 100644 --- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml +++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml index 3a4d8407b..8741b17af 100644 --- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml +++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml @@ -2,7 +2,7 @@ # The mock server will return a fixed response for any input. # You can use this mock server to test the inference router without deploying a real LLM server. # -# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`. +# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`. # Move the image to kthena registry once it's public. 
apiVersion: apps/v1 @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml index ff30fb59a..5f8fd5cee 100644 --- a/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml +++ b/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -49,7 +49,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/examples/kthena-router/LLM-Mock-ds1.5b.yaml index 03b9e9fa5..ae7c77cda 100644 --- a/examples/kthena-router/LLM-Mock-ds1.5b.yaml +++ b/examples/kthena-router/LLM-Mock-ds1.5b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/examples/kthena-router/LLM-Mock-ds7b.yaml b/examples/kthena-router/LLM-Mock-ds7b.yaml index faa8c76fd..9951505d7 100644 --- a/examples/kthena-router/LLM-Mock-ds7b.yaml +++ b/examples/kthena-router/LLM-Mock-ds7b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/examples/kthena-router/LLM-Mock.yaml b/examples/kthena-router/LLM-Mock.yaml index 3a4d8407b..8741b17af 100644 --- 
a/examples/kthena-router/LLM-Mock.yaml +++ b/examples/kthena-router/LLM-Mock.yaml @@ -2,7 +2,7 @@ # The mock server will return a fixed response for any input. # You can use this mock server to test the inference router without deploying a real LLM server. # -# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`. +# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`. # Move the image to kthena registry once it's public. apiVersion: apps/v1 @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml b/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml index 71026345d..d385ab500 100644 --- a/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml +++ b/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: leader - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -39,7 +39,7 @@ spec: spec: containers: - name: leader - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/hack/mock-images/sglang-mock/Dockerfile b/hack/mock-images/sglang-mock/Dockerfile new file mode 100644 index 000000000..7298e1f01 --- /dev/null +++ b/hack/mock-images/sglang-mock/Dockerfile @@ -0,0 +1,10 @@ +FROM python:3.11-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+
+CMD ["python", "sglang_app.py"]
diff --git a/hack/mock-images/sglang-mock/requirements.txt b/hack/mock-images/sglang-mock/requirements.txt
new file mode 100644
index 000000000..7e1060246
--- /dev/null
+++ b/hack/mock-images/sglang-mock/requirements.txt
@@ -0,0 +1 @@
+flask
diff --git a/hack/mock-images/sglang-mock/sglang_app.py b/hack/mock-images/sglang-mock/sglang_app.py
new file mode 100644
index 000000000..fe72f4da4
--- /dev/null
+++ b/hack/mock-images/sglang-mock/sglang_app.py
@@ -0,0 +1,90 @@
+# Copyright The Volcano Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SGLang metrics mock server for kthena runtime verification.
+Exposes Prometheus metrics at /metrics on port 30000.
+"""
+from random import randint
+import os
+
+try:
+    from flask import Flask, Response
+except ImportError as e:
+    raise ImportError("flask is required. Run: pip install flask") from e
+
+app = Flask(__name__)
+
+# Value of the model_name label on every metric; override with the MODEL_NAME env var.
+MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
+SGLANG_METRICS_PORT = 30000
+
+
+def generate_sglang_metrics():
+    """Generate SGLang-compatible Prometheus metrics.
+
+    Every call draws fresh random values, so the histogram ``_sum`` and
+    ``_count`` series are not monotonic from one call to the next.
+
+    Returns:
+        str: The metrics rendered as Prometheus text-format lines.
+    """
+    model_name = MODEL_NAME
+    token_usage = min(1.0, randint(10, 90) / 100.0)
+    num_queue_reqs = randint(0, 5)
+    ttft_sum = randint(100, 500) / 1000.0
+    ttft_count = randint(10, 100)
+    tpot_sum = randint(50, 200) / 1000.0
+    tpot_count = ttft_count
+
+    return f"""# HELP sglang:token_usage KV cache utilization ratio (0.0-1.0)
+# TYPE sglang:token_usage gauge
+sglang:token_usage{{model_name="{model_name}"}} {token_usage}
+# HELP sglang:num_queue_reqs Number of requests waiting in queue
+# TYPE sglang:num_queue_reqs gauge
+sglang:num_queue_reqs{{model_name="{model_name}"}} {num_queue_reqs}
+# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds
+# TYPE sglang:time_to_first_token_seconds histogram
+sglang:time_to_first_token_seconds_bucket{{le="0.001",model_name="{model_name}"}} 0
+sglang:time_to_first_token_seconds_bucket{{le="0.005",model_name="{model_name}"}} 0
+sglang:time_to_first_token_seconds_bucket{{le="0.08",model_name="{model_name}"}} {int(ttft_count * 0.3)}
+sglang:time_to_first_token_seconds_bucket{{le="+Inf",model_name="{model_name}"}} {ttft_count}
+sglang:time_to_first_token_seconds_sum{{model_name="{model_name}"}} {ttft_sum}
+sglang:time_to_first_token_seconds_count{{model_name="{model_name}"}} {ttft_count}
+# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds
+# TYPE sglang:time_per_output_token_seconds histogram
+sglang:time_per_output_token_seconds_bucket{{le="0.001",model_name="{model_name}"}} 0
+sglang:time_per_output_token_seconds_bucket{{le="0.005",model_name="{model_name}"}} {int(tpot_count * 0.5)}
+sglang:time_per_output_token_seconds_bucket{{le="0.08",model_name="{model_name}"}} {tpot_count}
+sglang:time_per_output_token_seconds_bucket{{le="+Inf",model_name="{model_name}"}} {tpot_count}
+sglang:time_per_output_token_seconds_sum{{model_name="{model_name}"}} {tpot_sum}
+sglang:time_per_output_token_seconds_count{{model_name="{model_name}"}} {tpot_count}
+"""
+
+
+@app.route("/metrics")
+def metrics():
+    """Serve a freshly generated metrics page."""
+    # Pass the whole header via content_type: Werkzeug appends its own charset to a
+    # text/* mimetype, so a mimetype that already carries one ends up with two.
+    return Response(generate_sglang_metrics(), content_type="text/plain; version=0.0.4; charset=utf-8")
+
+
+@app.route("/health")
+def health():
+    """Health endpoint; always answers 200 with the body ``ok``."""
+    return "ok", 200
+
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=SGLANG_METRICS_PORT)
diff --git a/hack/mock-images/vllm-mock/Dockerfile b/hack/mock-images/vllm-mock/Dockerfile
new file mode 100644
index 000000000..54bf56855
--- /dev/null
+++ b/hack/mock-images/vllm-mock/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+CMD ["python", "app.py"]
diff --git a/hack/mock-images/vllm-mock/app.py b/hack/mock-images/vllm-mock/app.py
new file mode 100644
index 000000000..831d45078
--- /dev/null
+++ b/hack/mock-images/vllm-mock/app.py
@@ -0,0 +1,734 @@
+# Copyright The Volcano Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from flask import Flask, request, Response, jsonify
+from flask_httpauth import HTTPTokenAuth
+from werkzeug import serving
+import random
+import re
+import logging
+import sys
+import time
+from datetime import datetime
+from random import randint
+import os
+import json
+
+try:
+    from kubernetes import client, config
+except Exception as e:
+    print(f"Failed to import kubernetes, skip: {e}")
+    client = None
+    config = None
+
+from simulator import Simulator, Request
+
+# Global storage for overridden values
+overrides = {}
+
+MODEL_NAME = os.getenv('MODEL_NAME', 'deepseek-r1-1-5b')
+DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'deepseek-r1')
+NAMESPACE = os.getenv('POD_NAMESPACE', 'default')
+DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1'))
+SIMULATION = os.getenv('SIMULATION', 'disabled')
+
+modelMaps = {
+    "deepseek-r1-1-5b": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "deepseek-r1-7b": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+}
+
+# Polyfill the necessary arguments.
+if "--replica_config_device" not in sys.argv:
+    sys.argv.append("--replica_config_device")
+    sys.argv.append(SIMULATION)
+if "--replica_config_model_name" not in sys.argv:
+    sys.argv.append("--replica_config_model_name")
+    sys.argv.append(modelMaps.get(MODEL_NAME, MODEL_NAME))
+
+tokenizer = None
+simulator: "Simulator | None" = None  # quoted: `Optional` is never imported, a bare annotation raises NameError at import
+
+# Extract the api_key argument and prepare for authentication
+api_key = None
+try:
+    index = sys.argv.index("--api_key")
+    if index + 1 < len(sys.argv):
+        api_key = sys.argv[index + 1]
+except ValueError:
+    pass
+
+auth = HTTPTokenAuth(scheme='Bearer')
+
+
+@auth.verify_token
+def verify_token(token):
+    if api_key is None:  # no --api_key on the command line: every request is accepted
+        return True
+    return token == api_key
+
+
+@auth.error_handler
+def auth_error(status):
+    return jsonify({"error": "Unauthorized"}), 401
+
+
+logger = logging.getLogger(__name__)
+
+
+def read_configs(file_path):
+    """
+    Read a JSON object from file_path; on any error (missing file, bad JSON, non-dict) print it and return {}.
+    """
+    try:
+        with open(file_path, "r") as f:
+            data = json.load(f)
+            if not isinstance(data, dict):
+                raise Exception("invalid config format, dict expected.")
+            return data
+    except Exception as e:
+        print(f"Error reading JSON file: {e}")
+        return {}
+
+
+configs = read_configs("config.json")
+HUGGINGFACE_TOKEN = configs.get("huggingface_token", "your huggingface token")
+
+
+def get_token_count(text):
+    # Simple heuristic: one token per 4 characters, at least 1 for non-empty text
+    if not text:
+        return 0
+    return max(1, len(text) // 4)
+
+
+# Get the full model path from modelMaps based on MODEL_NAME env var
+BASE_MODEL_PATH = modelMaps.get(MODEL_NAME, MODEL_NAME)
+
+models = [
+    {
+        "id": BASE_MODEL_PATH,
+        "object": "model",
+        "created": 1715644056,
+        "owned_by": "vllm",
+        "root": BASE_MODEL_PATH,
+        "parent": None,
+        "permission": [
+            {
+                "id": "modelperm-cb1adf4457b2417e8c7770aadcffe4cc",
+                "object": "model_permission",
+                "created": 1715644056,
+                "allow_create_engine": False,
+                "allow_sampling": True,
+                "allow_logprobs": True,
+                "allow_search_indices": False,
+                "allow_view": True,
+                "allow_fine_tuning": False,
+                "organization": "*",
+                "group": None,
+                "is_blocking": False
+            }
+        ]
+    }
+]
+
+
+# Note: this is to suppress /metrics logs, gateway sends request to pods to scrape
+# the metrics and results in lots of meaningless requests that we do not want to log.
+def disable_endpoint_logs():
+    """Disable logs for requests to specific endpoints."""
+    disabled_endpoints = ('/', '/healthz', '/metrics', '/v1/models')
+    parent_log_request = serving.WSGIRequestHandler.log_request
+
+    def log_request(self, *args, **kwargs):
+        if not any(re.match(f"{de}$", self.path) for de in disabled_endpoints):
+            parent_log_request(self, *args, **kwargs)
+
+    serving.WSGIRequestHandler.log_request = log_request
+
+
+app = Flask(__name__)
+disable_endpoint_logs()
+
+
+@app.route('/v1/models', methods=['GET'])
+@auth.login_required
+def get_models():
+    return jsonify({
+        "object": "list",
+        "data": models
+    })
+
+
+@app.route('/v1/load_lora_adapter', methods=['POST'])
+@auth.login_required
+def load_model():
+    lora_name = request.json.get('lora_name')
+    # Check if the model already exists
+    if any(model['id'] == lora_name for model in models):
+        return jsonify({"status": "success", "message": "Model already loaded"}), 200
+
+    new_model = {
+        'id': lora_name,
+        'created': int(time.time()),
+        'object': "model",
+        'owned_by': "vllm",
+        'parent': None,
+        'root': request.json.get('lora_path')
+    }
+
+    models.append(new_model)
+    return jsonify({"status": "success", "message": "Model loaded successfully"}), 200
+
+
+@app.route('/v1/unload_lora_adapter', methods=['POST'])
+@auth.login_required
+def unload_model():
+    model_id = request.json.get('lora_name')
+    global models
+    models = [model for model in models if model['id'] != model_id]
+    return jsonify({"status": "success", "message": "Model unloaded successfully"}), 200
+
+
+@app.route('/v1/completions', methods=['POST'])
+@auth.login_required
+def completion():
+    try:
+        prompt = request.json.get('prompt')
+        model = request.json.get('model')
+        max_tokens = request.json.get('max_tokens')
+        if not prompt or not model:
+            return jsonify({"status": "error", "message": "Prompt and model are required"}), 400
+
+        # Check if model exists in the models list (includes base model and loaded LoRA adapters)
+        if not any(m['id'] == model for m in models):
+            return jsonify({
+                "error": {
+                    "message": f"The model `{model}` does not exist",
+                    "type": "invalid_request_error",
+                    "param": "model",
+                    "code": "model_not_found"
+                }
+            }), 404
+
+        arrived_at = datetime.now().timestamp()
+        input_tokens = get_token_count(prompt)
+        output_tokens = max_tokens if max_tokens else randint(10, 500)
+        arrived_next = request.json.get('next_in')
+        if not arrived_next:
+            arrived_next = 0.0
+        else:
+            arrived_next += arrived_at
+
+        start = datetime.now().timestamp()
+        latency = 0.0
+        if simulator is not None:
+            latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next))
+
+        # Simulated response
+        response = {
+            "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
+            "object": "text_completion",
+            "created": int(arrived_at),
+            "model": model,
+            "system_fingerprint": "fp_44709d6fcb",
+            "choices": [
+                {
+                    "text": f"This is simulated message from {model}!",
+                    "index": 0,
+                    "logprobs": None,
+                    "finish_reason": "length"
+                }
+            ],
+            "usage": {
+                "prompt_tokens": input_tokens,
+                "completion_tokens": output_tokens,
+                "total_tokens": input_tokens + output_tokens,
+                "time": latency
+            }
+        }
+        overhead = datetime.now().timestamp() - start
+        if latency > overhead:
+            time.sleep(latency - overhead)
+        elif latency > 0.0:
+            logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}")
+
+        return jsonify(response), 200
+    except Exception as e:
+        err = {
+            "error": {
+                "message": f"The server had an error while processing your request: {e}",
+                "type": "server_error"
+            }
+        }
+        return jsonify(err), 500
+
+
+@app.route('/v1/chat/completions', methods=['POST'])
+@auth.login_required
+def chat_completions():
+    try:
+        messages = request.json.get('messages')
+        model = request.json.get('model')
+        max_tokens = request.json.get('max_tokens')
+        if not messages or not model:
+            return jsonify({"status": "error", "message": "Messages and model are required"}), 400
+
+        # Check if model exists in the models list (includes base model and loaded LoRA adapters)
+        if not any(m['id'] == model for m in models):
+            return jsonify({
+                "error": {
+                    "message": f"The model `{model}` does not exist",
+                    "type": "invalid_request_error",
+                    "param": "model",
+                    "code": "model_not_found"
+                }
+            }), 404
+
+        arrived_at = datetime.now().timestamp()
+        input_tokens = sum(get_token_count(message["content"]) for message in messages)
+        output_tokens = max_tokens if max_tokens else randint(10, 500)
+        arrived_next = request.json.get('next_in')
+        if not arrived_next:
+            arrived_next = 0.0
+        else:
+            arrived_next += arrived_at
+
+        start = datetime.now().timestamp()
+        latency = 0.0
+        if simulator is not None:
+            latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next))
+
+        # Simulated response
+        response = {
+            "id": "chatcmpl-abc123",
+            "object": "chat.completion",
+            "created": int(arrived_at),
+            "model": model,
+            "usage": {
+                "prompt_tokens": input_tokens,
+                "completion_tokens": output_tokens,
+                "total_tokens": input_tokens + output_tokens,
+                "time": latency
+            },
+            "choices": [
+                {
+                    "message": {
+                        "role": "assistant",
+                        "content": f"\n\nThis is simulated message from {model}!"
+                    },
+                    "logprobs": None,
+                    "finish_reason": "stop",
+                    "index": 0
+                }
+            ]
+        }
+        overhead = datetime.now().timestamp() - start
+        if latency > overhead:
+            time.sleep(latency - overhead)
+        elif latency > 0.0:  # stay quiet when no simulator ran (latency == 0.0), as /v1/completions does
+            logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}")
+
+        return jsonify(response), 200
+    except Exception as e:
+        err = {
+            "error": {
+                "message": f"The server had an error while processing your request: {e}",
+                "type": "server_error"
+            }
+        }
+        return jsonify(err), 500
+
+
+@app.route('/set_metrics', methods=['POST'])
+def set_metrics():
+    global overrides
+    # Get JSON data from the request
+    data = request.json
+    if data:
+        # Update overrides with new key-value pairs
+        overrides.update(data)
+        return {"status": "success", "message": "Overrides updated"}, 200
+    else:
+        return {"status": "error", "message": "No data provided"}, 400
+
+
+# Initialize global state to keep track of metrics data
+metrics_state = {}
+
+
+def generate_histogram_metric(metric_name, description, model_name, buckets, new_requests, help_header=True):
+    """
+    Generate Prometheus histogram metrics with dynamically updated bucket values.
+
+    Args:
+        metric_name (str): Name of the metric.
+        description (str): Metric description.
+        model_name (str): Model name.
+        buckets (list): List of bucket boundaries.
+        new_requests (dict): Dictionary with new requests to update bucket values.
+        help_header: the flag to include HELP Header
+
+    Returns:
+        str: Prometheus-formatted histogram metric.
+    """
+    global metrics_state
+
+    # Initialize state if not already present
+    if metric_name not in metrics_state:
+        metrics_state[metric_name] = {
+            "buckets": {bucket: 0 for bucket in buckets},  # Bucket values
+            "total_sum": 0,  # Total sum of all values
+            "total_count": 0  # Total count of all events
+        }
+
+    # Retrieve current metric state
+    current_state = metrics_state[metric_name]
+
+    # Update buckets and ensure cumulative nature
+    for position, bucket in enumerate(buckets):
+        if bucket in new_requests:
+            # Add new requests for this bucket
+            current_state["buckets"][bucket] += new_requests[bucket]
+
+        # Ensure cumulative updates for histogram buckets
+        if position > 0:  # Skip the first bucket
+            current_state["buckets"][bucket] = max(
+                current_state["buckets"][bucket],
+                current_state["buckets"][buckets[position - 1]]
+            )
+
+    # Update total_count and total_sum
+    current_state["total_count"] = current_state["buckets"][buckets[-1]]  # `+Inf` bucket is the total count
+    current_state["total_sum"] += sum(
+        float(bucket) * value for bucket, value in new_requests.items() if bucket != "+Inf"
+    )
+
+    # Generate Prometheus bucket strings
+    bucket_strings = "\n".join(
+        [f'vllm:{metric_name}_bucket{{le="{bucket}",model_name="{model_name}"}} {current_state["buckets"][bucket]}'
+         for bucket in buckets]
+    )
+
+    # Return formatted histogram metric
+    histogram_template = """
+# HELP vllm:{metric_name} {description}
+# TYPE vllm:{metric_name} histogram
+vllm:{metric_name}_sum{{model_name="{model_name}"}} {value}
+{buckets}
+vllm:{metric_name}_count{{model_name="{model_name}"}} {count}
+""" if help_header else """
+vllm:{metric_name}_sum{{model_name="{model_name}"}} {value}
+{buckets}
+vllm:{metric_name}_count{{model_name="{model_name}"}} {count}
+"""
+
+    return histogram_template.format(
+        metric_name=metric_name,
+        description=description,
+        model_name=model_name,
+        value=current_state["total_sum"],
+        buckets=bucket_strings,
+        count=current_state["total_count"]
+    )
+
+
+def 
generate_counter_gauge_metric(metric_name, metric_type, description, model_name, value, help_header=True): + """ + Generates a Prometheus metric string for counter or gauge. + + Args: + metric_name (str): The name of the metric. + metric_type (str): The type of the metric ('counter' or 'gauge'). + description (str): The HELP description of the metric. + model_name (str): The name of the model. + value (float): The value of the metric. + help_header: the flag to include HELP Header + + Returns: + str: A formatted Prometheus metric string. + """ + counter_gauge_template = """ +# HELP vllm:{metric_name} {description} +# TYPE vllm:{metric_name} {metric_type} +vllm:{metric_name}{{model_name="{model_name}"}} {value} +""" if help_header else """ +vllm:{metric_name}{{model_name="{model_name}"}} {value} +""" + + return counter_gauge_template.format( + metric_name=metric_name, + metric_type=metric_type, + description=description, + model_name=model_name, + value=value + ) + + +@app.route('/metrics') +def metrics(): + # get deployment information + try: + apps_v1 = client.AppsV1Api() + resp = apps_v1.read_namespaced_deployment(DEPLOYMENT_NAME, NAMESPACE) + replicas = resp.spec.replicas if resp.spec.replicas is not None else 1 + except Exception as e: + #print(f"Failed to get deployment information: {DEPLOYMENT_NAME=} {NAMESPACE=} error={str(e)}") + #print(f"Due to the failure, replicas {DEFAULT_REPLICAS} will be used to calculate metrics") + replicas = DEFAULT_REPLICAS + + # a reasonable mock total value + total = overrides.get("total", 100.0) + model_name = overrides.get("model_name", MODEL_NAME) + # calculate metrics with potential overrides + success_total = overrides.get("success_total", total / replicas) + avg_prompt_throughput = overrides.get("avg_prompt_throughput", total / replicas if replicas > 0 else 0) + avg_generation_throughput = overrides.get("avg_generation_throughput", total / replicas if replicas > 0 else 0) + prompt_tokens_total = 
overrides.get("prompt_tokens_total", randint(100, 1024) * success_total) + generation_tokens_total = overrides.get("generation_tokens_total", randint(100, 1024) * success_total) + running = overrides.get("running", randint(1, 100)) + cpu_running = overrides.get("cpu_running", randint(1, 100)) + waiting = overrides.get("waiting", randint(1, 5)) + swapped = overrides.get("swapped", randint(1, 100)) + max_running_capacity = 100 + gpu_cache_usage_perc = overrides.get("gpu_cache_usage_perc", min(1.0, (running / max_running_capacity))) + cpu_cache_usage_perc = overrides.get("cpu_cache_usage_perc", min(1.0, (cpu_running / max_running_capacity))) + + # Define metrics and their attributes + simple_metrics = [ + { + "name": "prompt_tokens_total", + "type": "counter", + "description": "Count of prefill tokens processed.", + "value": overrides.get("prompt_tokens_total", prompt_tokens_total) + }, + { + "name": "generation_tokens_total", + "type": "counter", + "description": "Count of generation tokens processed.", + "value": overrides.get("generation_tokens_total", generation_tokens_total) + }, + { + "name": "request_success_total", + "type": "counter", + "description": "Count of successfully processed requests.", + "value": overrides.get("success_total", success_total) + }, + { + "name": "num_requests_running", + "type": "gauge", + "description": "Number of requests currently running on GPU.", + "value": overrides.get("running", running) + }, + { + "name": "num_requests_swapped", + "type": "gauge", + "description": "Number of requests swapped to CPU.", + "value": overrides.get("swapped", swapped) + }, + { + "name": "num_requests_waiting", + "type": "gauge", + "description": "Number of requests waiting to be processed.", + "value": overrides.get("waiting", waiting) + }, + { + "name": "avg_prompt_throughput_toks_per_s", + "type": "gauge", + "description": "Average prefill throughput in tokens/s.", + "value": overrides.get("avg_prompt_throughput", avg_prompt_throughput) + }, + { 
+ "name": "avg_generation_throughput_toks_per_s", + "type": "gauge", + "description": "Average generation throughput in tokens/s.", + "value": overrides.get("avg_generation_throughput", avg_generation_throughput) + }, + { + "name": "gpu_cache_usage_perc", + "type": "gauge", + "description": "GPU KV-cache usage. 1 means 100 percent usage.", + "value": overrides.get( + "gpu_cache_usage_perc", gpu_cache_usage_perc + ) + }, + { + "name": "cpu_cache_usage_perc", + "type": "gauge", + "description": "CPU KV-cache usage. 1 means 100 percent usage.", + "value": overrides.get( + "cpu_cache_usage_perc", cpu_cache_usage_perc + ) + }, + ] + + # Generate all metrics + metrics_output = "" + for metric in simple_metrics: + metrics_output += generate_counter_gauge_metric(metric["name"], metric["type"], metric["description"], + model_name, metric["value"]) + metrics_output += generate_counter_gauge_metric(metric["name"], metric["type"], metric["description"], + "lora-A", metric["value"], help_header=False) + + + metrics_output += """ +# HELP vllm:lora_requests_info Running stats on lora requests. 
+# TYPE vllm:lora_requests_info gauge +vllm:lora_requests_info{max_lora="1",running_lora_adapters="lora-A",waiting_lora_adapters=""} 1 +""" + + histogram_metrics = [ + { + "name": "iteration_tokens_total", + "type": "histogram", + "description": "Histogram of number of tokens per engine_step.", + "buckets": ["1.0", "8.0", "16.0", "32.0", "64.0", "128.0", "256.0", + "512.0", "1024.0", "2048.0", "4096.0", "8192.0", "+Inf"] + }, + { + "name": "time_to_first_token_seconds", + "type": "histogram", + "description": "Histogram of time to first token in seconds.", + "buckets": ["0.001", "0.005", "0.01", "0.02", "0.04", "0.06", + "0.08", "0.1", "0.25", "0.5", "+Inf"] + }, + { + "name": "time_per_output_token_seconds", + "type": "histogram", + "description": "Histogram of time per output token in seconds.", + "buckets": ["0.01", "0.025", "0.05", "0.075", "0.1", "0.15", + "0.2", "0.3", "0.4", "+Inf"] + }, + { + "name": "request_prompt_tokens", + "type": "histogram", + "description": "Histogram of number of prefill tokens processed.", + "buckets": ["1.0", "2.0", "5.0", "10.0", "20.0", "50.0", + "100.0", "200.0", "500.0", "1000.0", "2000.0", + "5000.0", "10000.0", "+Inf"] + }, + { + "name": "request_generation_tokens", + "type": "histogram", + "description": "Histogram of number of generation tokens processed.", + "buckets": ["1.0", "2.0", "5.0", "10.0", "20.0", "50.0", + "100.0", "200.0", "500.0", "1000.0", "2000.0", + "5000.0", "10000.0", "+Inf"] + }, + { + "name": "e2e_request_latency_seconds", + "type": "histogram", + "description": "Histogram of end to end request latency in seconds.", + "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"] + }, + { + "name": "request_queue_time_seconds", + "type": "histogram", + "description": "Histogram of time spent in WAITING phase for request.", + "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"] + }, + { + "name": "request_inference_time_seconds", + "type": "histogram", + "description": "Histogram of 
time spent in RUNNING phase for request.", + "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"] + }, + { + "name": "request_decode_time_seconds", + "type": "histogram", + "description": "Histogram of time spent in DECODE phase for request.", + "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"] + }, + { + "name": "request_prefill_time_seconds", + "type": "histogram", + "description": "Histogram of time spent in PREFILL phase for request.", + "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"] + }, + ] + + # Generate metrics output + histogram_metrics_output = "" + for metric in histogram_metrics: + # Simulate random new requests for the metric + new_requests = {bucket: random.randint(0, 5) for bucket in metric["buckets"]} + histogram_metrics_output += generate_histogram_metric( + metric_name=metric["name"], + description=metric["description"], + model_name=model_name, + buckets=metric["buckets"], + new_requests=new_requests + ) + new_requests = {bucket: random.randint(0, 5) for bucket in metric["buckets"]} + histogram_metrics_output += generate_histogram_metric( + metric_name=metric["name"], + description=metric["description"], + model_name="lora-A", + buckets=metric["buckets"], + new_requests=new_requests, + help_header=False + ) + + return Response(metrics_output + histogram_metrics_output, mimetype='text/plain') + + +if __name__ == '__main__': + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("kubernetes.client.rest").setLevel(logging.ERROR) # Suppress kubernetes logs + + print(f"Starting app. 
DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}, MODEL: {MODEL_NAME}") + + # Extract gpu_device without call argparse + gpu_device = "disabled" + try: + index = sys.argv.index("--replica_config_device") + if index + 1 < len(sys.argv): + gpu_device = sys.argv[index + 1] + except ValueError: + pass + + # Restore -h functionality + if '-h' in sys.argv: + print("Mock server help: [no options available]") + + # Launch simulator + if gpu_device != "disabled": + # Simplified simulator initialization + simulator = Simulator() + overrides = { + "total": 100.0, + "running": 0, + "waiting": 0, + "swapped": 0 + } + + thread = None + if simulator is not None: + # TODO: Move simulation to a separate workflow, independent of the main web service + thread = simulator.start() + + # Perform profiling and skip actual run + if '--time_limit' not in sys.argv: + try: + # config.load_kube_config() + config.load_incluster_config() + except Exception as e: + print(f"Failed to load k8s config: {e}") + + # app.run(host='0.0.0.0', port=8080) + app.run(host='0.0.0.0', port=8000) + + if simulator is not None: + simulator.stop() + + if thread is not None: + thread.join() diff --git a/hack/mock-images/vllm-mock/config.json b/hack/mock-images/vllm-mock/config.json new file mode 100644 index 000000000..9df272fe9 --- /dev/null +++ b/hack/mock-images/vllm-mock/config.json @@ -0,0 +1,3 @@ +{ + "huggingface_token": "your huggingface token" +} \ No newline at end of file diff --git a/hack/mock-images/vllm-mock/requirements.txt b/hack/mock-images/vllm-mock/requirements.txt new file mode 100644 index 000000000..fcee99ce9 --- /dev/null +++ b/hack/mock-images/vllm-mock/requirements.txt @@ -0,0 +1,3 @@ +flask +Flask-HTTPAuth +kubernetes diff --git a/hack/mock-images/vllm-mock/simulator.py b/hack/mock-images/vllm-mock/simulator.py new file mode 100644 index 000000000..89e301317 --- /dev/null +++ b/hack/mock-images/vllm-mock/simulator.py @@ -0,0 +1,49 @@ +# Copyright The Volcano Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import threading +import time +import random +from typing import Optional + +class Request: + def __init__(self, arrived_at, input_tokens, output_tokens, arrived_next=0): + self.arrived_at = arrived_at + self.input_tokens = input_tokens + self.output_tokens = output_tokens + self.arrived_next = arrived_next + +class Simulator: + def __init__(self, config=None): + self._terminate = False + + def start(self): + # Dummy thread for compatibility + def dummy_run(): + while not self._terminate: + time.sleep(1) + + t = threading.Thread(target=dummy_run) + t.start() + return t + + def stop(self): + self._terminate = True + + def execute(self, request: Request) -> float: + # Simple latency mock: base delay + per-token delay + base_latency = 0.05 + per_token_latency = 0.002 + latency = base_latency + (request.input_tokens + request.output_tokens) * per_token_latency + latency *= random.uniform(0.9, 1.1) + return latency diff --git a/hack/mock-images/vllm-mock/test_app.py b/hack/mock-images/vllm-mock/test_app.py new file mode 100644 index 000000000..f7f72b7e6 --- /dev/null +++ b/hack/mock-images/vllm-mock/test_app.py @@ -0,0 +1,46 @@ +# Copyright The Volcano Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from app import app + + +class FlaskTestCase(unittest.TestCase): + + def setUp(self): + self.client = app.test_client() + + def test_metrics(self): + expected_total = 100 + replica = 3 + response = self.client.get('/metrics') + self.assertEqual(response.status_code, 200) + data = response.data.decode() + print(f"response data: \n<<<<<<<<<<<<<<<<\n{data}\n<<<<<<<<<<<<<<<<") + # metrics exists + self.assertIn('vllm:request_success_total', data) + self.assertIn('vllm:avg_prompt_throughput_toks_per_s', data) + self.assertIn('vllm:avg_generation_throughput_toks_per_s', data) + + # assert metric value + self.assertIn( + f'vllm:request_success_total{{finished_reason="stop",model_name="llama2-70b"}} {expected_total / replica}', + data) + self.assertIn(f'vllm:avg_prompt_throughput_toks_per_s{{model_name="llama2-70b"}} {expected_total / replica}', + data) + self.assertIn( + f'vllm:avg_generation_throughput_toks_per_s{{model_name="llama2-70b"}} {expected_total / replica}', data) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml b/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml index ff30fb59a..5f8fd5cee 100644 --- a/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml +++ b/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml @@ -21,7 +21,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -49,7 +49,7 @@ spec: spec: 
containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml b/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml index 03b9e9fa5..ae7c77cda 100644 --- a/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml +++ b/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/test/e2e/router/testdata/LLM-Mock-ds7b.yaml b/test/e2e/router/testdata/LLM-Mock-ds7b.yaml index faa8c76fd..9951505d7 100644 --- a/test/e2e/router/testdata/LLM-Mock-ds7b.yaml +++ b/test/e2e/router/testdata/LLM-Mock-ds7b.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: llm-engine - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock diff --git a/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml b/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml index 71026345d..d385ab500 100644 --- a/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml +++ b/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml @@ -19,7 +19,7 @@ spec: spec: containers: - name: leader - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock @@ -39,7 +39,7 @@ spec: spec: containers: - name: leader - image: ghcr.io/yaozengzeng/vllm-mock:latest + image: ghcr.io/volcano-sh/vllm-mock:latest imagePullPolicy: IfNotPresent env: # specify the model name to mock