diff --git a/.github/workflows/build-mock-images.yml b/.github/workflows/build-mock-images.yml
new file mode 100644
index 000000000..99f6c49d1
--- /dev/null
+++ b/.github/workflows/build-mock-images.yml
@@ -0,0 +1,45 @@
+name: Build and Push Mock Images
+
+# Rebuild the mock inference-engine images whenever their sources change on
+# main, or on demand from the Actions tab.
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - hack/mock-images/**
+  workflow_dispatch:
+
+jobs:
+  build-mock-image:
+    # The image tags below are hard-coded to the volcano-sh namespace, so forks
+    # (whose GITHUB_TOKEN cannot push there) skip the job instead of failing.
+    if: github.repository == 'volcano-sh/kthena'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+    strategy:
+      # The images are independent: a failing build must not cancel the other.
+      fail-fast: false
+      matrix:
+        image:
+          - vllm-mock
+          - sglang-mock
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push ${{ matrix.image }}
+        uses: docker/build-push-action@v5
+        with:
+          context: hack/mock-images/${{ matrix.image }}
+          push: true
+          tags: ghcr.io/volcano-sh/${{ matrix.image }}:latest
diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
index ff30fb59a..5f8fd5cee 100644
--- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
+++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
@@ -49,7 +49,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
index 03b9e9fa5..ae7c77cda 100644
--- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
+++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
index faa8c76fd..9951505d7 100644
--- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
+++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml
index 3a4d8407b..8741b17af 100644
--- a/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml
+++ b/docs/kthena/docs/assets/examples/kthena-router/LLM-Mock.yaml
@@ -2,7 +2,7 @@
 # The mock server will return a fixed response for any input.
 # You can use this mock server to test the inference router without deploying a real LLM server.
 #
-# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`.
+# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`.
 # Move the image to kthena registry once it's public.
 
 apiVersion: apps/v1
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
index ff30fb59a..5f8fd5cee 100644
--- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
+++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
@@ -49,7 +49,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
index 03b9e9fa5..ae7c77cda 100644
--- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
+++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
index faa8c76fd..9951505d7 100644
--- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
+++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml
index 3a4d8407b..8741b17af 100644
--- a/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml
+++ b/docs/kthena/versioned_docs/version-v0.1.0/assets/examples/kthena-router/LLM-Mock.yaml
@@ -2,7 +2,7 @@
 # The mock server will return a fixed response for any input.
 # You can use this mock server to test the inference router without deploying a real LLM server.
 #
-# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`.
+# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`.
 # Move the image to kthena registry once it's public.
 
 apiVersion: apps/v1
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
index ff30fb59a..5f8fd5cee 100644
--- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
+++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
@@ -49,7 +49,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
index 03b9e9fa5..ae7c77cda 100644
--- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
+++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
index faa8c76fd..9951505d7 100644
--- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
+++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml
index 3a4d8407b..8741b17af 100644
--- a/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml
+++ b/docs/kthena/versioned_docs/version-v0.2.0/assets/examples/kthena-router/LLM-Mock.yaml
@@ -2,7 +2,7 @@
 # The mock server will return a fixed response for any input.
 # You can use this mock server to test the inference router without deploying a real LLM server.
 #
-# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`.
+# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`.
 # Move the image to kthena registry once it's public.
 
 apiVersion: apps/v1
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
index ff30fb59a..5f8fd5cee 100644
--- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
+++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
@@ -49,7 +49,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
index 03b9e9fa5..ae7c77cda 100644
--- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
+++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds1.5b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
index faa8c76fd..9951505d7 100644
--- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
+++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock-ds7b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml
index 3a4d8407b..8741b17af 100644
--- a/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml
+++ b/docs/kthena/versioned_docs/version-v0.3.0/assets/examples/kthena-router/LLM-Mock.yaml
@@ -2,7 +2,7 @@
 # The mock server will return a fixed response for any input.
 # You can use this mock server to test the inference router without deploying a real LLM server.
 #
-# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`.
+# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`.
 # Move the image to kthena registry once it's public.
 
 apiVersion: apps/v1
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml b/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
index ff30fb59a..5f8fd5cee 100644
--- a/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
+++ b/examples/kthena-router/LLM-Mock-ds1.5b-Canary.yaml
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
@@ -49,7 +49,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/examples/kthena-router/LLM-Mock-ds1.5b.yaml b/examples/kthena-router/LLM-Mock-ds1.5b.yaml
index 03b9e9fa5..ae7c77cda 100644
--- a/examples/kthena-router/LLM-Mock-ds1.5b.yaml
+++ b/examples/kthena-router/LLM-Mock-ds1.5b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/examples/kthena-router/LLM-Mock-ds7b.yaml b/examples/kthena-router/LLM-Mock-ds7b.yaml
index faa8c76fd..9951505d7 100644
--- a/examples/kthena-router/LLM-Mock-ds7b.yaml
+++ b/examples/kthena-router/LLM-Mock-ds7b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/examples/kthena-router/LLM-Mock.yaml b/examples/kthena-router/LLM-Mock.yaml
index 3a4d8407b..8741b17af 100644
--- a/examples/kthena-router/LLM-Mock.yaml
+++ b/examples/kthena-router/LLM-Mock.yaml
@@ -2,7 +2,7 @@
 # The mock server will return a fixed response for any input.
 # You can use this mock server to test the inference router without deploying a real LLM server.
 #
-# NOTE: `ghcr.io/yaozengzeng/vllm-mock:latest` is built based on `https://github.com/YaoZengzeng/aibrix/tree/vllm-mock`.
+# NOTE: `ghcr.io/volcano-sh/vllm-mock:latest` is built based on `https://github.com/volcano-sh/kthena/tree/main/hack/mock-images/vllm-mock`.
 # Move the image to kthena registry once it's public.
 
 apiVersion: apps/v1
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml b/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml
index 71026345d..d385ab500 100644
--- a/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml
+++ b/examples/kthena-router/ModelServing-ds1.5b-pd-disaggregation.yaml
@@ -19,7 +19,7 @@ spec:
           spec:
             containers:
               - name: leader
-                image: ghcr.io/yaozengzeng/vllm-mock:latest
+                image: ghcr.io/volcano-sh/vllm-mock:latest
                 imagePullPolicy: IfNotPresent
                 env:
                   # specify the model name to mock
@@ -39,7 +39,7 @@ spec:
           spec:
             containers:
               - name: leader
-                image: ghcr.io/yaozengzeng/vllm-mock:latest
+                image: ghcr.io/volcano-sh/vllm-mock:latest
                 imagePullPolicy: IfNotPresent
                 env:
                   # specify the model name to mock
diff --git a/hack/mock-images/sglang-mock/Dockerfile b/hack/mock-images/sglang-mock/Dockerfile
new file mode 100644
index 000000000..7298e1f01
--- /dev/null
+++ b/hack/mock-images/sglang-mock/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install dependencies before copying the sources so this layer stays cached
+# when only the application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+# sglang_app.py serves /metrics and /health on this port.
+EXPOSE 30000
+
+CMD ["python", "sglang_app.py"]
diff --git a/hack/mock-images/sglang-mock/requirements.txt b/hack/mock-images/sglang-mock/requirements.txt
new file mode 100644
index 000000000..7e1060246
--- /dev/null
+++ b/hack/mock-images/sglang-mock/requirements.txt
@@ -0,0 +1 @@
+flask
diff --git a/hack/mock-images/sglang-mock/sglang_app.py b/hack/mock-images/sglang-mock/sglang_app.py
new file mode 100644
index 000000000..fe72f4da4
--- /dev/null
+++ b/hack/mock-images/sglang-mock/sglang_app.py
@@ -0,0 +1,103 @@
+# Copyright The Volcano Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SGLang metrics mock server for kthena runtime verification.
+Exposes Prometheus metrics at /metrics and a liveness check at /health on port 30000.
+"""
+from random import randint
+import os
+
+try:
+    from flask import Flask, Response
+except ImportError as e:
+    raise ImportError("flask is required. Run: pip install flask") from e
+
+app = Flask(__name__)
+
+MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.1-8B-Instruct")
+SGLANG_METRICS_PORT = 30000
+# Full Content-Type of the Prometheus text exposition format. It has to be
+# passed to Response as `content_type`: given as `mimetype`, werkzeug appends
+# its own "; charset=utf-8" to text/* values and the parameter ends up twice.
+PROMETHEUS_CONTENT_TYPE = "text/plain; version=0.0.4; charset=utf-8"
+
+
+def _escape_label_value(value):
+    """Return `value` escaped for use inside a quoted Prometheus label value.
+
+    Backslash, double quote and line feed are the characters the text
+    exposition format requires to be backslash-escaped.
+    """
+    return value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
+
+
+def generate_sglang_metrics():
+    """Render one scrape of randomized SGLang-style metrics as Prometheus text.
+
+    Every call draws fresh values. The bucket counts are derived from the
+    drawn sample counts, so each histogram is cumulative and its +Inf bucket
+    equals its _count.
+
+    Returns:
+        str: the metrics document, labelled with MODEL_NAME.
+    """
+    model_name = _escape_label_value(MODEL_NAME)
+    # randint(10, 90) / 100 already lies within 0.10-0.90, so no clamping is needed.
+    token_usage = randint(10, 90) / 100.0
+    num_queue_reqs = randint(0, 5)
+    ttft_sum = randint(100, 500) / 1000.0
+    ttft_count = randint(10, 100)
+    tpot_sum = randint(50, 200) / 1000.0
+    tpot_count = ttft_count
+
+    return f"""# HELP sglang:token_usage KV cache utilization ratio (0.0-1.0)
+# TYPE sglang:token_usage gauge
+sglang:token_usage{{model_name="{model_name}"}} {token_usage}
+# HELP sglang:num_queue_reqs Number of requests waiting in queue
+# TYPE sglang:num_queue_reqs gauge
+sglang:num_queue_reqs{{model_name="{model_name}"}} {num_queue_reqs}
+# HELP sglang:time_to_first_token_seconds Histogram of time to first token in seconds
+# TYPE sglang:time_to_first_token_seconds histogram
+sglang:time_to_first_token_seconds_bucket{{le="0.001",model_name="{model_name}"}} 0
+sglang:time_to_first_token_seconds_bucket{{le="0.005",model_name="{model_name}"}} 0
+sglang:time_to_first_token_seconds_bucket{{le="0.08",model_name="{model_name}"}} {int(ttft_count * 0.3)}
+sglang:time_to_first_token_seconds_bucket{{le="+Inf",model_name="{model_name}"}} {ttft_count}
+sglang:time_to_first_token_seconds_sum{{model_name="{model_name}"}} {ttft_sum}
+sglang:time_to_first_token_seconds_count{{model_name="{model_name}"}} {ttft_count}
+# HELP sglang:time_per_output_token_seconds Histogram of time per output token in seconds
+# TYPE sglang:time_per_output_token_seconds histogram
+sglang:time_per_output_token_seconds_bucket{{le="0.001",model_name="{model_name}"}} 0
+sglang:time_per_output_token_seconds_bucket{{le="0.005",model_name="{model_name}"}} {int(tpot_count * 0.5)}
+sglang:time_per_output_token_seconds_bucket{{le="0.08",model_name="{model_name}"}} {tpot_count}
+sglang:time_per_output_token_seconds_bucket{{le="+Inf",model_name="{model_name}"}} {tpot_count}
+sglang:time_per_output_token_seconds_sum{{model_name="{model_name}"}} {tpot_sum}
+sglang:time_per_output_token_seconds_count{{model_name="{model_name}"}} {tpot_count}
+"""
+
+
+@app.route("/metrics")
+def metrics():
+    """Serve a freshly generated metrics document in Prometheus text format."""
+    return Response(generate_sglang_metrics(), content_type=PROMETHEUS_CONTENT_TYPE)
+
+
+@app.route("/health")
+def health():
+    """Liveness endpoint: always answers 200 with the body ok."""
+    return "ok", 200
+
+
+if __name__ == "__main__":
+    # Bind all interfaces, not just loopback.
+    app.run(host="0.0.0.0", port=SGLANG_METRICS_PORT)
diff --git a/hack/mock-images/vllm-mock/Dockerfile b/hack/mock-images/vllm-mock/Dockerfile
new file mode 100644
index 000000000..54bf56855
--- /dev/null
+++ b/hack/mock-images/vllm-mock/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.11-slim
+
+# app.py reports import and config problems with print(); unbuffered stdout
+# makes those lines reach the container log immediately.
+ENV PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+# Install dependencies before copying the sources so this layer stays cached
+# when only the application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+CMD ["python", "app.py"]
diff --git a/hack/mock-images/vllm-mock/app.py b/hack/mock-images/vllm-mock/app.py
new file mode 100644
index 000000000..831d45078
--- /dev/null
+++ b/hack/mock-images/vllm-mock/app.py
@@ -0,0 +1,734 @@
+# Copyright The Volcano Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from flask import Flask, request, Response, jsonify
+from flask_httpauth import HTTPTokenAuth
+from werkzeug import serving
+import random
+import re
+import logging
+import sys
+import time
+from datetime import datetime
+from random import randint
+import os
+import json
+
+try:
+    from kubernetes import client, config
+except Exception as e:
+    print(f"Failed to import kubernetes, skip: {e}")
+    client = None
+    config = None
+
+from simulator import Simulator, Request
+
+# Global storage for overridden values
+overrides = {}
+
+MODEL_NAME = os.getenv('MODEL_NAME', 'deepseek-r1-1-5b')
+DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'deepseek-r1')
+NAMESPACE = os.getenv('POD_NAMESPACE', 'default')
+DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1'))
+SIMULATION = os.getenv('SIMULATION', 'disabled')
+
+modelMaps = {
+    "deepseek-r1-1-5b": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    "deepseek-r1-7b": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+}
+
+# Polyfill the necessary arguments.
+if "--replica_config_device" not in sys.argv:
+    sys.argv.append("--replica_config_device")
+    sys.argv.append(SIMULATION)
+if "--replica_config_model_name" not in sys.argv:
+    sys.argv.append("--replica_config_model_name")
+    sys.argv.append(modelMaps.get(MODEL_NAME, MODEL_NAME))
+
+tokenizer = None
+simulator: "Simulator | None" = None  # quoted so it is not evaluated: `Optional` was never imported and raised NameError here
+
+# Extract the api_key argument and prepare for authentication
+api_key = None
+try:
+    index = sys.argv.index("--api_key")
+    if index + 1 < len(sys.argv):
+        api_key = sys.argv[index + 1]
+except ValueError:
+    pass
+
+auth = HTTPTokenAuth(scheme='Bearer')
+
+
+@auth.verify_token
+def verify_token(token):
+    """Accept any token when no --api_key was supplied; otherwise require an exact match."""
+    # `or` short-circuits, so the comparison only runs when a key is configured.
+    return api_key is None or token == api_key
+
+
+@auth.error_handler
+def auth_error(status):  # `status` is unused: every auth failure is answered with 401
+    return jsonify({"error": "Unauthorized"}), 401
+
+
+logger = logging.getLogger(__name__)
+
+
+def read_configs(file_path):
+    """Load a JSON object from `file_path`; on any failure print the error and return {}."""
+    try:
+        with open(file_path, "r") as handle:
+            loaded = json.load(handle)
+        # A valid JSON document that is not an object (list, number, ...) goes
+        # through the same best-effort path as an unreadable file.
+        if isinstance(loaded, dict):
+            return loaded
+        raise Exception("invalid config format, dict expected.")
+    except Exception as err:
+        print(f"Error reading JSON file: {err}")
+        return {}
+
+
+configs = read_configs("config.json")
+HUGGINGFACE_TOKEN = configs.get("huggingface_token", "your huggingface token")
+
+
+def get_token_count(text):
+    """Estimate tokens as one per four characters: 0 for empty input, otherwise at least 1."""
+    if text:
+        return max(1, len(text) // 4)
+    return 0
+
+
+# Get the full model path from modelMaps based on MODEL_NAME env var
+BASE_MODEL_PATH = modelMaps.get(MODEL_NAME, MODEL_NAME)
+
+models = [
+    {
+        "id": BASE_MODEL_PATH,
+        "object": "model",
+        "created": 1715644056,
+        "owned_by": "vllm",
+        "root": BASE_MODEL_PATH,
+        "parent": None,
+        "permission": [
+            {
+                "id": "modelperm-cb1adf4457b2417e8c7770aadcffe4cc",
+                "object": "model_permission",
+                "created": 1715644056,
+                "allow_create_engine": False,
+                "allow_sampling": True,
+                "allow_logprobs": True,
+                "allow_search_indices": False,
+                "allow_view": True,
+                "allow_fine_tuning": False,
+                "organization": "*",
+                "group": None,
+                "is_blocking": False
+            }
+        ]
+    }
+]
+
+
+# Note: this is to suppress /metrics logs, gateway sends request to pods to scrape
+# the metrics and results in lots of meaningless requests that we do not want to log.
+def disable_endpoint_logs():
+    """Replace werkzeug's WSGIRequestHandler.log_request so hits on the listed paths are not logged."""
+    disabled_endpoints = ('/', '/healthz', '/metrics', '/v1/models')
+    parent_log_request = serving.WSGIRequestHandler.log_request  # kept so other paths still log as before
+
+    def log_request(self, *args, **kwargs):
+        if not any(re.match(f"{de}$", self.path) for de in disabled_endpoints):  # each entry is used as an anchored regex
+            parent_log_request(self, *args, **kwargs)
+
+    serving.WSGIRequestHandler.log_request = log_request
+
+
+app = Flask(__name__)
+disable_endpoint_logs()
+
+
+@app.route('/v1/models', methods=['GET'])
+@auth.login_required
+def get_models():
+    """List the base model plus every LoRA adapter currently registered in `models`."""
+    # The payload is a plain envelope around the live module-level list.
+    listing = {"object": "list", "data": models}
+    return jsonify(listing)
+
+
+@app.route('/v1/load_lora_adapter', methods=['POST'])
+@auth.login_required
+def load_model():
+    """Register the LoRA adapter named by `lora_name` in `models`; a repeated name is a no-op."""
+    payload = request.json
+    lora_name = payload.get('lora_name')
+    already_loaded = any(entry['id'] == lora_name for entry in models)
+    if already_loaded:
+        return jsonify({"status": "success", "message": "Model already loaded"}), 200
+
+    models.append({
+        'id': lora_name,
+        'created': int(time.time()),
+        'object': "model",
+        'owned_by': "vllm",
+        'parent': None,
+        'root': payload.get('lora_path')
+    })
+    return jsonify({"status": "success", "message": "Model loaded successfully"}), 200
+
+
+@app.route('/v1/unload_lora_adapter', methods=['POST'])
+@auth.login_required
+def unload_model():  # always answers success, even when no entry matches `lora_name`
+    global models
+    target = request.json.get('lora_name')
+    models = list(filter(lambda entry: entry['id'] != target, models))
+    return jsonify({"status": "success", "message": "Model unloaded successfully"}), 200
+
+
+@app.route('/v1/completions', methods=['POST'])
+@auth.login_required
+def completion():  # mock text-completion endpoint: canned reply, optionally delayed by the simulator
+    try:
+        prompt = request.json.get('prompt')
+        model = request.json.get('model')
+        max_tokens = request.json.get('max_tokens')
+        if not prompt or not model:
+            return jsonify({"status": "error", "message": "Prompt and model are required"}), 400
+
+        # Check if model exists in the models list (includes base model and loaded LoRA adapters)
+        if not any(m['id'] == model for m in models):
+            return jsonify({
+                "error": {
+                    "message": f"The model `{model}` does not exist",
+                    "type": "invalid_request_error",
+                    "param": "model",
+                    "code": "model_not_found"
+                }
+            }), 404
+
+        arrived_at = datetime.now().timestamp()
+        input_tokens = get_token_count(prompt)
+        output_tokens = max_tokens if max_tokens else randint(10, 500)  # random length when max_tokens is missing or 0
+        arrived_next = request.json.get('next_in')  # optional offset relative to this request's arrival
+        if not arrived_next:
+            arrived_next = 0.0
+        else:
+            arrived_next += arrived_at  # turn the offset into an absolute timestamp
+
+        start = datetime.now().timestamp()
+        latency = 0.0  # stays 0.0 (no artificial delay) when no simulator is set
+        if simulator is not None:
+            latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next))
+
+        # Simulated response: fixed id/fingerprint/text, usage built from the counts above
+        response = {
+            "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7",
+            "object": "text_completion",
+            "created": int(arrived_at),
+            "model": model,
+            "system_fingerprint": "fp_44709d6fcb",
+            "choices": [
+                {
+                    "text": f"This is simulated message from {model}!",
+                    "index": 0,
+                    "logprobs": None,
+                    "finish_reason": "length"
+                }
+            ],
+            "usage": {
+                "prompt_tokens": input_tokens,
+                "completion_tokens": output_tokens,
+                "total_tokens": input_tokens + output_tokens,
+                "time": latency
+            }
+        }
+        overhead = datetime.now().timestamp() - start  # time already spent since `start`
+        if latency > overhead:
+            time.sleep(latency - overhead)  # wait out only the remainder of the simulated latency
+        elif latency > 0.0:
+            logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}")
+
+        return jsonify(response), 200
+    except Exception as e:  # anything raised above is reported as a 500 server_error
+        err = {
+            "error": {
+                "message": f"The server had an error while processing your request: {e}",
+                "type": "server_error"
+            }
+        }
+        return jsonify(err), 500
+
+
+@app.route('/v1/chat/completions', methods=['POST'])
+@auth.login_required
+def chat_completions():
+    try:
+        messages = request.json.get('messages')
+        model = request.json.get('model')
+        max_tokens = request.json.get('max_tokens')
+        if not messages or not model:
+            return jsonify({"status": "error", "message": "Messages and model are required"}), 400
+
+        # Check if model exists in the models list (includes base model and loaded LoRA adapters)
+        if not any(m['id'] == model for m in models):
+            return jsonify({
+                "error": {
+                    "message": f"The model `{model}` does not exist",
+                    "type": "invalid_request_error",
+                    "param": "model",
+                    "code": "model_not_found"
+                }
+            }), 404
+
+        arrived_at = datetime.now().timestamp()
+        input_tokens = sum(get_token_count(message["content"]) for message in messages)
+        output_tokens = max_tokens if max_tokens else randint(10, 500)
+        arrived_next = request.json.get('next_in')
+        if not arrived_next:
+            arrived_next = 0.0
+        else:
+            arrived_next += arrived_at
+
+        start = datetime.now().timestamp()
+        latency = 0.0
+        if simulator is not None:
+            latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next))
+
+        # Simulated response
+        response = {
+            "id": "chatcmpl-abc123",
+            "object": "chat.completion",
+            "created": int(arrived_at),
+            "model": model,
+            "usage": {
+                "prompt_tokens": input_tokens,
+                "completion_tokens": output_tokens,
+                "total_tokens": input_tokens + output_tokens,
+                "time": latency
+            },
+            "choices": [
+                {
+                    "message": {
+                        "role": "assistant",
+                        "content": f"\n\nThis is simulated message from {model}!"
+                    },
+                    "logprobs": None,
+                    "finish_reason": "stop",
+                    "index": 0
+                }
+            ]
+        }
+        overhead = datetime.now().timestamp() - start
+        if latency > overhead:
+            time.sleep(latency - overhead)
+        else:
+            logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}")
+
+        return jsonify(response), 200
+    except Exception as e:
+        err = {
+            "error": {
+                "message": f"The server had an error while processing your request: {e}",
+                "type": "server_error"
+            }
+        }
+        return jsonify(err), 500
+
+
+@app.route('/set_metrics', methods=['POST'])  # note: no @auth.login_required here, unlike the /v1 routes
+def set_metrics():  # Merge the posted JSON object into the module-level 'overrides' dict.
+    global overrides
+    # Get JSON data from the request
+    data = request.json
+    if data:
+        # Update overrides with new key-value pairs
+        overrides.update(data)
+        return {"status": "success", "message": "Overrides updated"}, 200
+    else:  # falsy body (e.g. an empty object)
+        return {"status": "error", "message": "No data provided"}, 400
+
+
+# Histogram state that generate_histogram_metric() accumulates across calls (starts empty)
+metrics_state = {}
+
+
+def generate_histogram_metric(metric_name, description, model_name, buckets, new_requests, help_header=True):
+    """
+    Generate Prometheus histogram metrics with dynamically updated bucket values.
+
+    Args:
+        metric_name (str): Name of the metric.
+        description (str): Metric description.
+        model_name (str): Model name.
+        buckets (list): List of bucket boundaries.
+        new_requests (dict): Dictionary with new requests to update bucket values.
+        help_header: the flag to include HELP Header
+
+    Returns:
+        str: Prometheus-formatted histogram metric.
+    """
+    global metrics_state
+
+    # Initialize state if not already present
+    if metric_name not in metrics_state:
+        metrics_state[metric_name] = {
+            "buckets": {bucket: 0 for bucket in buckets},  # Bucket values
+            "total_sum": 0,  # Total sum of all values
+            "total_count": 0  # Total count of all events
+        }
+
+    # Retrieve current metric state
+    current_state = metrics_state[metric_name]
+
+    # Update buckets and ensure cumulative nature
+    for bucket in buckets:
+        if bucket in new_requests:
+            # Add new requests for this bucket
+            current_state["buckets"][bucket] += new_requests[bucket]
+
+        # Ensure cumulative updates for histogram buckets
+        if bucket != buckets[0]:  # Skip the first bucket
+            current_state["buckets"][bucket] = max(
+                current_state["buckets"][bucket],
+                current_state["buckets"][buckets[buckets.index(bucket) - 1]]
+            )
+
+    # Update total_count and total_sum
+    current_state["total_count"] = current_state["buckets"][buckets[-1]]  # `+Inf` bucket is the total count
+    current_state["total_sum"] += sum(
+        float(bucket) * value for bucket, value in new_requests.items() if bucket != "+Inf"
+    )
+
+    # Generate Prometheus bucket strings
+    bucket_strings = "\n".join(
+        [f'vllm:{metric_name}_bucket{{le="{bucket}",model_name="{model_name}"}} {current_state["buckets"][bucket]}'
+         for bucket in buckets]
+    )
+
+    # Return formatted histogram metric
+    histogram_template = """
+# HELP vllm:{metric_name} {description}
+# TYPE vllm:{metric_name} histogram
+vllm:{metric_name}_sum{{model_name="{model_name}"}} {value}
+{buckets}
+vllm:{metric_name}_count{{model_name="{model_name}"}} {count}
+""" if help_header else """
+vllm:{metric_name}_sum{{model_name="{model_name}"}} {value}
+{buckets}
+vllm:{metric_name}_count{{model_name="{model_name}"}} {count}
+"""
+
+    return histogram_template.format(
+        metric_name=metric_name,
+        description=description,
+        model_name=model_name,
+        value=current_state["total_sum"],
+        buckets=bucket_strings,
+        count=current_state["total_count"]
+    )
+
+
+def generate_counter_gauge_metric(metric_name, metric_type, description, model_name, value, help_header=True):
+    """
+    Generates a Prometheus metric string for counter or gauge.
+
+    Args:
+        metric_name (str): The name of the metric (emitted with a "vllm:" prefix).
+        metric_type (str): The type of the metric ('counter' or 'gauge').
+        description (str): The HELP description of the metric.
+        model_name (str): The name of the model, emitted as the model_name label.
+        value (float): The value of the metric.
+        help_header (bool): Whether to emit the "# HELP" / "# TYPE" lines before the sample.
+
+    Returns:
+        str: A formatted Prometheus metric string.
+    """
+    counter_gauge_template = """
+# HELP vllm:{metric_name} {description}
+# TYPE vllm:{metric_name} {metric_type}
+vllm:{metric_name}{{model_name="{model_name}"}} {value}
+""" if help_header else """
+vllm:{metric_name}{{model_name="{model_name}"}} {value}
+"""
+
+    return counter_gauge_template.format(  # unused keys are ignored by the header-less template
+        metric_name=metric_name,
+        metric_type=metric_type,
+        description=description,
+        model_name=model_name,
+        value=value
+    )
+
+
+@app.route('/metrics')
+def metrics():
+    # get deployment information
+    try:
+        apps_v1 = client.AppsV1Api()
+        resp = apps_v1.read_namespaced_deployment(DEPLOYMENT_NAME, NAMESPACE)
+        replicas = resp.spec.replicas if resp.spec.replicas is not None else 1
+    except Exception as e:
+        #print(f"Failed to get deployment information: {DEPLOYMENT_NAME=} {NAMESPACE=} error={str(e)}")
+        #print(f"Due to the failure, replicas {DEFAULT_REPLICAS} will be used to calculate metrics")
+        replicas = DEFAULT_REPLICAS
+
+    # a reasonable mock total value
+    total = overrides.get("total", 100.0)
+    model_name = overrides.get("model_name", MODEL_NAME)
+    # calculate metrics with potential overrides
+    success_total = overrides.get("success_total", total / replicas)
+    avg_prompt_throughput = overrides.get("avg_prompt_throughput", total / replicas if replicas > 0 else 0)
+    avg_generation_throughput = overrides.get("avg_generation_throughput", total / replicas if replicas > 0 else 0)
+    prompt_tokens_total = overrides.get("prompt_tokens_total", randint(100, 1024) * success_total)
+    generation_tokens_total = overrides.get("generation_tokens_total", randint(100, 1024) * success_total)
+    running = overrides.get("running", randint(1, 100))
+    cpu_running = overrides.get("cpu_running", randint(1, 100))
+    waiting = overrides.get("waiting", randint(1, 5))
+    swapped = overrides.get("swapped", randint(1, 100))
+    max_running_capacity = 100
+    gpu_cache_usage_perc = overrides.get("gpu_cache_usage_perc", min(1.0, (running / max_running_capacity)))
+    cpu_cache_usage_perc = overrides.get("cpu_cache_usage_perc", min(1.0, (cpu_running / max_running_capacity)))
+
+    # Define metrics and their attributes
+    simple_metrics = [
+        {
+            "name": "prompt_tokens_total",
+            "type": "counter",
+            "description": "Count of prefill tokens processed.",
+            "value": overrides.get("prompt_tokens_total", prompt_tokens_total)
+        },
+        {
+            "name": "generation_tokens_total",
+            "type": "counter",
+            "description": "Count of generation tokens processed.",
+            "value": overrides.get("generation_tokens_total", generation_tokens_total)
+        },
+        {
+            "name": "request_success_total",
+            "type": "counter",
+            "description": "Count of successfully processed requests.",
+            "value": overrides.get("success_total", success_total)
+        },
+        {
+            "name": "num_requests_running",
+            "type": "gauge",
+            "description": "Number of requests currently running on GPU.",
+            "value": overrides.get("running", running)
+        },
+        {
+            "name": "num_requests_swapped",
+            "type": "gauge",
+            "description": "Number of requests swapped to CPU.",
+            "value": overrides.get("swapped", swapped)
+        },
+        {
+            "name": "num_requests_waiting",
+            "type": "gauge",
+            "description": "Number of requests waiting to be processed.",
+            "value": overrides.get("waiting", waiting)
+        },
+        {
+            "name": "avg_prompt_throughput_toks_per_s",
+            "type": "gauge",
+            "description": "Average prefill throughput in tokens/s.",
+            "value": overrides.get("avg_prompt_throughput", avg_prompt_throughput)
+        },
+        {
+            "name": "avg_generation_throughput_toks_per_s",
+            "type": "gauge",
+            "description": "Average generation throughput in tokens/s.",
+            "value": overrides.get("avg_generation_throughput", avg_generation_throughput)
+        },
+        {
+            "name": "gpu_cache_usage_perc",
+            "type": "gauge",
+            "description": "GPU KV-cache usage. 1 means 100 percent usage.",
+            "value": overrides.get(
+                "gpu_cache_usage_perc", gpu_cache_usage_perc
+            )
+        },
+        {
+            "name": "cpu_cache_usage_perc",
+            "type": "gauge",
+            "description": "CPU KV-cache usage. 1 means 100 percent usage.",
+            "value": overrides.get(
+                "cpu_cache_usage_perc", cpu_cache_usage_perc
+            )
+        },
+    ]
+
+    # Generate all metrics
+    metrics_output = ""
+    for metric in simple_metrics:
+        metrics_output += generate_counter_gauge_metric(metric["name"], metric["type"], metric["description"],
+                                                        model_name, metric["value"])
+        metrics_output += generate_counter_gauge_metric(metric["name"], metric["type"], metric["description"],
+                                                        "lora-A", metric["value"], help_header=False)
+        
+    
+    metrics_output += """
+# HELP vllm:lora_requests_info Running stats on lora requests.
+# TYPE vllm:lora_requests_info gauge
+vllm:lora_requests_info{max_lora="1",running_lora_adapters="lora-A",waiting_lora_adapters=""} 1
+"""
+
+    histogram_metrics = [
+        {
+            "name": "iteration_tokens_total",
+            "type": "histogram",
+            "description": "Histogram of number of tokens per engine_step.",
+            "buckets": ["1.0", "8.0", "16.0", "32.0", "64.0", "128.0", "256.0",
+                        "512.0", "1024.0", "2048.0", "4096.0", "8192.0", "+Inf"]
+        },
+        {
+            "name": "time_to_first_token_seconds",
+            "type": "histogram",
+            "description": "Histogram of time to first token in seconds.",
+            "buckets": ["0.001", "0.005", "0.01", "0.02", "0.04", "0.06",
+                        "0.08", "0.1", "0.25", "0.5", "+Inf"]
+        },
+        {
+            "name": "time_per_output_token_seconds",
+            "type": "histogram",
+            "description": "Histogram of time per output token in seconds.",
+            "buckets": ["0.01", "0.025", "0.05", "0.075", "0.1", "0.15",
+                        "0.2", "0.3", "0.4", "+Inf"]
+        },
+        {
+            "name": "request_prompt_tokens",
+            "type": "histogram",
+            "description": "Histogram of number of prefill tokens processed.",
+            "buckets": ["1.0", "2.0", "5.0", "10.0", "20.0", "50.0",
+                        "100.0", "200.0", "500.0", "1000.0", "2000.0",
+                        "5000.0", "10000.0", "+Inf"]
+        },
+        {
+            "name": "request_generation_tokens",
+            "type": "histogram",
+            "description": "Histogram of number of generation tokens processed.",
+            "buckets": ["1.0", "2.0", "5.0", "10.0", "20.0", "50.0",
+                        "100.0", "200.0", "500.0", "1000.0", "2000.0",
+                        "5000.0", "10000.0", "+Inf"]
+        },
+        {
+            "name": "e2e_request_latency_seconds",
+            "type": "histogram",
+            "description": "Histogram of end to end request latency in seconds.",
+            "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"]
+        },
+        {
+            "name": "request_queue_time_seconds",
+            "type": "histogram",
+            "description": "Histogram of time spent in WAITING phase for request.",
+            "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"]
+        },
+        {
+            "name": "request_inference_time_seconds",
+            "type": "histogram",
+            "description": "Histogram of time spent in RUNNING phase for request.",
+            "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"]
+        },
+        {
+            "name": "request_decode_time_seconds",
+            "type": "histogram",
+            "description": "Histogram of time spent in DECODE phase for request.",
+            "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"]
+        },
+        {
+            "name": "request_prefill_time_seconds",
+            "type": "histogram",
+            "description": "Histogram of time spent in PREFILL phase for request.",
+            "buckets": ["0.3", "0.5", "0.8", "1.0", "1.5", "2.0", "5.0", "+Inf"]
+        },
+    ]
+
+    # Generate metrics output
+    histogram_metrics_output = ""
+    for metric in histogram_metrics:
+        # Simulate random new requests for the metric
+        new_requests = {bucket: random.randint(0, 5) for bucket in metric["buckets"]}
+        histogram_metrics_output += generate_histogram_metric(
+            metric_name=metric["name"],
+            description=metric["description"],
+            model_name=model_name,
+            buckets=metric["buckets"],
+            new_requests=new_requests
+        )
+        new_requests = {bucket: random.randint(0, 5) for bucket in metric["buckets"]}
+        histogram_metrics_output += generate_histogram_metric(
+            metric_name=metric["name"],
+            description=metric["description"],
+            model_name="lora-A",
+            buckets=metric["buckets"],
+            new_requests=new_requests,
+            help_header=False
+        )
+
+    return Response(metrics_output + histogram_metrics_output, mimetype='text/plain')
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.DEBUG)
+    logging.getLogger("kubernetes.client.rest").setLevel(logging.ERROR)  # Suppress kubernetes logs
+
+    print(f"Starting app. DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}, MODEL: {MODEL_NAME}")
+
+    # Read the value following --replica_config_device straight from sys.argv (argparse is not used)
+    gpu_device = "disabled"
+    try:
+        index = sys.argv.index("--replica_config_device")
+        if index + 1 < len(sys.argv):
+            gpu_device = sys.argv[index + 1]
+    except ValueError:  # flag not present: keep "disabled"
+        pass
+
+    # -h only prints a stub message; start-up continues afterwards
+    if '-h' in sys.argv:
+        print("Mock server help: [no options available]")
+
+    # Launch simulator (only when a device other than "disabled" was requested)
+    if gpu_device != "disabled":
+        # Simplified simulator initialization
+        simulator = Simulator()
+        overrides = {  # replaces the module-level overrides with zeroed queue gauges
+            "total": 100.0,
+            "running": 0,
+            "waiting": 0,
+            "swapped": 0
+        }
+
+    thread = None
+    if simulator is not None:  # NOTE(review): relies on a module-level 'simulator' default defined outside this view — confirm
+        # TODO: Move simulation to a separate workflow, independent of the main web service
+        thread = simulator.start()
+
+    # Serve unless --time_limit was passed; with that flag the web server is skipped entirely
+    if '--time_limit' not in sys.argv:
+        try:
+            # config.load_kube_config()
+            config.load_incluster_config()
+        except Exception as e:  # best effort: the server still starts without a k8s config
+            print(f"Failed to load k8s config: {e}")
+
+        # app.run(host='0.0.0.0', port=8080)
+        app.run(host='0.0.0.0', port=8000)  # blocks until the server stops
+
+    if simulator is not None:
+        simulator.stop()
+
+    if thread is not None:
+        thread.join()
diff --git a/hack/mock-images/vllm-mock/config.json b/hack/mock-images/vllm-mock/config.json
new file mode 100644
index 000000000..9df272fe9
--- /dev/null
+++ b/hack/mock-images/vllm-mock/config.json
@@ -0,0 +1,3 @@
+{
+    "huggingface_token": "your huggingface token"
+}
\ No newline at end of file
diff --git a/hack/mock-images/vllm-mock/requirements.txt b/hack/mock-images/vllm-mock/requirements.txt
new file mode 100644
index 000000000..fcee99ce9
--- /dev/null
+++ b/hack/mock-images/vllm-mock/requirements.txt
@@ -0,0 +1,3 @@
+flask
+Flask-HTTPAuth
+kubernetes
diff --git a/hack/mock-images/vllm-mock/simulator.py b/hack/mock-images/vllm-mock/simulator.py
new file mode 100644
index 000000000..89e301317
--- /dev/null
+++ b/hack/mock-images/vllm-mock/simulator.py
@@ -0,0 +1,49 @@
+# Copyright The Volcano Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import threading
+import time
+import random
+from typing import Optional
+
+class Request:  # Plain value object describing one simulated inference request.
+    def __init__(self, arrived_at, input_tokens, output_tokens, arrived_next=0):
+        self.arrived_at = arrived_at  # stored as given; Simulator.execute does not read it
+        self.input_tokens = input_tokens  # read by Simulator.execute
+        self.output_tokens = output_tokens  # read by Simulator.execute
+        self.arrived_next = arrived_next  # stored as given; Simulator.execute does not read it
+
+class Simulator:  # Minimal stand-in simulator: an idle background thread plus a token-count latency formula.
+    def __init__(self, config=None):  # 'config' is accepted but not used
+        self._terminate = False  # set to True by stop() to end the thread created in start()
+
+    def start(self):  # Starts the background thread and returns it so the caller can join() it.
+        # Dummy thread for compatibility
+        def dummy_run():
+            while not self._terminate:
+                time.sleep(1)  # the stop flag is re-checked once per second
+
+        t = threading.Thread(target=dummy_run)
+        t.start()
+        return t
+
+    def stop(self):  # Requests shutdown; the thread exits at its next flag check.
+        self._terminate = True
+
+    def execute(self, request: Request) -> float:  # Returns a mock latency derived from the request's token counts.
+        # Simple latency mock: base delay + per-token delay
+        base_latency = 0.05
+        per_token_latency = 0.002
+        latency = base_latency + (request.input_tokens + request.output_tokens) * per_token_latency
+        latency *= random.uniform(0.9, 1.1)  # scale by a random factor between 0.9 and 1.1
+        return latency
diff --git a/hack/mock-images/vllm-mock/test_app.py b/hack/mock-images/vllm-mock/test_app.py
new file mode 100644
index 000000000..f7f72b7e6
--- /dev/null
+++ b/hack/mock-images/vllm-mock/test_app.py
@@ -0,0 +1,46 @@
+# Copyright The Volcano Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from app import app
+
+
+class FlaskTestCase(unittest.TestCase):
+
+    def setUp(self):
+        self.client = app.test_client()
+
+    def test_metrics(self):
+        expected_total = 100
+        replica = 3
+        response = self.client.get('/metrics')
+        self.assertEqual(response.status_code, 200)
+        data = response.data.decode()
+        print(f"response data: \n<<<<<<<<<<<<<<<<\n{data}\n<<<<<<<<<<<<<<<<")
+        # metrics exists
+        self.assertIn('vllm:request_success_total', data)
+        self.assertIn('vllm:avg_prompt_throughput_toks_per_s', data)
+        self.assertIn('vllm:avg_generation_throughput_toks_per_s', data)
+
+        # assert metric value
+        self.assertIn(
+            f'vllm:request_success_total{{finished_reason="stop",model_name="llama2-70b"}} {expected_total / replica}',
+            data)
+        self.assertIn(f'vllm:avg_prompt_throughput_toks_per_s{{model_name="llama2-70b"}} {expected_total / replica}',
+                      data)
+        self.assertIn(
+            f'vllm:avg_generation_throughput_toks_per_s{{model_name="llama2-70b"}} {expected_total / replica}', data)
+
+
+if __name__ == '__main__':  # allow running this test file directly
+    unittest.main()
diff --git a/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml b/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml
index ff30fb59a..5f8fd5cee 100644
--- a/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml
+++ b/test/e2e/router/testdata/LLM-Mock-ds1.5b-Canary.yaml
@@ -21,7 +21,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
@@ -49,7 +49,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml b/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml
index 03b9e9fa5..ae7c77cda 100644
--- a/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml
+++ b/test/e2e/router/testdata/LLM-Mock-ds1.5b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/test/e2e/router/testdata/LLM-Mock-ds7b.yaml b/test/e2e/router/testdata/LLM-Mock-ds7b.yaml
index faa8c76fd..9951505d7 100644
--- a/test/e2e/router/testdata/LLM-Mock-ds7b.yaml
+++ b/test/e2e/router/testdata/LLM-Mock-ds7b.yaml
@@ -19,7 +19,7 @@ spec:
     spec:
       containers:
         - name: llm-engine
-          image: ghcr.io/yaozengzeng/vllm-mock:latest
+          image: ghcr.io/volcano-sh/vllm-mock:latest
           imagePullPolicy: IfNotPresent
           env:
             # specify the model name to mock
diff --git a/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml b/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml
index 71026345d..d385ab500 100644
--- a/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml
+++ b/test/e2e/router/testdata/ModelServing-ds1.5b-pd-disaggregation.yaml
@@ -19,7 +19,7 @@ spec:
           spec:
             containers:
               - name: leader
-                image: ghcr.io/yaozengzeng/vllm-mock:latest
+                image: ghcr.io/volcano-sh/vllm-mock:latest
                 imagePullPolicy: IfNotPresent
                 env:
                   # specify the model name to mock
@@ -39,7 +39,7 @@ spec:
           spec:
             containers:
               - name: leader
-                image: ghcr.io/yaozengzeng/vllm-mock:latest
+                image: ghcr.io/volcano-sh/vllm-mock:latest
                 imagePullPolicy: IfNotPresent
                 env:
                   # specify the model name to mock