Skip to content

Commit ec44bd5

Browse files
committed
add external UDS based tokenizer service
Signed-off-by: Hang Yin <[email protected]>
1 parent d070ea6 commit ec44bd5

File tree

23 files changed

+1905
-5
lines changed

23 files changed

+1905
-5
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ __pycache__/
2020
*.pyc
2121
*.pyo
2222
*.pyd
23+
*.python-version
2324

2425
# Go workspace file
2526
go.work

examples/kv_events/online/main.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
3535
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvevents"
3636
preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions"
37+
"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
3738
)
3839

3940
const (
@@ -51,6 +52,8 @@ const (
5152

5253
envHTTPPort = "HTTP_PORT"
5354
defaultHTTPPort = "8080"
55+
56+
envExternalTokenization = "EXTERNAL_TOKENIZATION"
5457
)
5558

5659
// ChatCompletionsRequest holds the fields needed for chat-completions rendering.
@@ -177,6 +180,12 @@ func getKVCacheIndexerConfig() *kvcache.Config {
177180
config.TokenProcessorConfig.BlockSize = blockSize
178181
}
179182

183+
useExternalTokenization, err := strconv.ParseBool(os.Getenv(envExternalTokenization))
184+
if err == nil && useExternalTokenization {
185+
config.TokenizersPoolConfig.UdsTokenizerConfig = &tokenization.UdsTokenizerConfig{}
186+
config.TokenizersPoolConfig.HFTokenizerConfig = nil
187+
}
188+
180189
config.KVBlockIndexConfig.EnableMetrics = true
181190
config.KVBlockIndexConfig.MetricsLoggingInterval = 30 * time.Second
182191

examples/uds_tokenizer/Dockerfile

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Copyright 2025 The llm-d Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Build stage
16+
FROM --platform=$TARGETPLATFORM registry-cn-hangzhou.ack.aliyuncs.com/dev/python:3.10-slim-test as builder
17+
18+
# Set build arguments
19+
ARG TARGETPLATFORM
20+
ARG BUILDPLATFORM
21+
22+
# Set working directory
23+
WORKDIR /app
24+
25+
# Copy dependencies and install them
26+
COPY requirements.txt /app/requirements.txt
27+
RUN pip install --no-cache-dir -r requirements.txt
28+
29+
# Runtime stage
30+
FROM --platform=$TARGETPLATFORM registry-cn-hangzhou.ack.aliyuncs.com/dev/python:3.10-slim-test
31+
32+
# Set working directory
33+
WORKDIR /app
34+
35+
RUN apt-get update && apt-get upgrade -y && rm -rf /var/cache/apt/
36+
37+
# Copy installed dependencies from build stage
38+
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
39+
# Copy executables from build stage
40+
COPY --from=builder /usr/local/bin/ /usr/local/bin/
41+
42+
# Copy project files into the image
43+
COPY server.py /app/
44+
COPY requirements.txt /app/requirements.txt
45+
COPY gunicorn.conf.py /app/gunicorn.conf.py
46+
COPY start_gunicorn.sh /app/start_gunicorn.sh
47+
COPY models/ /app/models/
48+
COPY utils/ /app/utils/
49+
COPY tokenizer_service/ /app/tokenizer_service/
50+
51+
# Make startup script executable
52+
RUN chmod +x /app/start_gunicorn.sh
53+
54+
# Create directory for UDS socket
55+
RUN mkdir -p /tmp/tokenizer && chown 65532:65532 /tmp/tokenizer
56+
57+
# Create model cache directories and set permissions
58+
RUN mkdir -p /app/models && chown -R 65532:65532 /app/models
59+
# Create and set permissions for ModelScope directory
60+
RUN mkdir -p /.modelscope && chown -R 65532:65532 /.modelscope
61+
# Create and set permissions for Hugging Face cache directory
62+
RUN mkdir -p /.cache && chown -R 65532:65532 /.cache
63+
64+
# Switch to non-root user
65+
USER 65532:65532
66+
67+
# Startup command: run UDS server
68+
CMD ["/app/start_gunicorn.sh"]

examples/uds_tokenizer/README.md

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# UDS Tokenizer Service
2+
3+
This service provides tokenization functionality via HTTP over a Unix Domain Socket (UDS). It also exposes a separate HTTP endpoint for Kubernetes health checks.
4+
5+
## Features
6+
7+
- Apply chat templates to messages
8+
- Tokenize text prompts
9+
- Runtime configuration updates
10+
- Health check endpoint for Kubernetes
11+
- Support for multiple model formats (HuggingFace, ModelScope)
12+
- Automatic model downloading and caching
13+
14+
## Services
15+
16+
The service exposes multiple endpoints:
17+
18+
1. `/chat-template` - Apply a chat template to messages (UDS only)
19+
2. `/tokenize` - Tokenize text (UDS only)
20+
3. `/health` - Health check endpoint (TCP port, for Kubernetes probes)
21+
4. `/config` - Get or update configuration (UDS only)
22+
23+
## Quick Start
24+
25+
Start the service:
26+
```bash
27+
python server.py
28+
```
29+
30+
Or using Gunicorn for production:
31+
```bash
32+
./start_gunicorn.sh
33+
```
34+
35+
The service will:
36+
- Listen on `/tmp/tokenizer/tokenizer-uds.socket` for main functionality
37+
- Listen on port 8080 (configurable via PROBE_PORT) for health checks
38+
39+
## Environment Variables
40+
41+
| Variable | Description | Default |
42+
|---------|-------------|---------|
43+
| `LOG_LEVEL` | Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) | INFO |
44+
| `WORKERS` | Number of worker processes when using Gunicorn | CPU cores * 2 + 1 |
45+
| `MODEL` | Path to the model directory | ./models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B |
46+
| `ADD_SPECIAL_TOKENS` | Whether to add special tokens | true |
47+
| `ENABLE_THINKING` | Whether to enable thinking mode | false |
48+
| `ADD_GENERATION_PROMPT` | Whether to add generation prompt | true |
49+
| `PROBE_PORT` | Port for health check endpoint | 8080 |
50+
| `USE_MODELSCOPE` | Whether to download tokenizer files from ModelScope (true) or Hugging Face (false) | false |
51+
52+
## API Endpoints
53+
54+
### POST /chat-template
55+
Apply a chat template to a list of messages.
56+
57+
Request body:
58+
```json
59+
[
60+
{
61+
"role": "system",
62+
"content": "You are a helpful assistant."
63+
},
64+
{
65+
"role": "user",
66+
"content": "Hello!"
67+
}
68+
]
69+
```
70+
71+
Response:
72+
Plain text with the formatted prompt.
73+
74+
### POST /tokenize
75+
Tokenize a text prompt.
76+
77+
Request body:
78+
```
79+
Text to tokenize
80+
```
81+
82+
Response:
83+
JSON with tokenization results:
84+
- `input_ids`: List of token IDs
85+
- `attention_mask`: Attention mask for the tokens
86+
87+
### GET /health
88+
Health check endpoint for Kubernetes probes.
89+
90+
Response:
91+
```json
92+
{
93+
"status": "healthy",
94+
"service": "tokenizer-service",
95+
"timestamp": 1234567890.123
96+
}
97+
```
98+
99+
### GET /config
100+
Get current configuration.
101+
102+
Response:
103+
```json
104+
{
105+
"model": "./models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
106+
"add_special_tokens": true,
107+
"enable_thinking": false,
108+
"add_generation_prompt": true
109+
}
110+
```
111+
112+
### POST /config
113+
Update configuration at runtime.
114+
115+
Request body:
116+
```json
117+
{
118+
"model": "./models/qwen/qwen3-0.6b",
119+
"add_special_tokens": false
120+
}
121+
```
122+
123+
Response:
124+
```json
125+
{
126+
"status": "success",
127+
"message": "Configuration updated successfully"
128+
}
129+
```
130+
131+
## Testing
132+
133+
### Unit Tests
134+
135+
Run unit tests with mocks (no service needed):
136+
```bash
137+
# Install test dependencies
138+
pip install -r tests/requirements.txt
139+
140+
# Run unit tests
141+
python -m pytest tests/test_tokenizer_unit.py -v
142+
```
143+
144+
### Integration Tests
145+
146+
Run integration tests (requires service to be running):
147+
```bash
148+
# Start the service in the background
149+
python server.py &
150+
151+
# Run integration tests with automatic waiting
152+
python tests/run_integration_tests.py
153+
154+
# Stop the service
155+
pkill -f "python server.py"
156+
```
157+
158+
The integration test runner will automatically wait for the server to be ready before running tests.
159+
160+
## Kubernetes Deployment
161+
162+
The service is designed to run in Kubernetes with:
163+
- A shared `emptyDir` volume for UDS communication between containers
164+
- Health check endpoint for liveness and readiness probes
165+
- Proper security context with non-root user
166+
167+
## Model Support
168+
169+
The service supports:
170+
- HuggingFace models (local or remote)
171+
- ModelScope models (automatically downloaded and cached)
172+
- Custom models in standard format
173+
174+
Models are automatically downloaded and cached in the `models/` directory.
175+
The source for downloading can be controlled with the `USE_MODELSCOPE` environment variable:
176+
- `false` (default): Download from Hugging Face
177+
- `true`: Download from ModelScope
178+
179+
See [models/README.md](models/README.md) for detailed information about model caching, pre-populating the cache, and Kubernetes deployment strategies.
180+
181+
## Project Structure
182+
183+
```
184+
├── server.py # Main server entry point
185+
├── tokenizer_service/ # Core tokenizer service implementation
186+
│ ├── __init__.py
187+
│ ├── tokenizer.py # Tokenizer service implementation
188+
│ └── exceptions.py # Custom exceptions
189+
├── utils/ # Utility functions
190+
│ ├── __init__.py
191+
│ └── logger.py # logger functionality
192+
├── tests/ # Test files
193+
│ ├── __init__.py
194+
│ ├── run_integration_tests.py # Integration test runner
195+
│ ├── test_tokenizer_unit.py # Unit tests
196+
│ └── test_tokenizer_service.py # Legacy integration test
197+
├── models/ # Model files (downloaded automatically)
198+
├── client/ # Client examples
199+
├── requirements.txt # Python dependencies
200+
├── gunicorn.conf.py # Gunicorn configuration
201+
├── start_gunicorn.sh # Gunicorn startup script
202+
└── README.md # This file
203+
```
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2025 The llm-d Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Gunicorn configuration file
16+
import os
17+
import multiprocessing
18+
19+
# Server configuration
20+
bind = "unix:/tmp/tokenizer/tokenizer-uds.socket"
21+
workers = int(os.getenv("WORKERS", multiprocessing.cpu_count() * 2 + 1))
22+
worker_class = "aiohttp.GunicornWebWorker"
23+
worker_connections = 1000
24+
max_requests = 1000
25+
max_requests_jitter = 100
26+
27+
# Logging configuration
28+
accesslog = "-"
29+
errorlog = "-"
30+
loglevel = os.getenv("LOG_LEVEL", "info").lower()
31+
# Use aiohttp-style log format
32+
access_log_format = '%a %t "%r" %s %b "%{Referer}i" "%{User-Agent}i" %Tf'
33+
34+
# Process naming
35+
proc_name = "tokenizer-service"
36+
37+
# Timeout settings
38+
timeout = 30
39+
graceful_timeout = 30
40+
keepalive = 5
41+
42+
preload_app = True

0 commit comments

Comments
 (0)