Skip to content

Commit ec44bd5

Browse files
committed
add external UDS based tokenizer service
Signed-off-by: Hang Yin <[email protected]>
1 parent d070ea6 commit ec44bd5

File tree

23 files changed

+1905
-5
lines changed

23 files changed

+1905
-5
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ __pycache__/
2020
*.pyc
2121
*.pyo
2222
*.pyd
23+
*.python-version
2324

2425
# Go workspace file
2526
go.work

examples/kv_events/online/main.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvblock"
3535
"github.com/llm-d/llm-d-kv-cache-manager/pkg/kvcache/kvevents"
3636
preprocessing "github.com/llm-d/llm-d-kv-cache-manager/pkg/preprocessing/chat_completions"
37+
"github.com/llm-d/llm-d-kv-cache-manager/pkg/tokenization"
3738
)
3839

3940
const (
@@ -51,6 +52,8 @@ const (
5152

5253
envHTTPPort = "HTTP_PORT"
5354
defaultHTTPPort = "8080"
55+
56+
envExternalTokenization = "EXTERNAL_TOKENIZATION"
5457
)
5558

5659
// ChatCompletionsRequest holds the fields needed for chat-completions rendering.
@@ -177,6 +180,12 @@ func getKVCacheIndexerConfig() *kvcache.Config {
177180
config.TokenProcessorConfig.BlockSize = blockSize
178181
}
179182

183+
useExternalTokenization, err := strconv.ParseBool(os.Getenv(envExternalTokenization))
184+
if err == nil && useExternalTokenization {
185+
config.TokenizersPoolConfig.UdsTokenizerConfig = &tokenization.UdsTokenizerConfig{}
186+
config.TokenizersPoolConfig.HFTokenizerConfig = nil
187+
}
188+
180189
config.KVBlockIndexConfig.EnableMetrics = true
181190
config.KVBlockIndexConfig.MetricsLoggingInterval = 30 * time.Second
182191

examples/uds_tokenizer/Dockerfile

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Copyright 2025 The llm-d Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Build stage
16+
FROM --platform=$TARGETPLATFORM registry-cn-hangzhou.ack.aliyuncs.com/dev/python:3.10-slim-test as builder
17+
18+
# Set build arguments
19+
ARG TARGETPLATFORM
20+
ARG BUILDPLATFORM
21+
22+
# Set working directory
23+
WORKDIR /app
24+
25+
# Copy dependencies and install them
26+
COPY requirements.txt /app/requirements.txt
27+
RUN pip install --no-cache-dir -r requirements.txt
28+
29+
# Runtime stage
30+
FROM --platform=$TARGETPLATFORM registry-cn-hangzhou.ack.aliyuncs.com/dev/python:3.10-slim-test
31+
32+
# Set working directory
33+
WORKDIR /app
34+
35+
RUN apt-get update && apt-get upgrade -y && rm -rf /var/cache/apt/
36+
37+
# Copy installed dependencies from build stage
38+
COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages
39+
# Copy executables from build stage
40+
COPY --from=builder /usr/local/bin/ /usr/local/bin/
41+
42+
# Copy project files into the image
43+
COPY server.py /app/
44+
COPY requirements.txt /app/requirements.txt
45+
COPY gunicorn.conf.py /app/gunicorn.conf.py
46+
COPY start_gunicorn.sh /app/start_gunicorn.sh
47+
COPY models/ /app/models/
48+
COPY utils/ /app/utils/
49+
COPY tokenizer_service/ /app/tokenizer_service/
50+
51+
# Make startup script executable
52+
RUN chmod +x /app/start_gunicorn.sh
53+
54+
# Create directory for UDS socket
55+
RUN mkdir -p /tmp/tokenizer && chown 65532:65532 /tmp/tokenizer
56+
57+
# Create model cache directories and set permissions
58+
RUN mkdir -p /app/models && chown -R 65532:65532 /app/models
59+
# Create and set permissions for ModelScope directory
60+
RUN mkdir -p /.modelscope && chown -R 65532:65532 /.modelscope
61+
# Create and set permissions for Hugging Face cache directory
62+
RUN mkdir -p /.cache && chown -R 65532:65532 /.cache
63+
64+
# Switch to non-root user
65+
USER 65532:65532
66+
67+
# Startup command: run UDS server
68+
CMD ["/app/start_gunicorn.sh"]

examples/uds_tokenizer/README.md

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# UDS Tokenizer Service
2+
3+
This service provides tokenization functionality via HTTP over a Unix Domain Socket (UDS). It also exposes a separate HTTP endpoint for Kubernetes health checks.
4+
5+
## Features
6+
7+
- Apply chat templates to messages
8+
- Tokenize text prompts
9+
- Runtime configuration updates
10+
- Health check endpoint for Kubernetes
11+
- Support for multiple model formats (HuggingFace, ModelScope)
12+
- Automatic model downloading and caching
13+
14+
## Services
15+
16+
The service exposes multiple endpoints:
17+
18+
1. `/chat-template` - Apply a chat template to messages (UDS only)
19+
2. `/tokenize` - Tokenize text (UDS only)
20+
3. `/health` - Health check endpoint (TCP port, for Kubernetes probes)
21+
4. `/config` - Get or update configuration (UDS only)
22+
23+
## Quick Start
24+
25+
Start the service:
26+
```bash
27+
python server.py
28+
```
29+
30+
Or using Gunicorn for production:
31+
```bash
32+
./start_gunicorn.sh
33+
```
34+
35+
The service will:
36+
- Listen on `/tmp/tokenizer/tokenizer-uds.socket` for main functionality
37+
- Listen on port 8080 (configurable via PROBE_PORT) for health checks
38+
39+
## Environment Variables
40+
41+
| Variable | Description | Default |
42+
|---------|-------------|---------|
43+
| `LOG_LEVEL` | Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL) | INFO |
44+
| `WORKERS` | Number of worker processes when using Gunicorn | CPU cores * 2 + 1 |
45+
| `MODEL` | Path to the model directory | ./models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B |
46+
| `ADD_SPECIAL_TOKENS` | Whether to add special tokens | true |
47+
| `ENABLE_THINKING` | Whether to enable thinking mode | false |
48+
| `ADD_GENERATION_PROMPT` | Whether to add generation prompt | true |
49+
| `PROBE_PORT` | Port for health check endpoint | 8080 |
50+
| `USE_MODELSCOPE` | Whether to download tokenizer files from ModelScope (true) or Hugging Face (false) | false |
51+
52+
## API Endpoints
53+
54+
### POST /chat-template
55+
Apply a chat template to a list of messages.
56+
57+
Request body:
58+
```json
59+
[
60+
{
61+
"role": "system",
62+
"content": "You are a helpful assistant."
63+
},
64+
{
65+
"role": "user",
66+
"content": "Hello!"
67+
}
68+
]
69+
```
70+
71+
Response:
72+
Plain text with the formatted prompt.
73+
74+
### POST /tokenize
75+
Tokenize a text prompt.
76+
77+
Request body:
78+
```
79+
Text to tokenize
80+
```
81+
82+
Response:
83+
JSON with tokenization results:
84+
- `input_ids`: List of token IDs
85+
- `attention_mask`: Attention mask for the tokens
86+
87+
### GET /health
88+
Health check endpoint for Kubernetes probes.
89+
90+
Response:
91+
```json
92+
{
93+
"status": "healthy",
94+
"service": "tokenizer-service",
95+
"timestamp": 1234567890.123
96+
}
97+
```
98+
99+
### GET /config
100+
Get current configuration.
101+
102+
Response:
103+
```json
104+
{
105+
"model": "./models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
106+
"add_special_tokens": true,
107+
"enable_thinking": false,
108+
"add_generation_prompt": true
109+
}
110+
```
111+
112+
### POST /config
113+
Update configuration at runtime.
114+
115+
Request body:
116+
```json
117+
{
118+
"model": "./models/qwen/qwen3-0.6b",
119+
"add_special_tokens": false
120+
}
121+
```
122+
123+
Response:
124+
```json
125+
{
126+
"status": "success",
127+
"message": "Configuration updated successfully"
128+
}
129+
```
130+
131+
## Testing
132+
133+
### Unit Tests
134+
135+
Run unit tests with mocks (no service needed):
136+
```bash
137+
# Install test dependencies
138+
pip install -r tests/requirements.txt
139+
140+
# Run unit tests
141+
python -m pytest tests/test_tokenizer_unit.py -v
142+
```
143+
144+
### Integration Tests
145+
146+
Run integration tests (requires service to be running):
147+
```bash
148+
# Start the service in the background
149+
python server.py &
150+
151+
# Run integration tests with automatic waiting
152+
python tests/run_integration_tests.py
153+
154+
# Stop the service
155+
pkill -f "python server.py"
156+
```
157+
158+
The integration test runner will automatically wait for the server to be ready before running tests.
159+
160+
## Kubernetes Deployment
161+
162+
The service is designed to run in Kubernetes with:
163+
- A shared `emptyDir` volume for UDS communication between containers
164+
- Health check endpoint for liveness and readiness probes
165+
- Proper security context with non-root user
166+
167+
## Model Support
168+
169+
The service supports:
170+
- HuggingFace models (local or remote)
171+
- ModelScope models (automatically downloaded and cached)
172+
- Custom models in standard format
173+
174+
Models are automatically downloaded and cached in the `models/` directory.
175+
The source for downloading can be controlled with the `USE_MODELSCOPE` environment variable:
176+
- `false` (default): Download from Hugging Face
177+
- `true`: Download from ModelScope
178+
179+
See [models/README.md](models/README.md) for detailed information about model caching, pre-populating the cache, and Kubernetes deployment strategies.
180+
181+
## Project Structure
182+
183+
```
184+
├── server.py # Main server entry point
185+
├── tokenizer_service/ # Core tokenizer service implementation
186+
│ ├── __init__.py
187+
│ ├── tokenizer.py # Tokenizer service implementation
188+
│ └── exceptions.py # Custom exceptions
189+
├── utils/ # Utility functions
190+
│ ├── __init__.py
191+
│ └── logger.py # logger functionality
192+
├── tests/ # Test files
193+
│ ├── __init__.py
194+
│ ├── run_integration_tests.py # Integration test runner
195+
│ ├── test_tokenizer_unit.py # Unit tests
196+
│ └── test_tokenizer_service.py # Legacy integration test
197+
├── models/ # Model files (downloaded automatically)
198+
├── client/ # Client examples
199+
├── requirements.txt # Python dependencies
200+
├── gunicorn.conf.py # Gunicorn configuration
201+
├── start_gunicorn.sh # Gunicorn startup script
202+
└── README.md # This file
203+
```
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2025 The llm-d Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Gunicorn configuration file
16+
import os
17+
import multiprocessing
18+
19+
# Server configuration
20+
bind = "unix:/tmp/tokenizer/tokenizer-uds.socket"
21+
workers = int(os.getenv("WORKERS", multiprocessing.cpu_count() * 2 + 1))
22+
worker_class = "aiohttp.GunicornWebWorker"
23+
worker_connections = 1000
24+
max_requests = 1000
25+
max_requests_jitter = 100
26+
27+
# Logging configuration
28+
accesslog = "-"
29+
errorlog = "-"
30+
loglevel = os.getenv("LOG_LEVEL", "info").lower()
31+
# Use aiohttp-style log format
32+
access_log_format = '%a %t "%r" %s %b "%{Referer}i" "%{User-Agent}i" %Tf'
33+
34+
# Process naming
35+
proc_name = "tokenizer-service"
36+
37+
# Timeout settings
38+
timeout = 30
39+
graceful_timeout = 30
40+
keepalive = 5
41+
42+
preload_app = True

0 commit comments

Comments
 (0)