Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
422 changes: 422 additions & 0 deletions docs/03_kvcache_management_api.md

Large diffs are not rendered by default.

422 changes: 422 additions & 0 deletions docs/03_kvcache_management_api_en.md

Large diffs are not rendered by default.

15 changes: 10 additions & 5 deletions kvcached/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@ class MemInfoStruct:
total_size: int
used_size: int
prealloc_size: int
min_safe_limit: int = 0 # Minimum safe KV cache limit (auto-calculated from model params)

DTYPE = np.int64
N_FIELDS = 3
N_FIELDS = 4
SHM_SIZE = np.dtype(DTYPE).itemsize * N_FIELDS

@classmethod
Expand All @@ -43,12 +44,15 @@ def _view(cls, buf: mmap.mmap) -> np.ndarray:
@classmethod
def from_buffer(cls, buf: mmap.mmap) -> "MemInfoStruct":
arr = cls._view(buf)
# Backward compatible: old segments with 3 fields will have min_safe_limit=0
min_safe = int(arr[3]) if len(arr) > 3 else 0
return cls(int(arr[0]), int(arr[1]),
int(arr[2])) # total, used, prealloc
int(arr[2]), min_safe) # total, used, prealloc, min_safe_limit

def write_to_buffer(self, buf: mmap.mmap) -> None:
arr = self._view(buf)
arr[:] = (self.total_size, self.used_size, self.prealloc_size)
arr[:] = (self.total_size, self.used_size, self.prealloc_size,
self.min_safe_limit)


class RwLockedShm:
Expand Down Expand Up @@ -96,7 +100,8 @@ def __exit__(self, exc_type, exc_value, traceback):
self.file.close()


def init_kv_cache_limit(ipc_name: str, kv_cache_limit: int):
def init_kv_cache_limit(ipc_name: str, kv_cache_limit: int,
min_safe_limit: int = 0):
"""
Set the kv cache limit for the current process.
Creates a persistent shared memory file that remains even after the script exits.
Expand All @@ -110,7 +115,7 @@ def init_kv_cache_limit(ipc_name: str, kv_cache_limit: int):
# Now we can safely memory map and write the values
with RwLockedShm(get_ipc_name(ipc_name), MemInfoStruct.SHM_SIZE,
RwLockedShm.WLOCK) as mm:
mem_info = MemInfoStruct(kv_cache_limit, 0, 0)
mem_info = MemInfoStruct(kv_cache_limit, 0, 0, min_safe_limit)
mem_info.write_to_buffer(mm)
return mem_info

Expand Down
2 changes: 2 additions & 0 deletions kvcached/integration/sglang/autopatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
ElasticMambaPoolPatch,
ElasticMemoryPoolPatch,
ElasticMLAMemoryPoolPatch,
HttpServerPatch,
RadixCacheLimitPatch,
SchedulerMemoryLeakPatch,
)
Expand Down Expand Up @@ -44,6 +45,7 @@ def _patch_sglang(_sglang: types.ModuleType) -> None:
(ElasticHybridLinearKVPoolPatch(), SGLANG_ALL_RANGE),
(SchedulerMemoryLeakPatch(), SGLANG_ALL_RANGE),
(RadixCacheLimitPatch(), SGLANG_ALL_RANGE),
(HttpServerPatch(), SGLANG_ALL_RANGE),
]
)

Expand Down
61 changes: 61 additions & 0 deletions kvcached/integration/sglang/patches.py
Original file line number Diff line number Diff line change
Expand Up @@ -1319,3 +1319,64 @@ def _wrapped(self_rc, *args: Any, **kwargs: Any):
f"{max_cached} tokens (KVCACHED_MAX_CACHED_TOKENS)"
)
return True


class HttpServerPatch(VersionAwarePatch, BasePatch):
"""Patch SGLang's HTTP server to inject kvcached KV cache management endpoints.

SGLang creates a module-level ``app = FastAPI(...)`` in
``sglang.srt.entrypoints.http_server``. We wrap
``_setup_and_run_http_server`` so that the kvcache router is attached
to ``app`` before uvicorn starts listening.

This adds ``/kvcache/status``, ``/kvcache/limit``, and
``/kvcache/limit_percent`` to every SGLang instance.
"""

library = "sglang"
target_module = "sglang.srt.entrypoints.http_server"
target_class = None # Module-level function, not a class
patch_name = "http_server_kvcache"

def can_apply(self, target_module: types.ModuleType) -> bool:
return hasattr(target_module, "_setup_and_run_http_server")

def apply(self, http_server_mod: types.ModuleType) -> bool:
if not self.initialize_version_info():
return False
return self.patch_setup_and_run(http_server_mod)

@version_range(SGLANG_ALL_RANGE)
def patch_setup_and_run(self, http_server_mod: types.ModuleType) -> bool:
"""Wrap _setup_and_run_http_server to attach kvcache routes."""
original_fn = getattr(http_server_mod, "_setup_and_run_http_server", None)
if original_fn is None:
self.logger.warning(
"_setup_and_run_http_server not found in http_server module")
return False

if self._is_already_patched(original_fn, "_setup_and_run"):
self.logger.debug("_setup_and_run_http_server already patched")
return True

_logger = self.logger

def _patched_setup_and_run(*args: Any, **kwargs: Any):
if enable_kvcached():
try:
# The module-level ``app`` is the FastAPI instance.
fastapi_app = getattr(http_server_mod, "app", None)
if fastapi_app is not None:
from kvcached.integration.vllm.api_router import (
attach_to_app,
)
attach_to_app(fastapi_app)
except Exception as exc:
_logger.warning(
"Failed to attach kvcache router to SGLang: %s", exc)

return original_fn(*args, **kwargs)

self._mark_as_patched(_patched_setup_and_run, "_setup_and_run")
http_server_mod._setup_and_run_http_server = _patched_setup_and_run
return True
Loading
Loading