From 9f182b0f9cf9d6045ca43b572b24ba42c9427a67 Mon Sep 17 00:00:00 2001 From: Rixin Liu Date: Wed, 3 Jun 2026 07:49:22 +0000 Subject: [PATCH 1/2] fix(sglang): support SGLang 0.5.11+ (version detection + refactored leak check) Two general (non-AMD) compatibility defects in the SGLang integration, found while porting kvcached to AMD and verified on NVIDIA (SGLang 0.5.12.post1). - version_utils: detect_version() only read module attributes (__version__, version, ...). Source builds of SGLang expose none, so detection returned None and every SGLang patch silently bailed. Fall back to importlib.metadata.version(). - sglang/patches: the scheduler_memory_leak patch scanned for a single Scheduler method containing both "memory leak detected" and "token_to_kv_pool_allocator". SGLang 0.5.11+ moved the check into SchedulerRuntimeCheckerMixin and split it across _check_req_pool / _report_leak, so nothing matched and SGLang's leak detector crashed the scheduler. Wrap every Scheduler method whose source raises "memory leak detected" instead -- covers both the old single-method layout and the new split one. Verified on NVIDIA (SGLang 0.5.12.post1) and AMD MI300X: all 7 SGLang patches apply, kvcached engages, no crash, output md5 matches the no-kvcached baseline. Co-Authored-By: Claude Opus 4.8 (1M context) --- kvcached/integration/sglang/patches.py | 63 +++++++++++++++++--------- kvcached/integration/version_utils.py | 10 ++++ 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/kvcached/integration/sglang/patches.py b/kvcached/integration/sglang/patches.py index bc02b218..ac6cfa35 100644 --- a/kvcached/integration/sglang/patches.py +++ b/kvcached/integration/sglang/patches.py @@ -8,7 +8,7 @@ import inspect import math import types -from typing import Any, List, Optional, Tuple, Union, cast +from typing import Any, Callable, List, Optional, Tuple, Union, cast from kvcached.integration.patch_base import BasePatch, enable_kvcached from kvcached.integration.version_utils import VersionAwarePatch, version_range @@ -1213,41 +1213,62 @@ def apply(self, sched_mod: types.ModuleType) -> bool: @version_range(SGLANG_ALL_RANGE) def patch_scheduler_memory_leak(self, sched_mod: types.ModuleType) -> bool: - """Patch scheduler to suppress memory leak check when kvcached is enabled""" + """Patch scheduler to suppress memory leak check when kvcached is enabled. + + kvcached maps physical KV pages lazily, so SGLang's static-pool + invariant (total == available + in-use) does not hold and its leak + detector would raise spuriously. We neutralize the leak *raisers*. + + Older SGLang keeps the whole check in a single Scheduler method whose + source mentions ``token_to_kv_pool_allocator``. Newer SGLang + (>=0.5.11) moved it into ``SchedulerRuntimeCheckerMixin`` and split it + across several small methods (e.g. ``_check_req_pool`` raises directly, + ``_report_leak`` is the choke point for pool leaks). To stay robust + across both layouts we wrap every Scheduler method whose source raises + a ``"memory leak detected"`` error -- and only those, leaving the + stats/logging methods untouched. + """ Scheduler = self._get_target_class(sched_mod) if Scheduler is None: return False - target_method_name: Union[str, None] = None + target_method_names: List[str] = [] for name, fn in inspect.getmembers(Scheduler, predicate=inspect.isfunction): try: src = inspect.getsource(fn) except Exception: continue - if "token_to_kv_pool_allocator memory leak detected!" in src or ( - "memory leak detected" in src and "token_to_kv_pool_allocator" in src - ): - target_method_name = name - break + if "memory leak detected" in src: + target_method_names.append(name) - if target_method_name is None: + if not target_method_names: self.logger.debug("No memory leak detection method found in Scheduler") return False - original = getattr(Scheduler, target_method_name) - if self._is_already_patched(original): - self.logger.debug("Scheduler memory leak check already patched") - return True + def _make_wrapped(original: Callable[..., Any]) -> Callable[..., Any]: + def _wrapped(self, *args: Any, **kwargs: Any): + # Disable memory leak detection when ENABLE_KVCACHED is set + if enable_kvcached(): + return + return original(self, *args, **kwargs) + + return _wrapped + + patched_any = False + for target_method_name in target_method_names: + original = getattr(Scheduler, target_method_name) + if self._is_already_patched(original): + self.logger.debug( + f"Scheduler.{target_method_name} leak check already patched") + patched_any = True + continue - def _wrapped(self, *args: Any, **kwargs: Any): - # Disable memory leak detection when ENABLE_KVCACHED is set - if enable_kvcached(): - return - return original(self, *args, **kwargs) + wrapped = _make_wrapped(original) + self._mark_as_patched(wrapped) + setattr(Scheduler, target_method_name, wrapped) + patched_any = True - self._mark_as_patched(_wrapped) - setattr(Scheduler, target_method_name, _wrapped) - return True + return patched_any class RadixCacheLimitPatch(VersionAwarePatch, BasePatch): diff --git a/kvcached/integration/version_utils.py b/kvcached/integration/version_utils.py index 0de036a2..ed65fc1e 100644 --- a/kvcached/integration/version_utils.py +++ b/kvcached/integration/version_utils.py @@ -154,6 +154,16 @@ def detect_version(self, library_name: str, force_refresh: bool = False) -> Opti except Exception as e: self.logger.warning(f"Error detecting version for {library_name}: {e}") + # Fallback to installed-package metadata. Some builds (e.g. source + # builds of SGLang) don't expose a module-level __version__, but the + # distribution metadata still carries the version. + if detected_version is None: + try: + import importlib.metadata as _md + detected_version = _md.version(library_name) + except Exception: + pass + self._version_cache[library_name] = detected_version return detected_version From cc71981d0bb8f7de0118198ebf2baf79869876c9 Mon Sep 17 00:00:00 2001 From: Rixin Liu Date: Wed, 3 Jun 2026 11:43:24 +0000 Subject: [PATCH 2/2] fix(sglang): leave req_to_token_pool leak check intact The previous commit suppressed every Scheduler method whose source mentioned "memory leak detected". On SGLang 0.5.10+ that also wraps _check_req_pool, which guards req_to_token_pool -- a pool kvcached does not manage. Silencing it could hide a genuine request-pool leak. Skip any check specific to req_to_token_pool; keep the KV/token-pool checks (the old combined check names token_to_kv_pool_allocator; the new generic _report_leak names no pool and is only called for token pools). Verified against sglang 0.4.9 / 0.5.10 / 0.5.12: the KV-pool check is still neutralized, the request-pool check stays live, all 7 patches apply, no crash, output md5 matches the no-kvcached baseline. Co-Authored-By: Claude Opus 4.8 (1M context) --- kvcached/integration/sglang/patches.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kvcached/integration/sglang/patches.py b/kvcached/integration/sglang/patches.py index ac6cfa35..354b8451 100644 --- a/kvcached/integration/sglang/patches.py +++ b/kvcached/integration/sglang/patches.py @@ -1223,10 +1223,16 @@ def patch_scheduler_memory_leak(self, sched_mod: types.ModuleType) -> bool: source mentions ``token_to_kv_pool_allocator``. Newer SGLang (>=0.5.11) moved it into ``SchedulerRuntimeCheckerMixin`` and split it across several small methods (e.g. ``_check_req_pool`` raises directly, - ``_report_leak`` is the choke point for pool leaks). To stay robust - across both layouts we wrap every Scheduler method whose source raises - a ``"memory leak detected"`` error -- and only those, leaving the - stats/logging methods untouched. + ``_report_leak`` is the generic choke point for *token/KV* pool leaks). + + We suppress only the leak checks for pools kvcached actually manages + (the KV / token pools). A check that is specific to + ``req_to_token_pool`` is deliberately left intact -- kvcached does not + manage the request pool, its invariant still holds, and silencing it + would hide a genuine request-pool leak. The old single-method layout + (which names ``token_to_kv_pool_allocator``) and the new generic + reporter (which names no pool, and is only ever called for the token + pools) are both kept; only the req-pool-specific check is skipped. """ Scheduler = self._get_target_class(sched_mod) if Scheduler is None: @@ -1238,8 +1244,14 @@ def patch_scheduler_memory_leak(self, sched_mod: types.ModuleType) -> bool: src = inspect.getsource(fn) except Exception: continue - if "memory leak detected" in src: - target_method_names.append(name) + if "memory leak detected" not in src: + continue + # Skip a check that is specific to the request pool, which kvcached + # does not manage. The generic reporter names no pool (so it is not + # excluded) and the legacy combined check names the KV allocator. + if "req_to_token_pool" in src and "token_to_kv_pool" not in src: + continue + target_method_names.append(name) if not target_method_names: self.logger.debug("No memory leak detection method found in Scheduler")