From 9f182b0f9cf9d6045ca43b572b24ba42c9427a67 Mon Sep 17 00:00:00 2001
From: Rixin Liu <rixin.liu@rice.edu>
Date: Wed, 3 Jun 2026 07:49:22 +0000
Subject: [PATCH 1/2] fix(sglang): support SGLang 0.5.11+ (version detection +
 refactored leak check)

Two general (non-AMD) compatibility defects in the SGLang integration,
found while porting kvcached to AMD and verified on NVIDIA (SGLang
0.5.12.post1).

- version_utils: detect_version() only read module attributes
  (__version__, version, ...). Source builds of SGLang expose none, so
  detection returned None and every SGLang patch silently bailed. Fall
  back to importlib.metadata.version().

- sglang/patches: the scheduler_memory_leak patch scanned for a single
  Scheduler method containing both "memory leak detected" and
  "token_to_kv_pool_allocator". SGLang 0.5.11+ moved the check into
  SchedulerRuntimeCheckerMixin and split it across _check_req_pool /
  _report_leak, so nothing matched and SGLang's leak detector crashed the
  scheduler. Wrap every Scheduler method whose source raises "memory leak
  detected" instead -- covers both the old single-method layout and the
  new split one.

Verified on NVIDIA (SGLang 0.5.12.post1) and AMD MI300X: all 7 SGLang
patches apply, kvcached engages, no crash, output md5 matches the
no-kvcached baseline.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 kvcached/integration/sglang/patches.py | 63 +++++++++++++++++---------
 kvcached/integration/version_utils.py  | 10 ++++
 2 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/kvcached/integration/sglang/patches.py b/kvcached/integration/sglang/patches.py
index bc02b218..ac6cfa35 100644
--- a/kvcached/integration/sglang/patches.py
+++ b/kvcached/integration/sglang/patches.py
@@ -8,7 +8,7 @@
 import inspect
 import math
 import types
-from typing import Any, List, Optional, Tuple, Union, cast
+from typing import Any, Callable, List, Optional, Tuple, Union, cast
 
 from kvcached.integration.patch_base import BasePatch, enable_kvcached
 from kvcached.integration.version_utils import VersionAwarePatch, version_range
@@ -1213,41 +1213,62 @@ def apply(self, sched_mod: types.ModuleType) -> bool:
 
     @version_range(SGLANG_ALL_RANGE)
     def patch_scheduler_memory_leak(self, sched_mod: types.ModuleType) -> bool:
-        """Patch scheduler to suppress memory leak check when kvcached is enabled"""
+        """Patch scheduler to suppress memory leak check when kvcached is enabled.
+
+        kvcached maps physical KV pages lazily, so SGLang's static-pool
+        invariant (total == available + in-use) does not hold and its leak
+        detector would raise spuriously.  We neutralize the leak *raisers*.
+
+        Older SGLang keeps the whole check in a single Scheduler method whose
+        source mentions ``token_to_kv_pool_allocator``.  Newer SGLang
+        (>=0.5.11) moved it into ``SchedulerRuntimeCheckerMixin`` and split it
+        across several small methods (e.g. ``_check_req_pool`` raises directly,
+        ``_report_leak`` is the choke point for pool leaks).  To stay robust
+        across both layouts we wrap every Scheduler method whose source raises
+        a ``"memory leak detected"`` error -- and only those, leaving the
+        stats/logging methods untouched.
+        """
         Scheduler = self._get_target_class(sched_mod)
         if Scheduler is None:
             return False
 
-        target_method_name: Union[str, None] = None
+        target_method_names: List[str] = []
         for name, fn in inspect.getmembers(Scheduler, predicate=inspect.isfunction):
             try:
                 src = inspect.getsource(fn)
             except Exception:
                 continue
-            if "token_to_kv_pool_allocator memory leak detected!" in src or (
-                "memory leak detected" in src and "token_to_kv_pool_allocator" in src
-            ):
-                target_method_name = name
-                break
+            if "memory leak detected" in src:
+                target_method_names.append(name)
 
-        if target_method_name is None:
+        if not target_method_names:
             self.logger.debug("No memory leak detection method found in Scheduler")
             return False
 
-        original = getattr(Scheduler, target_method_name)
-        if self._is_already_patched(original):
-            self.logger.debug("Scheduler memory leak check already patched")
-            return True
+        def _make_wrapped(original: Callable[..., Any]) -> Callable[..., Any]:
+            def _wrapped(self, *args: Any, **kwargs: Any):
+                # Disable memory leak detection when ENABLE_KVCACHED is set
+                if enable_kvcached():
+                    return
+                return original(self, *args, **kwargs)
+
+            return _wrapped
+
+        patched_any = False
+        for target_method_name in target_method_names:
+            original = getattr(Scheduler, target_method_name)
+            if self._is_already_patched(original):
+                self.logger.debug(
+                    f"Scheduler.{target_method_name} leak check already patched")
+                patched_any = True
+                continue
 
-        def _wrapped(self, *args: Any, **kwargs: Any):
-            # Disable memory leak detection when ENABLE_KVCACHED is set
-            if enable_kvcached():
-                return
-            return original(self, *args, **kwargs)
+            wrapped = _make_wrapped(original)
+            self._mark_as_patched(wrapped)
+            setattr(Scheduler, target_method_name, wrapped)
+            patched_any = True
 
-        self._mark_as_patched(_wrapped)
-        setattr(Scheduler, target_method_name, _wrapped)
-        return True
+        return patched_any
 
 
 class RadixCacheLimitPatch(VersionAwarePatch, BasePatch):
diff --git a/kvcached/integration/version_utils.py b/kvcached/integration/version_utils.py
index 0de036a2..ed65fc1e 100644
--- a/kvcached/integration/version_utils.py
+++ b/kvcached/integration/version_utils.py
@@ -154,6 +154,16 @@ def detect_version(self, library_name: str, force_refresh: bool = False) -> Opti
         except Exception as e:
             self.logger.warning(f"Error detecting version for {library_name}: {e}")
 
+        # Fallback to installed-package metadata. Some builds (e.g. source
+        # builds of SGLang) don't expose a module-level __version__, but the
+        # distribution metadata still carries the version.
+        if detected_version is None:
+            try:
+                import importlib.metadata as _md
+                detected_version = _md.version(library_name)
+            except Exception:
+                pass
+
         self._version_cache[library_name] = detected_version
         return detected_version
 

From cc71981d0bb8f7de0118198ebf2baf79869876c9 Mon Sep 17 00:00:00 2001
From: Rixin Liu <rixin.liu@rice.edu>
Date: Wed, 3 Jun 2026 11:43:24 +0000
Subject: [PATCH 2/2] fix(sglang): leave req_to_token_pool leak check intact

The previous commit suppressed every Scheduler method whose source
mentioned "memory leak detected". On SGLang 0.5.10+ that also wraps
_check_req_pool, which guards req_to_token_pool -- a pool kvcached does
not manage. Silencing it could hide a genuine request-pool leak.

Skip any check specific to req_to_token_pool; keep the KV/token-pool
checks (the old combined check names token_to_kv_pool_allocator; the new
generic _report_leak names no pool and is only called for token pools).

Verified against sglang 0.4.9 / 0.5.10 / 0.5.12: the KV-pool check is
still neutralized, the request-pool check stays live, all 7 patches
apply, no crash, output md5 matches the no-kvcached baseline.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 kvcached/integration/sglang/patches.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/kvcached/integration/sglang/patches.py b/kvcached/integration/sglang/patches.py
index ac6cfa35..354b8451 100644
--- a/kvcached/integration/sglang/patches.py
+++ b/kvcached/integration/sglang/patches.py
@@ -1223,10 +1223,16 @@ def patch_scheduler_memory_leak(self, sched_mod: types.ModuleType) -> bool:
         source mentions ``token_to_kv_pool_allocator``.  Newer SGLang
         (>=0.5.11) moved it into ``SchedulerRuntimeCheckerMixin`` and split it
         across several small methods (e.g. ``_check_req_pool`` raises directly,
-        ``_report_leak`` is the choke point for pool leaks).  To stay robust
-        across both layouts we wrap every Scheduler method whose source raises
-        a ``"memory leak detected"`` error -- and only those, leaving the
-        stats/logging methods untouched.
+        ``_report_leak`` is the generic choke point for *token/KV* pool leaks).
+
+        We suppress only the leak checks for pools kvcached actually manages
+        (the KV / token pools).  A check that is specific to
+        ``req_to_token_pool`` is deliberately left intact -- kvcached does not
+        manage the request pool, its invariant still holds, and silencing it
+        would hide a genuine request-pool leak.  The old single-method layout
+        (which names ``token_to_kv_pool_allocator``) and the new generic
+        reporter (which names no pool, and is only ever called for the token
+        pools) are both kept; only the req-pool-specific check is skipped.
         """
         Scheduler = self._get_target_class(sched_mod)
         if Scheduler is None:
@@ -1238,8 +1244,14 @@ def patch_scheduler_memory_leak(self, sched_mod: types.ModuleType) -> bool:
                 src = inspect.getsource(fn)
             except Exception:
                 continue
-            if "memory leak detected" in src:
-                target_method_names.append(name)
+            if "memory leak detected" not in src:
+                continue
+            # Skip a check that is specific to the request pool, which kvcached
+            # does not manage.  The generic reporter names no pool (so it is not
+            # excluded) and the legacy combined check names the KV allocator.
+            if "req_to_token_pool" in src and "token_to_kv_pool" not in src:
+                continue
+            target_method_names.append(name)
 
         if not target_method_names:
             self.logger.debug("No memory leak detection method found in Scheduler")