fix(litellm): bound the off-band span registry

jgreer013 · claude · jgreer013 · commit 986f979f1c2d · 2026-06-18T09:28:31.000-07:00
Address self-review: the module-level span registry was only evicted by
the terminal success/failure callback, so a call abandoned before a
terminal callback fires (e.g. a stream the caller stops iterating) leaked
its Span entry -- holding prompt data -- for the process lifetime. The
prior kwargs-scoped storage was GC'd with the request, so this was a
regression.

Back the registry with an OrderedDict capped at _MAX_TRACKED_SPANS and
evict oldest-first in _store_span, so abandoned calls cannot grow it
without bound. Evicted spans are finished (outside the lock) rather than
silently dropped. A WeakValueDictionary is not an option here:
Span/Transaction objects are not weakly referenceable.

Add tests for the bound, terminal-callback cleanup, and the
litellm_call_id-absent fallback key; correct the registry comment.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py
@@ -1,5 +1,6 @@
 import copy
 import threading
+from collections import OrderedDict
 from typing import TYPE_CHECKING
 
 import sentry_sdk
@@ -40,8 +41,11 @@
 # to the provider. `litellm_call_id` is a per-request UUID that stays stable
 # across the input/success/failure callbacks; the identity of the (shared)
 # callback kwargs dict is the fallback for direct callback invocations that omit
-# it. Entries are removed by the terminal success/failure callback.
-_spans_by_call = {}  # type: Dict[Any, Any]
+# it. The terminal success/failure callback removes the entry; the registry is
+# capped (oldest evicted first) so calls abandoned before a terminal callback
+# fires -- e.g. a stream the caller stops iterating -- cannot grow it unbounded.
+_MAX_TRACKED_SPANS = 1000
+_spans_by_call = OrderedDict()  # type: OrderedDict[Any, Any]
 _spans_by_call_lock = threading.Lock()
 
 
@@ -50,8 +54,18 @@ def _span_key(kwargs: "Dict[str, Any]") -> "Any":
 
 
 def _store_span(kwargs: "Dict[str, Any]", span: "Any") -> None:
+    key = _span_key(kwargs)
+    evicted = []  # type: List[Any]
     with _spans_by_call_lock:
-        _spans_by_call[_span_key(kwargs)] = span
+        _spans_by_call[key] = span
+        _spans_by_call.move_to_end(key)
+        while len(_spans_by_call) > _MAX_TRACKED_SPANS:
+            _, evicted_span = _spans_by_call.popitem(last=False)
+            evicted.append(evicted_span)
+    # Finish evicted spans outside the lock so an over-cap call (heavy
+    # concurrency) still records a span instead of leaking an unfinished one.
+    for evicted_span in evicted:
+        evicted_span.__exit__(None, None, None)
 
 
 def _peek_span(kwargs: "Dict[str, Any]") -> "Any":
diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py
@@ -39,6 +39,7 @@ async def __call__(self, *args, **kwargs):
 from sentry_sdk import start_transaction
 from sentry_sdk._types import BLOB_DATA_SUBSTITUTE
 from sentry_sdk.consts import OP, SPANDATA
+from sentry_sdk.integrations import litellm as litellm_integration
 from sentry_sdk.integrations.litellm import (
     LiteLLMIntegration,
     _convert_message_parts,
@@ -2587,6 +2588,132 @@ def test_caller_metadata_stays_json_serializable(
     assert len(chat_spans) == 1
 
 
+def test_span_registry_is_bounded():
+    """The off-band span registry must stay bounded when calls are abandoned
+    before a terminal callback fires (e.g. an interrupted stream), so a
+    long-running process cannot accumulate Span objects without limit. Evicted
+    spans are finished, so an over-cap call still records a span instead of
+    leaking an unfinished one.
+    """
+
+    class _FakeSpan:
+        def __init__(self):
+            self.exited = False
+
+        def __exit__(self, *exc_info):
+            self.exited = True
+
+    registry = litellm_integration._spans_by_call
+    registry.clear()
+    try:
+        count = 5000
+        spans = []
+        for i in range(count):
+            span = _FakeSpan()
+            spans.append(span)
+            litellm_integration._store_span(
+                {"litellm_call_id": "call-{}".format(i)}, span
+            )
+
+        # Bounded, not unbounded: the registry must not retain every entry.
+        assert len(registry) < count
+        # Most-recent entries are kept and left running...
+        assert (
+            litellm_integration._peek_span(
+                {"litellm_call_id": "call-{}".format(count - 1)}
+            )
+            is spans[-1]
+        )
+        assert spans[-1].exited is False
+        # ...while the oldest are evicted and finished (not silently dropped).
+        assert litellm_integration._peek_span({"litellm_call_id": "call-0"}) is None
+        assert spans[0].exited is True
+    finally:
+        registry.clear()
+
+
+def test_span_registry_cleaned_up_after_terminal_callbacks(sentry_init):
+    """Both terminal callbacks must remove the off-band registry entry, so a
+    completed or failed call leaves nothing behind."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        _experiments={"trace_lifecycle": "static"},
+    )
+    litellm_integration._spans_by_call.clear()
+
+    with start_transaction(name="litellm test"):
+        success_kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hi"}],
+            "litellm_call_id": "success-call",
+        }
+        _input_callback(success_kwargs)
+        assert "success-call" in litellm_integration._spans_by_call
+        _success_callback(
+            success_kwargs, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        assert "success-call" not in litellm_integration._spans_by_call
+
+        failure_kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hi"}],
+            "litellm_call_id": "failure-call",
+        }
+        _input_callback(failure_kwargs)
+        assert "failure-call" in litellm_integration._spans_by_call
+        _failure_callback(
+            failure_kwargs, ValueError("boom"), datetime.now(), datetime.now()
+        )
+        assert "failure-call" not in litellm_integration._spans_by_call
+
+
+def test_span_key_falls_back_to_kwargs_identity(sentry_init):
+    """When litellm omits litellm_call_id (direct callback use), the shared
+    kwargs dict identity keys the registry, and distinct calls stay independent.
+    """
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        _experiments={"trace_lifecycle": "static"},
+    )
+    litellm_integration._spans_by_call.clear()
+
+    with start_transaction(name="litellm test"):
+        kwargs_a = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "a"}],
+        }
+        kwargs_b = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "b"}],
+        }
+
+        _input_callback(kwargs_a)
+        _input_callback(kwargs_b)
+
+        # Distinct dicts (no litellm_call_id) get distinct keys -> no cross-talk.
+        span_a = litellm_integration._peek_span(kwargs_a)
+        span_b = litellm_integration._peek_span(kwargs_b)
+        assert span_a is not None
+        assert span_b is not None
+        assert span_a is not span_b
+
+        # Closing A leaves B's span intact.
+        _success_callback(
+            kwargs_a, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        assert litellm_integration._peek_span(kwargs_a) is None
+        assert litellm_integration._peek_span(kwargs_b) is span_b
+
+        _success_callback(
+            kwargs_b, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        assert litellm_integration._peek_span(kwargs_b) is None
+
+
 def test_litellm_message_truncation(sentry_init, capture_events):
     """Test that large messages are truncated properly in LiteLLM integration."""
     sentry_init(