fix(llmobs): move listener hooks to enable instead of on init (#11889)

Yun-Kim · web-flow · commit 8702cab2464d · 2025-01-10T13:26:13.000-05:00
Follow up on #11781 to fix a weird duplicate span writing issue with the new listener hook logic. Since we were registering these hooks on `LLMObs.__init__()`, which also happens at startup (as we create a default LLMObs() instance), as well as on `LLMObs.enable()`, we were double-registering these hooks, and the default LLMObsSpanWriter was still saved and called each time the tracer finished a span. A symptom of this issue is that if a user were to manually enable agentless mode, they would see noisy logs indicating a failure to send spans to the agent proxy endpoint (which is the default writer mode) even though they also submitted spans to the agentless endpoint successfully. This fix resolves the issue by moving the hook registration to `LLMObs.enable()`, and adding corresponding logic to deregister the hooks on `_stop_service()`. This way we should only ever have one set of hooks registered per process. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of 
the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
@@ -111,9 +111,9 @@ def __init__(self, tracer=None):
         self._annotations = []
         self._annotation_context_lock = forksafe.RLock()
 
-        # Register hooks for span events
-        core.on("trace.span_start", self._do_annotations)
-        core.on("trace.span_finish", self._on_span_finish)
+    def _on_span_start(self, span):
+        if self.enabled and span.span_type == SpanTypes.LLM:
+            self._do_annotations(span)
 
     def _on_span_finish(self, span):
         if self.enabled and span.span_type == SpanTypes.LLM:
@@ -272,6 +272,10 @@ def _start_service(self) -> None:
             log.debug("Error starting evaluator runner")
 
     def _stop_service(self) -> None:
+        # Remove listener hooks for span events
+        core.reset_listeners("trace.span_start", self._on_span_start)
+        core.reset_listeners("trace.span_finish", self._on_span_finish)
+
         try:
             self._evaluator_runner.stop()
             # flush remaining evaluation spans & evaluations
@@ -366,6 +370,10 @@ def enable(
         cls.enabled = True
         cls._instance.start()
 
+        # Register hooks for span events
+        core.on("trace.span_start", cls._instance._on_span_start)
+        core.on("trace.span_finish", cls._instance._on_span_finish)
+
         atexit.register(cls.disable)
         telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, True)
 
diff --git a/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml b/releasenotes/notes/fix-llmobs-default-writer-hooks-5e456c2f7dfd4381.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    LLM Observability: Resolves an issue where enabling LLM Observability in agentless mode would result in traces also being sent to the agent proxy endpoint.
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
@@ -1310,6 +1310,34 @@ def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs
             mock_activate.assert_called_once_with(dummy_context)
 
 
+def test_listener_hooks_enqueue_correct_writer(run_python_code_in_subprocess):
+    """
+    Regression test that ensures that listener hooks enqueue span events to the correct writer,
+    not the default writer created at startup.
+    """
+    env = os.environ.copy()
+    pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))]
+    if "PYTHONPATH" in env:
+        pypath.append(env["PYTHONPATH"])
+    env.update({"PYTHONPATH": ":".join(pypath), "DD_TRACE_ENABLED": "0"})
+    out, err, status, pid = run_python_code_in_subprocess(
+        """
+from ddtrace.llmobs import LLMObs
+
+LLMObs.enable(ml_app="repro-issue", agentless_enabled=True, api_key="foobar.baz", site="datad0g.com")
+with LLMObs.agent("dummy"):
+    pass
+""",
+        env=env,
+    )
+    assert status == 0, err
+    assert out == b""
+    agentless_writer_log = b"failed to send traces to intake at https://llmobs-intake.datad0g.com/api/v2/llmobs: HTTP error status 403, reason Forbidden\n"  # noqa: E501
+    agent_proxy_log = b"failed to send, dropping 1 traces to intake at http://localhost:8126/evp_proxy/v2/api/v2/llmobs after 5 retries"  # noqa: E501
+    assert err == agentless_writer_log
+    assert agent_proxy_log not in err
+
+
 def test_llmobs_fork_recreates_and_restarts_span_writer():
     """Test that forking a process correctly recreates and restarts the LLMObsSpanWriter."""
     with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"):

-Original file line number
+Diff line change
 +---
 +fixes:
 +  - |
 +    LLM Observability: Resolves an issue where enabling LLM Observability in agentless mode would result in traces also being sent to the agent proxy endpoint.