Skip to content

Commit 959432a

Browse files
committed
chore(llmobs): refactor to use span events
The LLMObs service formerly depended on the tracer's TraceProcessor interface. This was problematic because that dependency is shared with the public API: a user who configured a trace filter (which is a trace processor under the hood) could overwrite the LLMObs TraceProcessor. Instead, the tracer now emits span start and finish events, which the LLMObs service listens to and acts on, as proposed here. The gotcha is that the LLMObs service no longer has a way to drop traces when run in agentless mode, which only LLMObs supports. Instead, we encourage users to explicitly turn off APM, which carries the benefit of clarity, since dropping traces was previously implicit.
1 parent b253aa3 commit 959432a

File tree

7 files changed

+151
-285
lines changed

7 files changed

+151
-285
lines changed

ddtrace/_trace/tracer.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from ddtrace.internal.atexit import register_on_exit_signal
4242
from ddtrace.internal.constants import SAMPLING_DECISION_TRACE_TAG_KEY
4343
from ddtrace.internal.constants import SPAN_API_DATADOG
44+
from ddtrace.internal.core import dispatch
4445
from ddtrace.internal.dogstatsd import get_dogstatsd_client
4546
from ddtrace.internal.logger import get_logger
4647
from ddtrace.internal.peer_service.processor import PeerServiceProcessor
@@ -866,7 +867,7 @@ def _start_span(
866867
for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors):
867868
p.on_span_start(span)
868869
self._hooks.emit(self.__class__.start_span, span)
869-
870+
dispatch("trace.span_start", (span,))
870871
return span
871872

872873
start_span = _start_span
@@ -883,6 +884,8 @@ def _on_span_finish(self, span: Span) -> None:
883884
for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors):
884885
p.on_span_finish(span)
885886

887+
dispatch("trace.span_finish", (span,))
888+
886889
if log.isEnabledFor(logging.DEBUG):
887890
log.debug("finishing span %s (enabled:%s)", span._pprint(), self.enabled)
888891

ddtrace/llmobs/_llmobs.py

+141-21
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
import time
44
from typing import Any
55
from typing import Dict
6+
from typing import List
67
from typing import Optional
8+
from typing import Tuple
79
from typing import Union
810

911
import ddtrace
@@ -13,6 +15,7 @@
1315
from ddtrace._trace.context import Context
1416
from ddtrace.ext import SpanTypes
1517
from ddtrace.internal import atexit
18+
from ddtrace.internal import core
1619
from ddtrace.internal import forksafe
1720
from ddtrace.internal._rand import rand64bits
1821
from ddtrace.internal.compat import ensure_text
@@ -45,11 +48,11 @@
4548
from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
4649
from ddtrace.llmobs._constants import TAGS
4750
from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
48-
from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor
4951
from ddtrace.llmobs._utils import AnnotationContext
5052
from ddtrace.llmobs._utils import _get_llmobs_parent_id
5153
from ddtrace.llmobs._utils import _get_ml_app
5254
from ddtrace.llmobs._utils import _get_session_id
55+
from ddtrace.llmobs._utils import _get_span_name
5356
from ddtrace.llmobs._utils import _inject_llmobs_parent_id
5457
from ddtrace.llmobs._utils import safe_json
5558
from ddtrace.llmobs._utils import validate_prompt
@@ -60,6 +63,11 @@
6063
from ddtrace.llmobs.utils import Messages
6164
from ddtrace.propagation.http import HTTPPropagator
6265

66+
from ..constants import ERROR_MSG
67+
from ..constants import ERROR_STACK
68+
from ..constants import ERROR_TYPE
69+
from . import _constants as constants
70+
6371

6472
log = get_logger(__name__)
6573

@@ -81,34 +89,157 @@ class LLMObs(Service):
8189
def __init__(self, tracer=None):
8290
super(LLMObs, self).__init__()
8391
self.tracer = tracer or ddtrace.tracer
84-
self._llmobs_span_writer = None
85-
8692
self._llmobs_span_writer = LLMObsSpanWriter(
8793
is_agentless=config._llmobs_agentless_enabled,
8894
interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)),
8995
timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)),
9096
)
91-
9297
self._llmobs_eval_metric_writer = LLMObsEvalMetricWriter(
9398
site=config._dd_site,
9499
api_key=config._dd_api_key,
95100
interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)),
96101
timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)),
97102
)
98-
99103
self._evaluator_runner = EvaluatorRunner(
100104
interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)),
101105
llmobs_service=self,
102106
)
103107

104-
self._trace_processor = LLMObsTraceProcessor(self._llmobs_span_writer, self._evaluator_runner)
105108
forksafe.register(self._child_after_fork)
106109

107110
self._annotations = []
108111
self._annotation_context_lock = forksafe.RLock()
109-
self.tracer.on_start_span(self._do_annotations)
110112

111-
def _do_annotations(self, span):
113+
# Register hooks for span events
114+
core.on("trace.span_start", self._do_annotations)
115+
core.on("trace.span_finish", self._on_span_finish)
116+
117+
def _on_span_finish(self, span):
118+
if self.enabled and span.span_type == SpanTypes.LLM:
119+
self._submit_llmobs_span(span)
120+
121+
def _submit_llmobs_span(self, span: Span) -> None:
    """Generate and submit an LLMObs span event to be sent to LLMObs.

    The event is built by ``_llmobs_span_event`` and enqueued on the span writer.
    A ``KeyError`` or ``TypeError`` raised while doing so is logged and not
    re-raised. Any other exception propagates to the caller instead of being
    silently discarded.

    When an event was produced, the span kind is ``"llm"`` and the span is not a
    ragas integration span, the event is additionally enqueued on the evaluator
    runner (if one is set).
    """
    span_event = None
    is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm"
    is_ragas_integration_span = False
    try:
        span_event, is_ragas_integration_span = self._llmobs_span_event(span)
        self._llmobs_span_writer.enqueue(span_event)
    except (KeyError, TypeError):
        log.error(
            "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True
        )

    # This hand-off intentionally lives after the try/except rather than in a
    # ``finally`` clause: a ``return`` inside ``finally`` swallows whatever
    # exception is in flight, hiding unexpected failures without any log line.
    if not span_event or not is_llm_span or is_ragas_integration_span:
        return
    if self._evaluator_runner:
        self._evaluator_runner.enqueue(span_event, span)
138+
139+
@classmethod
def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
    """Build the LLMObs span event dictionary for ``span``.

    Returns a ``(event, is_ragas_integration_span)`` tuple. The flag is true
    when the span's ML app name starts with ``constants.RAGAS_ML_APP_PREFIX``.

    Raises ``KeyError`` when no span kind is stored on the span.

    Side effects: the resolved ML app and, when one is found, the session id
    are written back onto the span's context items.
    """
    ctx_item = span._get_ctx_item

    kind = ctx_item(SPAN_KIND)
    if not kind:
        raise KeyError("Span kind not found in span context")
    is_llm = kind == "llm"

    inputs: Dict[str, Any] = {}
    outputs: Dict[str, Any] = {}
    meta: Dict[str, Any] = {"span.kind": kind, "input": inputs, "output": outputs}

    # Model details are only reported for LLM and embedding spans.
    if kind in ("llm", "embedding"):
        model_name = ctx_item(MODEL_NAME)
        if model_name is not None:
            meta["model_name"] = model_name
            meta["model_provider"] = (ctx_item(MODEL_PROVIDER) or "custom").lower()
    meta["metadata"] = ctx_item(METADATA) or {}

    input_parameters = ctx_item(INPUT_PARAMETERS)
    if input_parameters:
        inputs["parameters"] = input_parameters
    if is_llm:
        input_messages = ctx_item(INPUT_MESSAGES)
        if input_messages is not None:
            inputs["messages"] = input_messages
    input_value = ctx_item(INPUT_VALUE)
    if input_value is not None:
        inputs["value"] = safe_json(input_value)
    if is_llm:
        output_messages = ctx_item(OUTPUT_MESSAGES)
        if output_messages is not None:
            outputs["messages"] = output_messages
    if kind == "embedding":
        input_documents = ctx_item(INPUT_DOCUMENTS)
        if input_documents is not None:
            inputs["documents"] = input_documents
    output_value = ctx_item(OUTPUT_VALUE)
    if output_value is not None:
        outputs["value"] = safe_json(output_value)
    if kind == "retrieval":
        output_documents = ctx_item(OUTPUT_DOCUMENTS)
        if output_documents is not None:
            outputs["documents"] = output_documents

    prompt_json_str = ctx_item(INPUT_PROMPT)
    if prompt_json_str is not None:
        if is_llm:
            inputs["prompt"] = prompt_json_str
        else:
            log.warning(
                "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds."
            )

    if span.error:
        meta[ERROR_MSG] = span.get_tag(ERROR_MSG)
        meta[ERROR_STACK] = span.get_tag(ERROR_STACK)
        meta[ERROR_TYPE] = span.get_tag(ERROR_TYPE)

    # Empty input/output sections are left out of the event entirely.
    for section in ("input", "output"):
        if not meta[section]:
            del meta[section]

    metrics = ctx_item(METRICS) or {}
    ml_app = _get_ml_app(span)
    is_ragas_integration_span = ml_app.startswith(constants.RAGAS_ML_APP_PREFIX)

    span._set_ctx_item(ML_APP, ml_app)
    parent_id = str(_get_llmobs_parent_id(span) or "undefined")

    llmobs_span_event = {
        "trace_id": format(span.trace_id, "x"),
        "span_id": str(span.span_id),
        "parent_id": parent_id,
        "name": _get_span_name(span),
        "start_ns": span.start_ns,
        "duration": span.duration_ns,
        "status": "error" if span.error else "ok",
        "meta": meta,
        "metrics": metrics,
    }

    session_id = _get_session_id(span)
    if session_id is not None:
        span._set_ctx_item(SESSION_ID, session_id)
        llmobs_span_event["session_id"] = session_id

    llmobs_span_event["tags"] = cls._llmobs_tags(
        span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span
    )
    return llmobs_span_event, is_ragas_integration_span
215+
216+
@staticmethod
def _llmobs_tags(
    span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False
) -> List[str]:
    """Return the LLMObs tags for ``span`` as a list of ``"key:value"`` strings.

    Default tags are assembled first; tags stored on the span under ``TAGS``
    are applied last and therefore override defaults with the same key.
    """
    tag_map = {
        "version": config.version or "",
        "env": config.env or "",
        "service": span.service or "",
        "source": "integration",
        "ml_app": ml_app,
        "ddtrace.version": ddtrace.__version__,
        "language": "python",
        "error": span.error,
    }

    error_type = span.get_tag(ERROR_TYPE)
    if error_type:
        tag_map["error_type"] = error_type
    if session_id:
        tag_map["session_id"] = session_id
    if is_ragas_integration_span:
        tag_map[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas"

    span_tags = span._get_ctx_item(TAGS)
    if span_tags is not None:
        tag_map.update(span_tags)

    return [f"{key}:{value}" for key, value in tag_map.items()]
241+
242+
def _do_annotations(self, span: Span) -> None:
112243
# get the current span context
113244
# only do the annotations if it matches the context
114245
if span.span_type != SpanTypes.LLM: # do this check to avoid the warning log in `annotate`
@@ -120,20 +251,14 @@ def _do_annotations(self, span):
120251
if current_context_id == context_id:
121252
self.annotate(span, **annotation_kwargs)
122253

123-
def _child_after_fork(self):
254+
def _child_after_fork(self) -> None:
124255
self._llmobs_span_writer = self._llmobs_span_writer.recreate()
125256
self._llmobs_eval_metric_writer = self._llmobs_eval_metric_writer.recreate()
126257
self._evaluator_runner = self._evaluator_runner.recreate()
127-
self._trace_processor._span_writer = self._llmobs_span_writer
128-
self._trace_processor._evaluator_runner = self._evaluator_runner
129258
if self.enabled:
130259
self._start_service()
131260

132261
def _start_service(self) -> None:
133-
tracer_filters = self.tracer._filters
134-
if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters):
135-
tracer_filters += [self._trace_processor]
136-
self.tracer.configure(settings={"FILTERS": tracer_filters})
137262
try:
138263
self._llmobs_span_writer.start()
139264
self._llmobs_eval_metric_writer.start()
@@ -160,11 +285,7 @@ def _stop_service(self) -> None:
160285
except ServiceStatusError:
161286
log.debug("Error stopping LLMObs writers")
162287

163-
try:
164-
forksafe.unregister(self._child_after_fork)
165-
self.tracer.shutdown()
166-
except Exception:
167-
log.warning("Failed to shutdown tracer", exc_info=True)
288+
forksafe.unregister(self._child_after_fork)
168289

169290
@classmethod
170291
def enable(
@@ -265,7 +386,6 @@ def disable(cls) -> None:
265386

266387
cls._instance.stop()
267388
cls.enabled = False
268-
cls._instance.tracer.deregister_on_start_span(cls._instance._do_annotations)
269389
telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, False)
270390

271391
log.debug("%s disabled", cls.__name__)

0 commit comments

Comments
 (0)