From 959432ac0bd530ea1687b709f75623f86dfadf12 Mon Sep 17 00:00:00 2001
From: Kyle Verhoog
Date: Fri, 13 Dec 2024 14:54:04 -0500
Subject: [PATCH 01/12] chore(llmobs): refactor to use span events

The LLMObs service formerly depended on the TraceProcessor interface in
the tracer. This was problematic because it shared a dependency with the
public API: users could configure a trace filter (a trace processor
under the hood) and unknowingly overwrite the LLMObs TraceProcessor.

Instead, the tracer can emit span start and finish events which the
LLMObs service listens to and acts on, as proposed here.

The gotcha is that the LLMObs service no longer has a way to drop traces
when run in agentless mode, a mode which only LLMObs supports. Instead,
we encourage users to explicitly turn off APM, which has the added
benefit of clarity, since this behavior was previously implicit.
---
 ddtrace/_trace/tracer.py                    |   5 +-
 ddtrace/llmobs/_llmobs.py                   | 162 +++++++++++++++---
 ddtrace/llmobs/_trace_processor.py          | 177 --------------------
 ddtrace/llmobs/_utils.py                    |   2 +
 tests/llmobs/conftest.py                    |   1 -
 tests/llmobs/test_llmobs_service.py         |  53 +-----
 tests/llmobs/test_llmobs_trace_processor.py |  36 ----
 7 files changed, 151 insertions(+), 285 deletions(-)
 delete mode 100644 ddtrace/llmobs/_trace_processor.py
 delete mode 100644 tests/llmobs/test_llmobs_trace_processor.py

diff --git a/ddtrace/_trace/tracer.py b/ddtrace/_trace/tracer.py
index 6027976d6dc..b2d132deb50 100644
--- a/ddtrace/_trace/tracer.py
+++ b/ddtrace/_trace/tracer.py
@@ -41,6 +41,7 @@
 from ddtrace.internal.atexit import register_on_exit_signal
 from ddtrace.internal.constants import SAMPLING_DECISION_TRACE_TAG_KEY
 from ddtrace.internal.constants import SPAN_API_DATADOG
+from ddtrace.internal.core import dispatch
 from ddtrace.internal.dogstatsd import get_dogstatsd_client
 from ddtrace.internal.logger import get_logger
 from ddtrace.internal.peer_service.processor import PeerServiceProcessor
@@ -866,7 +867,7 @@ def _start_span(
         for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors):
             p.on_span_start(span)
         self._hooks.emit(self.__class__.start_span, span)
-
+        dispatch("trace.span_start", (span,))
         return span

     start_span = _start_span
@@ -883,6 +884,8 @@ def _on_span_finish(self, span: Span) -> None:
         for p in chain(self._span_processors, SpanProcessor.__processors__, self._deferred_processors):
             p.on_span_finish(span)

+        dispatch("trace.span_finish", (span,))
+
         if log.isEnabledFor(logging.DEBUG):
             log.debug("finishing span %s (enabled:%s)", span._pprint(), self.enabled)

diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index 49815151118..49dae967f4a 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -3,7 +3,9 @@
 import time
 from typing import Any
 from typing import Dict
+from typing import List
 from typing import Optional
+from typing import Tuple
 from typing import Union

 import ddtrace
@@ -13,6 +15,7 @@
 from ddtrace._trace.context import Context
 from ddtrace.ext import SpanTypes
 from ddtrace.internal import atexit
+from ddtrace.internal import core
 from ddtrace.internal import forksafe
 from ddtrace.internal._rand import rand64bits
 from ddtrace.internal.compat import ensure_text
@@ -45,11 +48,11 @@
 from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
 from ddtrace.llmobs._constants import TAGS
 from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
-from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor
 from ddtrace.llmobs._utils import AnnotationContext
 from ddtrace.llmobs._utils 
import _get_llmobs_parent_id from ddtrace.llmobs._utils import _get_ml_app from ddtrace.llmobs._utils import _get_session_id +from ddtrace.llmobs._utils import _get_span_name from ddtrace.llmobs._utils import _inject_llmobs_parent_id from ddtrace.llmobs._utils import safe_json from ddtrace.llmobs._utils import validate_prompt @@ -60,6 +63,11 @@ from ddtrace.llmobs.utils import Messages from ddtrace.propagation.http import HTTPPropagator +from ..constants import ERROR_MSG +from ..constants import ERROR_STACK +from ..constants import ERROR_TYPE +from . import _constants as constants + log = get_logger(__name__) @@ -81,34 +89,157 @@ class LLMObs(Service): def __init__(self, tracer=None): super(LLMObs, self).__init__() self.tracer = tracer or ddtrace.tracer - self._llmobs_span_writer = None - self._llmobs_span_writer = LLMObsSpanWriter( is_agentless=config._llmobs_agentless_enabled, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._llmobs_eval_metric_writer = LLMObsEvalMetricWriter( site=config._dd_site, api_key=config._dd_api_key, interval=float(os.getenv("_DD_LLMOBS_WRITER_INTERVAL", 1.0)), timeout=float(os.getenv("_DD_LLMOBS_WRITER_TIMEOUT", 5.0)), ) - self._evaluator_runner = EvaluatorRunner( interval=float(os.getenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 1.0)), llmobs_service=self, ) - self._trace_processor = LLMObsTraceProcessor(self._llmobs_span_writer, self._evaluator_runner) forksafe.register(self._child_after_fork) self._annotations = [] self._annotation_context_lock = forksafe.RLock() - self.tracer.on_start_span(self._do_annotations) - def _do_annotations(self, span): + # Register hooks for span events + core.on("trace.span_start", self._do_annotations) + core.on("trace.span_finish", self._on_span_finish) + + def _on_span_finish(self, span): + if self.enabled and span.span_type == SpanTypes.LLM: + self._submit_llmobs_span(span) + + def _submit_llmobs_span(self, span: Span) -> None: + """Generate and submit an LLMObs span event to be sent to LLMObs.""" + span_event = None + is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" + is_ragas_integration_span = False + try: + span_event, is_ragas_integration_span = self._llmobs_span_event(span) + self._llmobs_span_writer.enqueue(span_event) + except (KeyError, TypeError): + log.error( + "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True + ) + finally: + if not span_event or not is_llm_span or is_ragas_integration_span: + return + if self._evaluator_runner: + self._evaluator_runner.enqueue(span_event, span) + + @classmethod + def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: + """Span event object structure.""" + span_kind = span._get_ctx_item(SPAN_KIND) + if not span_kind: + raise KeyError("Span kind not found in span context") + meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} + if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: + meta["model_name"] = span._get_ctx_item(MODEL_NAME) + meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() + meta["metadata"] = span._get_ctx_item(METADATA) or {} + if span._get_ctx_item(INPUT_PARAMETERS): + meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) + if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: + meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) + if span._get_ctx_item(INPUT_VALUE) is not None: + 
meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) + if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: + meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) + if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: + meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) + if span._get_ctx_item(OUTPUT_VALUE) is not None: + meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) + if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: + meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) + if span._get_ctx_item(INPUT_PROMPT) is not None: + prompt_json_str = span._get_ctx_item(INPUT_PROMPT) + if span_kind != "llm": + log.warning( + "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." + ) + else: + meta["input"]["prompt"] = prompt_json_str + if span.error: + meta.update( + { + ERROR_MSG: span.get_tag(ERROR_MSG), + ERROR_STACK: span.get_tag(ERROR_STACK), + ERROR_TYPE: span.get_tag(ERROR_TYPE), + } + ) + if not meta["input"]: + meta.pop("input") + if not meta["output"]: + meta.pop("output") + metrics = span._get_ctx_item(METRICS) or {} + ml_app = _get_ml_app(span) + + is_ragas_integration_span = False + + if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): + is_ragas_integration_span = True + + span._set_ctx_item(ML_APP, ml_app) + parent_id = str(_get_llmobs_parent_id(span) or "undefined") + + llmobs_span_event = { + "trace_id": "{:x}".format(span.trace_id), + "span_id": str(span.span_id), + "parent_id": parent_id, + "name": _get_span_name(span), + "start_ns": span.start_ns, + "duration": span.duration_ns, + "status": "error" if span.error else "ok", + "meta": meta, + "metrics": metrics, + } + session_id = _get_session_id(span) + if session_id is not None: + span._set_ctx_item(SESSION_ID, session_id) + llmobs_span_event["session_id"] = session_id + + llmobs_span_event["tags"] = cls._llmobs_tags( + span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span + ) + return llmobs_span_event, is_ragas_integration_span + + @staticmethod + def _llmobs_tags( + span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False + ) -> List[str]: + tags = { + "version": config.version or "", + "env": config.env or "", + "service": span.service or "", + "source": "integration", + "ml_app": ml_app, + "ddtrace.version": ddtrace.__version__, + "language": "python", + "error": span.error, + } + err_type = span.get_tag(ERROR_TYPE) + if err_type: + tags["error_type"] = err_type + if session_id: + tags["session_id"] = session_id + if is_ragas_integration_span: + tags[constants.RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" + existing_tags = span._get_ctx_item(TAGS) + if existing_tags is not None: + tags.update(existing_tags) + return ["{}:{}".format(k, v) for k, v in tags.items()] + + def _do_annotations(self, span: Span) -> None: # get the current span context # only do the annotations if it matches the context if span.span_type != SpanTypes.LLM: # do this check to avoid the warning log in `annotate` @@ -120,20 +251,14 @@ def _do_annotations(self, span): if current_context_id == context_id: self.annotate(span, **annotation_kwargs) - def _child_after_fork(self): + def _child_after_fork(self) -> None: self._llmobs_span_writer = self._llmobs_span_writer.recreate() self._llmobs_eval_metric_writer = self._llmobs_eval_metric_writer.recreate() self._evaluator_runner = 
self._evaluator_runner.recreate() - self._trace_processor._span_writer = self._llmobs_span_writer - self._trace_processor._evaluator_runner = self._evaluator_runner if self.enabled: self._start_service() def _start_service(self) -> None: - tracer_filters = self.tracer._filters - if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters): - tracer_filters += [self._trace_processor] - self.tracer.configure(settings={"FILTERS": tracer_filters}) try: self._llmobs_span_writer.start() self._llmobs_eval_metric_writer.start() @@ -160,11 +285,7 @@ def _stop_service(self) -> None: except ServiceStatusError: log.debug("Error stopping LLMObs writers") - try: - forksafe.unregister(self._child_after_fork) - self.tracer.shutdown() - except Exception: - log.warning("Failed to shutdown tracer", exc_info=True) + forksafe.unregister(self._child_after_fork) @classmethod def enable( @@ -265,7 +386,6 @@ def disable(cls) -> None: cls._instance.stop() cls.enabled = False - cls._instance.tracer.deregister_on_start_span(cls._instance._do_annotations) telemetry_writer.product_activated(TELEMETRY_APM_PRODUCT.LLMOBS, False) log.debug("%s disabled", cls.__name__) diff --git a/ddtrace/llmobs/_trace_processor.py b/ddtrace/llmobs/_trace_processor.py deleted file mode 100644 index 231d53d7626..00000000000 --- a/ddtrace/llmobs/_trace_processor.py +++ /dev/null @@ -1,177 +0,0 @@ -from typing import Any -from typing import Dict -from typing import List -from typing import Optional -from typing import Tuple - -import ddtrace -from ddtrace import Span -from ddtrace import config -from ddtrace._trace.processor import TraceProcessor -from ddtrace.constants import ERROR_MSG -from ddtrace.constants import ERROR_STACK -from ddtrace.constants import ERROR_TYPE -from ddtrace.ext import SpanTypes -from ddtrace.internal.logger import get_logger -from ddtrace.llmobs._constants import INPUT_DOCUMENTS -from ddtrace.llmobs._constants import INPUT_MESSAGES -from ddtrace.llmobs._constants import INPUT_PARAMETERS -from ddtrace.llmobs._constants import INPUT_PROMPT -from ddtrace.llmobs._constants import INPUT_VALUE -from ddtrace.llmobs._constants import METADATA -from ddtrace.llmobs._constants import METRICS -from ddtrace.llmobs._constants import ML_APP -from ddtrace.llmobs._constants import MODEL_NAME -from ddtrace.llmobs._constants import MODEL_PROVIDER -from ddtrace.llmobs._constants import OUTPUT_DOCUMENTS -from ddtrace.llmobs._constants import OUTPUT_MESSAGES -from ddtrace.llmobs._constants import OUTPUT_VALUE -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX -from ddtrace.llmobs._constants import RUNNER_IS_INTEGRATION_SPAN_TAG -from ddtrace.llmobs._constants import SESSION_ID -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._constants import TAGS -from ddtrace.llmobs._utils import _get_llmobs_parent_id -from ddtrace.llmobs._utils import _get_ml_app -from ddtrace.llmobs._utils import _get_session_id -from ddtrace.llmobs._utils import _get_span_name -from ddtrace.llmobs._utils import safe_json - - -log = get_logger(__name__) - - -class LLMObsTraceProcessor(TraceProcessor): - """ - Processor that extracts LLM-type spans in a trace to submit as separate LLMObs span events to LLM Observability. 
- """ - - def __init__(self, llmobs_span_writer, evaluator_runner=None): - self._span_writer = llmobs_span_writer - self._evaluator_runner = evaluator_runner - - def process_trace(self, trace: List[Span]) -> Optional[List[Span]]: - if not trace: - return None - for span in trace: - if span.span_type == SpanTypes.LLM: - self.submit_llmobs_span(span) - return None if config._llmobs_agentless_enabled else trace - - def submit_llmobs_span(self, span: Span) -> None: - """Generate and submit an LLMObs span event to be sent to LLMObs.""" - span_event = None - is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" - is_ragas_integration_span = False - try: - span_event, is_ragas_integration_span = self._llmobs_span_event(span) - self._span_writer.enqueue(span_event) - except (KeyError, TypeError): - log.error("Error generating LLMObs span event for span %s, likely due to malformed span", span) - finally: - if not span_event or not is_llm_span or is_ragas_integration_span: - return - if self._evaluator_runner: - self._evaluator_runner.enqueue(span_event, span) - - def _llmobs_span_event(self, span: Span) -> Tuple[Dict[str, Any], bool]: - """Span event object structure.""" - span_kind = span._get_ctx_item(SPAN_KIND) - if not span_kind: - raise KeyError("Span kind not found in span context") - meta: Dict[str, Any] = {"span.kind": span_kind, "input": {}, "output": {}} - if span_kind in ("llm", "embedding") and span._get_ctx_item(MODEL_NAME) is not None: - meta["model_name"] = span._get_ctx_item(MODEL_NAME) - meta["model_provider"] = (span._get_ctx_item(MODEL_PROVIDER) or "custom").lower() - meta["metadata"] = span._get_ctx_item(METADATA) or {} - if span._get_ctx_item(INPUT_PARAMETERS): - meta["input"]["parameters"] = span._get_ctx_item(INPUT_PARAMETERS) - if span_kind == "llm" and span._get_ctx_item(INPUT_MESSAGES) is not None: - meta["input"]["messages"] = span._get_ctx_item(INPUT_MESSAGES) - if span._get_ctx_item(INPUT_VALUE) is not None: - meta["input"]["value"] = safe_json(span._get_ctx_item(INPUT_VALUE)) - if span_kind == "llm" and span._get_ctx_item(OUTPUT_MESSAGES) is not None: - meta["output"]["messages"] = span._get_ctx_item(OUTPUT_MESSAGES) - if span_kind == "embedding" and span._get_ctx_item(INPUT_DOCUMENTS) is not None: - meta["input"]["documents"] = span._get_ctx_item(INPUT_DOCUMENTS) - if span._get_ctx_item(OUTPUT_VALUE) is not None: - meta["output"]["value"] = safe_json(span._get_ctx_item(OUTPUT_VALUE)) - if span_kind == "retrieval" and span._get_ctx_item(OUTPUT_DOCUMENTS) is not None: - meta["output"]["documents"] = span._get_ctx_item(OUTPUT_DOCUMENTS) - if span._get_ctx_item(INPUT_PROMPT) is not None: - prompt_json_str = span._get_ctx_item(INPUT_PROMPT) - if span_kind != "llm": - log.warning( - "Dropping prompt on non-LLM span kind, annotating prompts is only supported for LLM span kinds." 
- ) - else: - meta["input"]["prompt"] = prompt_json_str - if span.error: - meta.update( - { - ERROR_MSG: span.get_tag(ERROR_MSG), - ERROR_STACK: span.get_tag(ERROR_STACK), - ERROR_TYPE: span.get_tag(ERROR_TYPE), - } - ) - if not meta["input"]: - meta.pop("input") - if not meta["output"]: - meta.pop("output") - metrics = span._get_ctx_item(METRICS) or {} - ml_app = _get_ml_app(span) - - is_ragas_integration_span = False - - if ml_app.startswith(RAGAS_ML_APP_PREFIX): - is_ragas_integration_span = True - - span._set_ctx_item(ML_APP, ml_app) - parent_id = str(_get_llmobs_parent_id(span) or "undefined") - - llmobs_span_event = { - "trace_id": "{:x}".format(span.trace_id), - "span_id": str(span.span_id), - "parent_id": parent_id, - "name": _get_span_name(span), - "start_ns": span.start_ns, - "duration": span.duration_ns, - "status": "error" if span.error else "ok", - "meta": meta, - "metrics": metrics, - } - session_id = _get_session_id(span) - if session_id is not None: - span._set_ctx_item(SESSION_ID, session_id) - llmobs_span_event["session_id"] = session_id - - llmobs_span_event["tags"] = self._llmobs_tags( - span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span - ) - return llmobs_span_event, is_ragas_integration_span - - @staticmethod - def _llmobs_tags( - span: Span, ml_app: str, session_id: Optional[str] = None, is_ragas_integration_span: bool = False - ) -> List[str]: - tags = { - "version": config.version or "", - "env": config.env or "", - "service": span.service or "", - "source": "integration", - "ml_app": ml_app, - "ddtrace.version": ddtrace.__version__, - "language": "python", - "error": span.error, - } - err_type = span.get_tag(ERROR_TYPE) - if err_type: - tags["error_type"] = err_type - if session_id: - tags["session_id"] = session_id - if is_ragas_integration_span: - tags[RUNNER_IS_INTEGRATION_SPAN_TAG] = "ragas" - existing_tags = span._get_ctx_item(TAGS) - if existing_tags is not None: - tags.update(existing_tags) - return ["{}:{}".format(k, v) for k, v in tags.items()] diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index c1b1c4a776c..4b1d4f1ac60 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -135,6 +135,7 @@ def _get_ml_app(span: Span) -> str: ml_app = span._get_ctx_item(ML_APP) if ml_app: return ml_app + # TODO: go up the span tree to find the nearest LLMObs span with an ml_app nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) if nearest_llmobs_ancestor: ml_app = nearest_llmobs_ancestor._get_ctx_item(ML_APP) @@ -149,6 +150,7 @@ def _get_session_id(span: Span) -> Optional[str]: session_id = span._get_ctx_item(SESSION_ID) if session_id: return session_id + # TODO: go up the span tree to find the nearest LLMObs span with session nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) if nearest_llmobs_ancestor: session_id = nearest_llmobs_ancestor._get_ctx_item(SESSION_ID) diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index a7d467b3985..7a2a940e5c8 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -251,7 +251,6 @@ def llmobs(monkeypatch, tracer, llmobs_env, llmobs_span_writer): with override_global_config(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))): llmobs_service.enable(_tracer=tracer) llmobs_service._instance._llmobs_span_writer = llmobs_span_writer - llmobs_service._instance._trace_processor._span_writer = llmobs_span_writer yield llmobs llmobs_service.disable() diff --git a/tests/llmobs/test_llmobs_service.py 
b/tests/llmobs/test_llmobs_service.py index 98748250c3a..1721f79ef97 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -7,9 +7,7 @@ import ddtrace from ddtrace._trace.context import Context -from ddtrace._trace.span import Span from ddtrace.ext import SpanTypes -from ddtrace.filters import TraceFilter from ddtrace.internal.service import ServiceStatus from ddtrace.llmobs import LLMObs as llmobs_service from ddtrace.llmobs._constants import INPUT_DOCUMENTS @@ -31,7 +29,6 @@ from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS -from ddtrace.llmobs._llmobs import LLMObsTraceProcessor from ddtrace.llmobs.utils import Prompt from tests.llmobs._utils import _expected_llmobs_eval_metric_event from tests.llmobs._utils import _expected_llmobs_llm_span_event @@ -48,13 +45,9 @@ def mock_logs(): def run_llmobs_trace_filter(dummy_tracer): - for trace_filter in dummy_tracer._filters: - if isinstance(trace_filter, LLMObsTraceProcessor): - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span.set_tag_str(SPAN_KIND, "llm") - trace1 = [root_llm_span] - return trace_filter.process_trace(trace1) - raise ValueError("LLMObsTraceProcessor not found in tracer filters.") + with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span: + span.set_tag_str(SPAN_KIND, "llm") + return dummy_tracer.writer.pop() def test_service_enable(): @@ -65,7 +58,6 @@ def test_service_enable(): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) assert run_llmobs_trace_filter(dummy_tracer) is not None llmobs_service.disable() @@ -79,7 +71,6 @@ def test_service_enable_with_apm_disabled(monkeypatch): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) assert run_llmobs_trace_filter(dummy_tracer) is None llmobs_service.disable() @@ -139,7 +130,6 @@ def test_service_enable_already_enabled(mock_logs): assert llmobs_instance is not None assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer - assert any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in dummy_tracer._filters) llmobs_service.disable() mock_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) @@ -1667,42 +1657,6 @@ def test_llmobs_fork_evaluator_runner_run(monkeypatch): llmobs_service.disable() -def test_llmobs_fork_custom_filter(monkeypatch): - """Test that forking a process correctly keeps any custom filters.""" - - class CustomFilter(TraceFilter): - def process_trace(self, trace): - return trace - - monkeypatch.setenv("_DD_LLMOBS_WRITER_INTERVAL", 5.0) - with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): - tracer = DummyTracer() - custom_filter = CustomFilter() - tracer.configure(settings={"FILTERS": [custom_filter]}) - llmobs_service.enable(_tracer=tracer, ml_app="test_app") - assert custom_filter in llmobs_service._instance.tracer._filters - pid = os.fork() - if pid: # parent - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - else: 
# child - assert custom_filter in llmobs_service._instance.tracer._filters - assert any( - isinstance(tracer_filter, LLMObsTraceProcessor) - for tracer_filter in llmobs_service._instance.tracer._filters - ) - llmobs_service.disable() - os._exit(12) - - _, status = os.waitpid(pid, 0) - exit_code = os.WEXITSTATUS(status) - assert exit_code == 12 - llmobs_service.disable() - - def test_llmobs_fork_disabled(monkeypatch): """Test that after being disabled the service remains disabled when forking""" monkeypatch.setenv("DD_LLMOBS_ENABLED", "0") @@ -1994,3 +1948,4 @@ def test_service_enable_does_not_start_evaluator_runner(): assert llmobs_service._instance._llmobs_span_writer.status.value == "running" assert llmobs_service._instance._evaluator_runner.status.value == "stopped" llmobs_service.disable() + diff --git a/tests/llmobs/test_llmobs_trace_processor.py b/tests/llmobs/test_llmobs_trace_processor.py deleted file mode 100644 index b55286d49c8..00000000000 --- a/tests/llmobs/test_llmobs_trace_processor.py +++ /dev/null @@ -1,36 +0,0 @@ -import mock - -from ddtrace._trace.span import Span -from ddtrace.ext import SpanTypes -from ddtrace.llmobs._constants import SPAN_KIND -from ddtrace.llmobs._trace_processor import LLMObsTraceProcessor -from tests.utils import override_global_config - - -def test_processor_returns_all_traces_by_default(): - """Test that the LLMObsTraceProcessor returns all traces by default.""" - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_all_traces_if_not_agentless(): - """Test that the LLMObsTraceProcessor returns all traces if DD_LLMOBS_AGENTLESS_ENABLED is not set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=False)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) == trace1 - - -def test_processor_returns_none_in_agentless_mode(): - """Test that the LLMObsTraceProcessor returns None if DD_LLMOBS_AGENTLESS_ENABLED is set to true.""" - with override_global_config(dict(_llmobs_agentless_enabled=True)): - trace_filter = LLMObsTraceProcessor(llmobs_span_writer=mock.MagicMock()) - root_llm_span = Span(name="span1", span_type=SpanTypes.LLM) - root_llm_span._set_ctx_item(SPAN_KIND, "llm") - trace1 = [root_llm_span] - assert trace_filter.process_trace(trace1) is None From 0555fbd66bfda9a0f796be8824cdbd5294d26f4f Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Thu, 2 Jan 2025 12:50:57 -0500 Subject: [PATCH 02/12] fmt --- ddtrace/llmobs/_llmobs.py | 12 +++++------- ddtrace/llmobs/_utils.py | 20 ++++++++++++-------- tests/llmobs/test_llmobs_service.py | 1 - 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 49dae967f4a..0636652f078 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -13,6 +13,9 @@ from ddtrace import config from ddtrace import patch from ddtrace._trace.context import Context +from ddtrace.constants import ERROR_MSG +from ddtrace.constants import ERROR_STACK +from ddtrace.constants import ERROR_TYPE from ddtrace.ext import SpanTypes from ddtrace.internal import atexit from ddtrace.internal import core 
@@ -27,6 +30,7 @@ from ddtrace.internal.telemetry.constants import TELEMETRY_APM_PRODUCT from ddtrace.internal.utils.formats import asbool from ddtrace.internal.utils.formats import parse_tags_str +from ddtrace.llmobs import _constants as constants from ddtrace.llmobs._constants import ANNOTATIONS_CONTEXT_ID from ddtrace.llmobs._constants import INPUT_DOCUMENTS from ddtrace.llmobs._constants import INPUT_MESSAGES @@ -63,11 +67,6 @@ from ddtrace.llmobs.utils import Messages from ddtrace.propagation.http import HTTPPropagator -from ..constants import ERROR_MSG -from ..constants import ERROR_STACK -from ..constants import ERROR_TYPE -from . import _constants as constants - log = get_logger(__name__) @@ -121,7 +120,6 @@ def _on_span_finish(self, span): def _submit_llmobs_span(self, span: Span) -> None: """Generate and submit an LLMObs span event to be sent to LLMObs.""" span_event = None - is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" is_ragas_integration_span = False try: span_event, is_ragas_integration_span = self._llmobs_span_event(span) @@ -131,7 +129,7 @@ def _submit_llmobs_span(self, span: Span) -> None: "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True ) finally: - if not span_event or not is_llm_span or is_ragas_integration_span: + if not span_event or is_ragas_integration_span: return if self._evaluator_runner: self._evaluator_runner.enqueue(span_event, span) diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index 4b1d4f1ac60..dd616db8bef 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -135,10 +135,12 @@ def _get_ml_app(span: Span) -> str: ml_app = span._get_ctx_item(ML_APP) if ml_app: return ml_app - # TODO: go up the span tree to find the nearest LLMObs span with an ml_app - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - ml_app = nearest_llmobs_ancestor._get_ctx_item(ML_APP) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + ml_app = llmobs_parent._get_ctx_item(ML_APP) + if ml_app is not None: + return ml_app + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return ml_app or config._llmobs_ml_app or "unknown-ml-app" @@ -150,10 +152,12 @@ def _get_session_id(span: Span) -> Optional[str]: session_id = span._get_ctx_item(SESSION_ID) if session_id: return session_id - # TODO: go up the span tree to find the nearest LLMObs span with session - nearest_llmobs_ancestor = _get_nearest_llmobs_ancestor(span) - if nearest_llmobs_ancestor: - session_id = nearest_llmobs_ancestor._get_ctx_item(SESSION_ID) + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + session_id = llmobs_parent._get_ctx_item(SESSION_ID) + if session_id is not None: + return session_id + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) return session_id diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 1721f79ef97..d60bb4637fd 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1948,4 +1948,3 @@ def test_service_enable_does_not_start_evaluator_runner(): assert llmobs_service._instance._llmobs_span_writer.status.value == "running" assert llmobs_service._instance._evaluator_runner.status.value == "stopped" llmobs_service.disable() - From 85b3baee2af72f90158253c63bf7339a9b5b65c0 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Thu, 2 Jan 2025 14:31:57 -0500 Subject: [PATCH 03/12] Revert llm kind check removal, fix llmobs_service tests 
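
Restore the gating in _submit_llmobs_span() so that only spans
explicitly marked with the "llm" span kind are enqueued to the
evaluator runner. A sketch of the restored logic, mirroring the diff
below:

    is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm"
    ...
    # skip evaluator enqueueing for non-LLM and ragas integration spans
    if not span_event or not is_llm_span or is_ragas_integration_span:
        return
    if self._evaluator_runner:
        self._evaluator_runner.enqueue(span_event, span)

Also migrate the llmobs_service tests off the mocked span writer
fixtures and onto the shared llmobs/llmobs_events fixtures.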
--- ddtrace/llmobs/_llmobs.py | 3 +- riotfile.py | 2 +- tests/llmobs/conftest.py | 25 +- tests/llmobs/test_llmobs.py | 21 +- tests/llmobs/test_llmobs_service.py | 1255 ++++++++++++--------------- 5 files changed, 568 insertions(+), 738 deletions(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 0636652f078..cd4069b4094 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -120,6 +120,7 @@ def _on_span_finish(self, span): def _submit_llmobs_span(self, span: Span) -> None: """Generate and submit an LLMObs span event to be sent to LLMObs.""" span_event = None + is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" is_ragas_integration_span = False try: span_event, is_ragas_integration_span = self._llmobs_span_event(span) @@ -129,7 +130,7 @@ def _submit_llmobs_span(self, span: Span) -> None: "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True ) finally: - if not span_event or is_ragas_integration_span: + if not span_event or not is_llm_span or is_ragas_integration_span: return if self._evaluator_runner: self._evaluator_runner.enqueue(span_event, span) diff --git a/riotfile.py b/riotfile.py index e7a078a5425..f274b84bb0a 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2883,8 +2883,8 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT name="llmobs", command="pytest {cmdargs} tests/llmobs", pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, - pys=select_pys(min_version="3.7"), venvs=[ + Venv(pys="3.7"), Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), ], ), diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index 7a2a940e5c8..7e4ff7021a1 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -129,6 +129,13 @@ def mock_http_writer_logs(): yield m +@pytest.fixture +def mock_llmobs_logs(): + with mock.patch("ddtrace.llmobs._llmobs.log") as m: + yield m + m.reset_mock() + + @pytest.fixture def ddtrace_global_config(): config = {} @@ -243,15 +250,25 @@ def llmobs_span_writer(): @pytest.fixture -def llmobs(monkeypatch, tracer, llmobs_env, llmobs_span_writer): +def llmobs( + ddtrace_global_config, + monkeypatch, + tracer, + llmobs_env, + llmobs_span_writer, + mock_llmobs_eval_metric_writer, + mock_llmobs_evaluator_runner, +): for env, val in llmobs_env.items(): monkeypatch.setenv(env, val) - + global_config = default_global_config() + global_config.update(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))) + global_config.update(ddtrace_global_config) # TODO: remove once rest of tests are moved off of global config tampering - with override_global_config(dict(_llmobs_ml_app=llmobs_env.get("DD_LLMOBS_ML_APP"))): + with override_global_config(global_config): llmobs_service.enable(_tracer=tracer) llmobs_service._instance._llmobs_span_writer = llmobs_span_writer - yield llmobs + yield llmobs_service llmobs_service.disable() diff --git a/tests/llmobs/test_llmobs.py b/tests/llmobs/test_llmobs.py index 1bae7efe9ed..6cf19fc3e2c 100644 --- a/tests/llmobs/test_llmobs.py +++ b/tests/llmobs/test_llmobs.py @@ -1,4 +1,3 @@ -import mock import pytest from ddtrace.ext import SpanTypes @@ -8,12 +7,6 @@ from tests.llmobs._utils import _expected_llmobs_llm_span_event -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._trace_processor.log") as mock_logs: - yield mock_logs - - class TestMLApp: @pytest.mark.parametrize("llmobs_env", [{"DD_LLMOBS_ML_APP": ""}]) def test_tag_defaults_to_env_var(self, tracer, 
llmobs_env, llmobs_events): @@ -228,19 +221,19 @@ def test_model_and_provider_are_set(tracer, llmobs_events): assert span_event["meta"]["model_provider"] == "model_provider" -def test_malformed_span_logs_error_instead_of_raising(mock_logs, tracer, llmobs_events): +def test_malformed_span_logs_error_instead_of_raising(tracer, llmobs_events, mock_llmobs_logs): """Test that a trying to create a span event from a malformed span will log an error instead of crashing.""" with tracer.trace("root_llm_span", span_type=SpanTypes.LLM) as llm_span: # span does not have SPAN_KIND tag pass - mock_logs.error.assert_called_once_with( - "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span + mock_llmobs_logs.error.assert_called_with( + "Error generating LLMObs span event for span %s, likely due to malformed span", llm_span, exc_info=True ) assert len(llmobs_events) == 0 -def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): - """Test that the LLMObsTraceProcessor only creates LLMObs span events for LLM span types.""" +def test_only_generate_span_events_from_llmobs_spans(tracer, llmobs_events): + """Test that we only generate LLMObs span events for LLM span types.""" with tracer.trace("root_llm_span", service="tests.llmobs", span_type=SpanTypes.LLM) as root_span: root_span._set_ctx_item(const.SPAN_KIND, "llm") with tracer.trace("child_span"): @@ -250,5 +243,5 @@ def test_processor_only_creates_llmobs_span_event(tracer, llmobs_events): expected_grandchild_llmobs_span["parent_id"] = str(root_span.span_id) assert len(llmobs_events) == 2 - assert llmobs_events[0] == _expected_llmobs_llm_span_event(root_span, "llm") - assert llmobs_events[1] == expected_grandchild_llmobs_span + assert llmobs_events[1] == _expected_llmobs_llm_span_event(root_span, "llm") + assert llmobs_events[0] == expected_grandchild_llmobs_span diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index d60bb4637fd..3689b7bac63 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -38,16 +38,10 @@ from tests.utils import override_global_config -@pytest.fixture -def mock_logs(): - with mock.patch("ddtrace.llmobs._llmobs.log") as mock_logs: - yield mock_logs - - def run_llmobs_trace_filter(dummy_tracer): with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span: span.set_tag_str(SPAN_KIND, "llm") - return dummy_tracer.writer.pop() + return dummy_tracer._writer.pop() def test_service_enable(): @@ -63,19 +57,6 @@ def test_service_enable(): llmobs_service.disable() -def test_service_enable_with_apm_disabled(monkeypatch): - with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): - dummy_tracer = DummyTracer() - llmobs_service.enable(_tracer=dummy_tracer, agentless_enabled=True) - llmobs_instance = llmobs_service._instance - assert llmobs_instance is not None - assert llmobs_service.enabled - assert llmobs_instance.tracer == dummy_tracer - assert run_llmobs_trace_filter(dummy_tracer) is None - - llmobs_service.disable() - - def test_service_disable(): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() @@ -109,7 +90,7 @@ def test_service_enable_no_ml_app_specified(): assert llmobs_service._instance._evaluator_runner.status.value == "stopped" -def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): +def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", 
_llmobs_ml_app="")): dummy_tracer = DummyTracer() monkeypatch.setenv("DD_LLMOBS_APP_NAME", "test_ml_app") @@ -117,11 +98,13 @@ def test_service_enable_deprecated_ml_app_name(monkeypatch, mock_logs): assert llmobs_service.enabled is True assert llmobs_service._instance._llmobs_eval_metric_writer.status.value == "running" assert llmobs_service._instance._llmobs_span_writer.status.value == "running" - mock_logs.warning.assert_called_once_with("`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead.") + mock_llmobs_logs.warning.assert_called_once_with( + "`DD_LLMOBS_APP_NAME` is deprecated. Use `DD_LLMOBS_ML_APP` instead." + ) llmobs_service.disable() -def test_service_enable_already_enabled(mock_logs): +def test_service_enable_already_enabled(mock_llmobs_logs): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): dummy_tracer = DummyTracer() llmobs_service.enable(_tracer=dummy_tracer) @@ -131,7 +114,7 @@ def test_service_enable_already_enabled(mock_logs): assert llmobs_service.enabled assert llmobs_instance.tracer == dummy_tracer llmobs_service.disable() - mock_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) + mock_llmobs_logs.debug.assert_has_calls([mock.call("%s already enabled", "LLMObs")]) @mock.patch("ddtrace.llmobs._llmobs.patch") @@ -193,107 +176,83 @@ def test_service_enable_does_not_override_global_patch_config(mock_tracer_patch, llmobs_service.disable() -def test_start_span_while_disabled_logs_warning(LLMObs, mock_logs): - LLMObs.disable() - _ = LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.tool(name="test_tool") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.task(name="test_task") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.workflow(name="test_workflow") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - mock_logs.reset_mock() - _ = LLMObs.agent(name="test_agent") - mock_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) - - -def test_start_span_uses_kind_as_default_name(LLMObs): - with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span: +def test_start_span_while_disabled_logs_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + _ = llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.tool(name="test_tool") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.task(name="test_task") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.workflow(name="test_workflow") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + mock_llmobs_logs.reset_mock() + _ = llmobs.agent(name="test_agent") + mock_llmobs_logs.warning.assert_called_once_with(SPAN_START_WHILE_DISABLED_WARNING) + + +def test_start_span_uses_kind_as_default_name(llmobs): + with llmobs.llm(model_name="test_model", model_provider="test_provider") as span: assert span.name == "llm" - with LLMObs.tool() as span: + with llmobs.tool() as span: assert 
span.name == "tool" - with LLMObs.task() as span: + with llmobs.task() as span: assert span.name == "task" - with LLMObs.workflow() as span: + with llmobs.workflow() as span: assert span.name == "workflow" - with LLMObs.agent() as span: + with llmobs.agent() as span: assert span.name == "agent" -def test_start_span_with_session_id(LLMObs): - with LLMObs.llm(model_name="test_model", session_id="test_session_id") as span: +def test_start_span_with_session_id(llmobs): + with llmobs.llm(model_name="test_model", session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.tool(session_id="test_session_id") as span: + with llmobs.tool(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.task(session_id="test_session_id") as span: + with llmobs.task(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.workflow(session_id="test_session_id") as span: + with llmobs.workflow(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" - with LLMObs.agent(session_id="test_session_id") as span: + with llmobs.agent(session_id="test_session_id") as span: assert span._get_ctx_item(SESSION_ID) == "test_session_id" -def test_session_id_becomes_top_level_field(LLMObs, mock_llmobs_span_writer): - session_id = "test_session_id" - with LLMObs.task(session_id=session_id) as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - - -def test_session_id_becomes_top_level_field_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): +def test_session_id_becomes_top_level_field(llmobs, llmobs_events): session_id = "test_session_id" - with AgentlessLLMObs.task(session_id=session_id) as span: + with llmobs.task(session_id=session_id) as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) - ) - - -def test_llm_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - assert span.name == "test_llm_call" - assert span.resource == "llm" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "llm" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="test_model", model_provider="test_provider") - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", session_id=session_id) -def test_llm_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span(llmobs, llmobs_events): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "llm" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", 
model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="test_model", model_provider="test_provider" ) -def test_llm_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.llm(name="test_llm_call", model_provider="test_provider") as span: +def test_llm_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.llm(name="test_llm_call", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "llm", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "llm", model_name="custom", model_provider="test_provider" ) -def test_default_model_provider_set_to_custom(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call") as span: +def test_default_model_provider_set_to_custom(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call") as span: assert span.name == "test_llm_call" assert span.resource == "llm" assert span.span_type == "llm" @@ -302,88 +261,57 @@ def test_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_tool_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.tool(name="test_tool") as span: - assert span.name == "test_tool" - assert span.resource == "tool" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_tool_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.tool(name="test_tool") as span: +def test_tool_span(llmobs, llmobs_events): + with llmobs.tool(name="test_tool") as span: assert span.name == "test_tool" assert span.resource == "tool" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "tool" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "tool")) - - -def test_task_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task") as span: - assert span.name == "test_task" - assert span.resource == "task" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "tool") -def test_task_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task") as span: +def test_task_span(llmobs, llmobs_events): + with llmobs.task(name="test_task") as span: assert span.name == "test_task" assert span.resource == "task" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "task" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "task")) - - -def test_workflow_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.workflow(name="test_workflow") as span: - assert span.name == "test_workflow" - assert span.resource == "workflow" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, 
"workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task") -def test_workflow_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.workflow(name="test_workflow") as span: +def test_workflow_span(llmobs, llmobs_events): + with llmobs.workflow(name="test_workflow") as span: assert span.name == "test_workflow" assert span.resource == "workflow" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "workflow" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_non_llm_span_event(span, "workflow")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "workflow") -def test_agent_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.agent(name="test_agent") as span: +def test_agent_span(llmobs, llmobs_events): + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span.resource == "agent" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event(span, "agent") -def test_agent_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.agent(name="test_agent") as span: - assert span.name == "test_agent" - assert span.resource == "agent" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "agent" - mock_llmobs_span_agentless_writer.enqueue.assert_called_with(_expected_llmobs_llm_span_event(span, "agent")) - - -def test_embedding_span_no_model_sets_default(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(name="test_embedding", model_provider="test_provider") as span: +def test_embedding_span_no_model_sets_default(llmobs, llmobs_events): + with llmobs.embedding(name="test_embedding", model_provider="test_provider") as span: assert span._get_ctx_item(MODEL_NAME) == "custom" - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="custom", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="custom", model_provider="test_provider" ) -def test_embedding_default_model_provider_set_to_custom(LLMObs): - with LLMObs.embedding(model_name="test_model", name="test_embedding") as span: +def test_embedding_default_model_provider_set_to_custom(llmobs): + with llmobs.embedding(model_name="test_model", name="test_embedding") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" @@ -392,198 +320,182 @@ def test_embedding_default_model_provider_set_to_custom(LLMObs): assert span._get_ctx_item(MODEL_PROVIDER) == "custom" -def test_embedding_span(LLMObs, mock_llmobs_span_writer): - with LLMObs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: - assert span.name == "test_embedding" - assert span.resource == "embedding" - assert span.span_type == "llm" - assert span._get_ctx_item(SPAN_KIND) == "embedding" - assert span._get_ctx_item(MODEL_NAME) == "test_model" - assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", 
model_name="test_model", model_provider="test_provider") - ) - - -def test_embedding_span_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.embedding( - model_name="test_model", name="test_embedding", model_provider="test_provider" - ) as span: +def test_embedding_span(llmobs, llmobs_events): + with llmobs.embedding(model_name="test_model", name="test_embedding", model_provider="test_provider") as span: assert span.name == "test_embedding" assert span.resource == "embedding" assert span.span_type == "llm" assert span._get_ctx_item(SPAN_KIND) == "embedding" assert span._get_ctx_item(MODEL_NAME) == "test_model" assert span._get_ctx_item(MODEL_PROVIDER) == "test_provider" - - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "embedding", model_name="test_model", model_provider="test_provider") + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="test_model", model_provider="test_provider" ) -def test_annotate_no_active_span_logs_warning(LLMObs, mock_logs): - LLMObs.annotate(parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_annotate_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + llmobs.annotate(parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_annotate_non_llm_span_logs_warning(LLMObs, mock_logs): +def test_annotate_non_llm_span_logs_warning(llmobs, mock_llmobs_logs): dummy_tracer = DummyTracer() with dummy_tracer.trace("root") as non_llmobs_span: - LLMObs.annotate(span=non_llmobs_span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.annotate(span=non_llmobs_span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_annotate_finished_span_does_nothing(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: +def test_annotate_finished_span_does_nothing(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: pass - LLMObs.annotate(span=span, parameters={"test": "test"}) - mock_logs.warning.assert_called_once_with("Cannot annotate a finished span.") + llmobs.annotate(span=span, parameters={"test": "test"}) + mock_llmobs_logs.warning.assert_called_once_with("Cannot annotate a finished span.") -def test_annotate_parameters(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) +def test_annotate_parameters(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, parameters={"temperature": 0.9, "max_tokens": 50}) assert span._get_ctx_item(INPUT_PARAMETERS) == {"temperature": 0.9, "max_tokens": 50} - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "Setting parameters is deprecated, please set parameters and other metadata as tags instead." 
) -def test_annotate_metadata(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) +def test_annotate_metadata(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata={"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3}) assert span._get_ctx_item(METADATA) == {"temperature": 0.5, "max_tokens": 20, "top_k": 10, "n": 3} -def test_annotate_metadata_wrong_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, metadata="wrong_metadata") +def test_annotate_metadata_wrong_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, metadata="wrong_metadata") assert span._get_ctx_item(METADATA) is None - mock_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("metadata must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() -def test_annotate_tag(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) +def test_annotate_tag(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags={"test_tag_name": "test_tag_value", "test_numeric_tag": 10}) assert span._get_ctx_item(TAGS) == {"test_tag_name": "test_tag_value", "test_numeric_tag": 10} -def test_annotate_tag_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.annotate(span=span, tags=12345) +def test_annotate_tag_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.annotate(span=span, tags=12345) assert span._get_ctx_item(TAGS) is None - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_tags must be a dictionary of string key - primitive value pairs." 
) -def test_annotate_input_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, input_data="test_input") +def test_annotate_input_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, input_data="test_input") assert llm_span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input"}] - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data="test_input") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data="test_input") assert task_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data="test_input") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data="test_input") assert tool_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data="test_input") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data="test_input") assert workflow_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data="test_input") + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data="test_input") assert retrieval_span._get_ctx_item(INPUT_VALUE) == "test_input" -def test_annotate_numeric_io(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=0, output_data=0) +def test_annotate_numeric_io(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=0, output_data=0) assert task_span._get_ctx_item(INPUT_VALUE) == "0" assert task_span._get_ctx_item(OUTPUT_VALUE) == "0" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=1.23, output_data=1.23) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=1.23, output_data=1.23) assert task_span._get_ctx_item(INPUT_VALUE) == "1.23" assert task_span._get_ctx_item(OUTPUT_VALUE) == "1.23" -def test_annotate_input_serializable_value(LLMObs): - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, input_data=["test_input"]) +def test_annotate_input_serializable_value(llmobs): + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, input_data=["test_input"]) assert task_span._get_ctx_item(INPUT_VALUE) == str(["test_input"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, input_data={"test_input": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, input_data={"test_input": "hello world"}) assert tool_span._get_ctx_item(INPUT_VALUE) == str({"test_input": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, input_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, input_data=("asd", 123)) assert workflow_span._get_ctx_item(INPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, input_data="test_input") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, input_data="test_input") assert 
agent_span._get_ctx_item(INPUT_VALUE) == "test_input" - with LLMObs.retrieval() as retrieval_span: - LLMObs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) + with llmobs.retrieval() as retrieval_span: + llmobs.annotate(span=retrieval_span, input_data=[0, 1, 2, 3, 4]) assert retrieval_span._get_ctx_item(INPUT_VALUE) == str([0, 1, 2, 3, 4]) -def test_annotate_input_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) +def test_annotate_input_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": "test_input", "role": "human"}]) assert span._get_ctx_item(INPUT_MESSAGES) == [{"content": "test_input", "role": "human"}] -def test_annotate_input_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"content": object()}]) +def test_annotate_input_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"content": object()}]) assert span._get_ctx_item(INPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) -def test_llmobs_annotate_incorrect_message_content_type_raises_warning(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) +def test_llmobs_annotate_incorrect_message_content_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input messages.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data={"role": "user", "content": {"nested": "yes"}}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_document_str(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data="test_document_text") +def test_annotate_document_str(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data="test_document_text") documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data="test_document_text") + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data="test_document_text") documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_dict(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": "test_document_text"}) +def test_annotate_document_dict(llmobs): + 
with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": "test_document_text"}) documents = span._get_ctx_item(INPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data={"text": "test_document_text"}) + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data={"text": "test_document_text"}) documents = span._get_ctx_item(OUTPUT_DOCUMENTS) assert documents assert len(documents) == 1 assert documents[0]["text"] == "test_document_text" -def test_annotate_document_list(LLMObs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_document_list(llmobs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -595,8 +507,8 @@ def test_annotate_document_list(LLMObs): assert documents[1]["name"] == "name" assert documents[1]["id"] == "id" assert documents[1]["score"] == 0.9 - with LLMObs.retrieval() as span: - LLMObs.annotate( + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "test_document_text"}, {"text": "text", "name": "name", "score": 0.9, "id": "id"}], ) @@ -610,129 +522,131 @@ def test_annotate_document_list(LLMObs): assert documents[1]["score"] == 0.9 -def test_annotate_incorrect_document_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data={"text": 123}) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, input_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=123) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - LLMObs.annotate(span=span, output_data=object()) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_document_no_text_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - - -def test_annotate_incorrect_document_field_type_raises_warning(LLMObs, mock_logs): - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse input 
documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.embedding(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_incorrect_document_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data={"text": 123}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, input_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=123) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + llmobs.annotate(span=span, output_data=object()) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_document_no_text_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"score": 0.9, "id": "id", "name": "name"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + + +def test_annotate_incorrect_document_field_type_raises_warning(llmobs, mock_llmobs_logs): + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate(span=span, input_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.embedding(model_name="test_model") as span: + llmobs.annotate( span=span, input_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - mock_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) - mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) - mock_logs.reset_mock() - with LLMObs.retrieval() as span: - LLMObs.annotate( + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse input documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate(span=span, output_data=[{"text": "test_document_text", "score": "0.9"}]) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.reset_mock() + with llmobs.retrieval() as span: + llmobs.annotate( span=span, output_data=[{"text": "text", "id": 123, "score": "0.9", "name": ["h", "e", "l", "l", "o"]}] ) - 
mock_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output documents.", exc_info=True) -def test_annotate_output_string(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data="test_output") +def test_annotate_output_string(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data="test_output") assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output"}] - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data="test_output") + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data="test_output") assert embedding_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data="test_output") + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data="test_output") assert task_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data="test_output") + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data="test_output") assert tool_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data="test_output") + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data="test_output") assert workflow_span._get_ctx_item(OUTPUT_VALUE) == "test_output" - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_serializable_value(LLMObs): - with LLMObs.embedding(model_name="test_model") as embedding_span: - LLMObs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) +def test_annotate_output_serializable_value(llmobs): + with llmobs.embedding(model_name="test_model") as embedding_span: + llmobs.annotate(span=embedding_span, output_data=[[0, 1, 2, 3], [4, 5, 6, 7]]) assert embedding_span._get_ctx_item(OUTPUT_VALUE) == str([[0, 1, 2, 3], [4, 5, 6, 7]]) - with LLMObs.task() as task_span: - LLMObs.annotate(span=task_span, output_data=["test_output"]) + with llmobs.task() as task_span: + llmobs.annotate(span=task_span, output_data=["test_output"]) assert task_span._get_ctx_item(OUTPUT_VALUE) == str(["test_output"]) - with LLMObs.tool() as tool_span: - LLMObs.annotate(span=tool_span, output_data={"test_output": "hello world"}) + with llmobs.tool() as tool_span: + llmobs.annotate(span=tool_span, output_data={"test_output": "hello world"}) assert tool_span._get_ctx_item(OUTPUT_VALUE) == str({"test_output": "hello world"}) - with LLMObs.workflow() as workflow_span: - LLMObs.annotate(span=workflow_span, output_data=("asd", 123)) + with llmobs.workflow() as workflow_span: + llmobs.annotate(span=workflow_span, output_data=("asd", 123)) assert workflow_span._get_ctx_item(OUTPUT_VALUE) == str(("asd", 123)) - with LLMObs.agent() as agent_span: - LLMObs.annotate(span=agent_span, output_data="test_output") + with llmobs.agent() as agent_span: + llmobs.annotate(span=agent_span, output_data="test_output") assert 
agent_span._get_ctx_item(OUTPUT_VALUE) == "test_output" -def test_annotate_output_llm_message(LLMObs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) +def test_annotate_output_llm_message(llmobs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": "test_output", "role": "human"}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) == [{"content": "test_output", "role": "human"}] -def test_annotate_output_llm_message_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, output_data=[{"content": object()}]) +def test_annotate_output_llm_message_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, output_data=[{"content": object()}]) assert llm_span._get_ctx_item(OUTPUT_MESSAGES) is None - mock_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) + mock_llmobs_logs.warning.assert_called_once_with("Failed to parse output messages.", exc_info=True) -def test_annotate_metrics(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) +def test_annotate_metrics(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}) assert span._get_ctx_item(METRICS) == {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30} -def test_annotate_metrics_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as llm_span: - LLMObs.annotate(span=llm_span, metrics=12345) +def test_annotate_metrics_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as llm_span: + llmobs.annotate(span=llm_span, metrics=12345) assert llm_span._get_ctx_item(METRICS) is None - mock_logs.warning.assert_called_once_with("metrics must be a dictionary of string key - numeric value pairs.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "metrics must be a dictionary of string key - numeric value pairs." 
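
Taken together, the annotate() tests above pin down the annotation surface: message lists for llm spans, plain input/output values for non-llm spans, plus metadata, tags, and metrics. A short usage sketch consistent with what the tests assert (the model and field values are illustrative):

    from ddtrace.llmobs import LLMObs

    with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span:
        LLMObs.annotate(
            span=span,
            input_data=[{"content": "question", "role": "user"}],
            output_data=[{"content": "answer", "role": "assistant"}],
            metadata={"temperature": 0.5, "max_tokens": 20},
            tags={"test_tag_name": "test_tag_value"},
            metrics={"input_tokens": 10, "output_tokens": 20, "total_tokens": 30},
        )
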
+ ) + mock_llmobs_logs.reset_mock() -def test_annotate_prompt_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -751,9 +665,9 @@ def test_annotate_prompt_dict(LLMObs): } -def test_annotate_prompt_dict_with_context_var_keys(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_dict_with_context_var_keys(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt={ "template": "{var1} {var3}", @@ -774,9 +688,9 @@ def test_annotate_prompt_dict_with_context_var_keys(LLMObs): } -def test_annotate_prompt_typed_dict(LLMObs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate( +def test_annotate_prompt_typed_dict(llmobs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate( span=span, prompt=Prompt( template="{var1} {var3}", @@ -797,63 +711,30 @@ def test_annotate_prompt_typed_dict(LLMObs): } -def test_annotate_prompt_wrong_type(LLMObs, mock_logs): - with LLMObs.llm(model_name="test_model") as span: - LLMObs.annotate(span=span, prompt="prompt") +def test_annotate_prompt_wrong_type(llmobs, mock_llmobs_logs): + with llmobs.llm(model_name="test_model") as span: + llmobs.annotate(span=span, prompt="prompt") assert span._get_ctx_item(INPUT_PROMPT) is None - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() - LLMObs.annotate(span=span, prompt={"template": 1}) - mock_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) - mock_logs.reset_mock() - - -def test_span_error_sets_error(LLMObs, mock_llmobs_span_writer): - with pytest.raises(ValueError): - with LLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: - raise ValueError("test error message") - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) + llmobs.annotate(span=span, prompt={"template": 1}) + mock_llmobs_logs.warning.assert_called_once_with("Failed to validate prompt with error: ", exc_info=True) + mock_llmobs_logs.reset_mock() -def test_span_error_sets_error_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): +def test_span_error_sets_error(llmobs, llmobs_events): with pytest.raises(ValueError): - with AgentlessLLMObs.llm(model_name="test_model", model_provider="test_model_provider") as span: + with llmobs.llm(model_name="test_model", model_provider="test_model_provider") as span: raise ValueError("test error message") - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, - model_name="test_model", - model_provider="test_model_provider", - error="builtins.ValueError", - error_message="test error message", - error_stack=span.get_tag("error.stack"), - ) - ) - - -@pytest.mark.parametrize( - "ddtrace_global_config", - [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], -) -def test_tags(ddtrace_global_config, LLMObs, 
mock_llmobs_span_writer, monkeypatch): - with LLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_llm_span_event( + span, + model_name="test_model", + model_provider="test_model_provider", + error="builtins.ValueError", + error_message="test error message", + error_stack=span.get_tag("error.stack"), ) @@ -861,202 +742,152 @@ def test_tags(ddtrace_global_config, LLMObs, mock_llmobs_span_writer, monkeypatc "ddtrace_global_config", [dict(version="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_tags_agentless(ddtrace_global_config, AgentlessLLMObs, mock_llmobs_span_agentless_writer, monkeypatch): - with AgentlessLLMObs.task(name="test_task") as span: - pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event( - span, - "task", - tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, - ) - ) - - -def test_ml_app_override(LLMObs, mock_llmobs_span_writer): - with LLMObs.task(name="test_task", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with LLMObs.tool(name="test_tool", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with LLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) - ) - with LLMObs.workflow(name="test_workflow", ml_app="test_app") as span: +def test_tags(ddtrace_global_config, llmobs, llmobs_events, monkeypatch): + with llmobs.task(name="test_task") as span: pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with LLMObs.agent(name="test_agent", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with LLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: - pass - mock_llmobs_span_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event( + span, + "task", + tags={"version": "1.2.3", "env": "test_env", "service": "test_service", "ml_app": "test_app_name"}, ) -def test_ml_app_override_agentless(AgentlessLLMObs, mock_llmobs_span_agentless_writer): - with AgentlessLLMObs.task(name="test_task", ml_app="test_app") as span: +def test_ml_app_override(llmobs, llmobs_events): + with llmobs.task(name="test_task", 
ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.tool(name="test_tool", ml_app="test_app") as span: + assert len(llmobs_events) == 1 + assert llmobs_events[0] == _expected_llmobs_non_llm_span_event(span, "task", tags={"ml_app": "test_app"}) + with llmobs.tool(name="test_tool", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: + assert len(llmobs_events) == 2 + assert llmobs_events[1] == _expected_llmobs_non_llm_span_event(span, "tool", tags={"ml_app": "test_app"}) + with llmobs.llm(model_name="model_name", name="test_llm", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 3 + assert llmobs_events[2] == _expected_llmobs_llm_span_event( + span, "llm", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with AgentlessLLMObs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: + with llmobs.embedding(model_name="model_name", name="test_embedding", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event( - span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} - ) + assert len(llmobs_events) == 4 + assert llmobs_events[3] == _expected_llmobs_llm_span_event( + span, "embedding", model_name="model_name", model_provider="custom", tags={"ml_app": "test_app"} ) - with AgentlessLLMObs.workflow(name="test_workflow", ml_app="test_app") as span: + with llmobs.workflow(name="test_workflow", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.agent(name="test_agent", ml_app="test_app") as span: + assert len(llmobs_events) == 5 + assert llmobs_events[4] == _expected_llmobs_non_llm_span_event(span, "workflow", tags={"ml_app": "test_app"}) + with llmobs.agent(name="test_agent", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) - ) - with AgentlessLLMObs.retrieval(name="test_retrieval", ml_app="test_app") as span: + assert len(llmobs_events) == 6 + assert llmobs_events[5] == _expected_llmobs_llm_span_event(span, "agent", tags={"ml_app": "test_app"}) + with llmobs.retrieval(name="test_retrieval", ml_app="test_app") as span: pass - mock_llmobs_span_agentless_writer.enqueue.assert_called_with( - _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) - ) + assert len(llmobs_events) == 7 + assert llmobs_events[6] == _expected_llmobs_non_llm_span_event(span, "retrieval", tags={"ml_app": "test_app"}) -def test_export_span_specified_span_is_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.export_span(span="asd") - mock_logs.warning.assert_called_once_with("Failed to export span. 
Span must be a valid Span object.") +def test_export_span_specified_span_is_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span(span="asd") + mock_llmobs_logs.warning.assert_called_once_with("Failed to export span. Span must be a valid Span object.") -def test_export_span_specified_span_is_not_llmobs_span_raises_warning(LLMObs, mock_logs): +def test_export_span_specified_span_is_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): with DummyTracer().trace("non_llmobs_span") as span: - LLMObs.export_span(span=span) - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") + llmobs.export_span(span=span) + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_specified_span_returns_span_context(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span(span=span) +def test_export_span_specified_span_returns_span_context(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span(span=span) assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_export_span_no_specified_span_no_active_span_raises_warning(LLMObs, mock_logs): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") +def test_export_span_no_specified_span_no_active_span_raises_warning(llmobs, mock_llmobs_logs): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no active LLMObs-generated span found.") -def test_export_span_active_span_not_llmobs_span_raises_warning(LLMObs, mock_logs): - with LLMObs._instance.tracer.trace("non_llmobs_span"): - LLMObs.export_span() - mock_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") +def test_export_span_active_span_not_llmobs_span_raises_warning(llmobs, mock_llmobs_logs): + with llmobs._instance.tracer.trace("non_llmobs_span"): + llmobs.export_span() + mock_llmobs_logs.warning.assert_called_once_with("Span must be an LLMObs-generated span.") -def test_export_span_no_specified_span_returns_exported_active_span(LLMObs): - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - span_context = LLMObs.export_span() +def test_export_span_no_specified_span_returns_exported_active_span(llmobs): + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + span_context = llmobs.export_span() assert span_context is not None assert span_context["span_id"] == str(span.span_id) assert span_context["trace_id"] == "{:x}".format(span.trace_id) -def test_submit_evaluation_llmobs_disabled_raises_warning(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.submit_evaluation( +def test_submit_evaluation_llmobs_disabled_raises_warning(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.submit_evaluation() called when LLMObs is not enabled. Evaluation metric data will not be sent." 
) -def test_submit_evaluation_no_api_key_raises_warning(AgentlessLLMObs, mock_logs): +def test_submit_evaluation_no_api_key_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_dd_api_key="")): - AgentlessLLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "DD_API_KEY is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_ml_app_raises_warning(LLMObs, mock_logs): +def test_submit_evaluation_ml_app_raises_warning(llmobs, mock_llmobs_logs): with override_global_config(dict(_llmobs_ml_app="")): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "ML App name is required for sending evaluation metrics. Evaluation metric data will not be sent. " "Ensure this configuration is set before running your application." ) -def test_submit_evaluation_span_context_incorrect_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( +def test_submit_evaluation_span_context_incorrect_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation(span_context="asd", label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_context must be a dictionary containing both span_id and trace_id keys. " "LLMObs.export_span() can be used to generate this dictionary from a given span." ) -def test_submit_evaluation_empty_span_or_trace_id_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_span_or_trace_id_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"trace_id": "456"}, label="toxicity", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." ) - mock_logs.reset_mock() - LLMObs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation(span_context={"span_id": "456"}, label="toxicity", metric_type="categorical", value="high") + mock_llmobs_logs.warning.assert_called_once_with( "span_id and trace_id must both be specified for the given evaluation metric to be submitted." 
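
The submit_evaluation() checks above and below enumerate the validation failure modes; the happy path pairs export_span() with submit_evaluation(), as the enqueue tests further down exercise. A sketch of that flow (the label, value, and ml_app are illustrative):

    from ddtrace.llmobs import LLMObs

    with LLMObs.llm(model_name="test_model", model_provider="test_provider") as span:
        # returns {"span_id": "<span_id>", "trace_id": "<trace_id>"}
        span_context = LLMObs.export_span(span)

    LLMObs.submit_evaluation(
        span_context=span_context,
        label="sentiment",
        metric_type="score",
        value=0.9,
        ml_app="dummy",
    )
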
) -def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_timestamp_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", @@ -1064,35 +895,35 @@ def test_submit_evaluation_invalid_timestamp_raises_warning(LLMObs, mock_logs): ml_app="dummy", timestamp_ms="invalid", ) - mock_logs.warning.assert_called_once_with( + mock_llmobs_logs.warning.assert_called_once_with( "timestamp_ms must be a non-negative integer. Evaluation metric data will not be sent" ) -def test_submit_evaluation_empty_label_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_empty_label_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="", metric_type="categorical", value="high" ) - mock_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") + mock_llmobs_logs.warning.assert_called_once_with("label must be the specified name of the evaluation metric.") -def test_submit_evaluation_incorrect_metric_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_metric_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="wrong", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") - mock_logs.reset_mock() - LLMObs.submit_evaluation( + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.reset_mock() + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="", value="high" ) - mock_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") + mock_llmobs_logs.warning.assert_called_once_with("metric_type must be one of 'categorical' or 'score'.") -def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_numerical_value_raises_unsupported_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call( "The evaluation metric type 'numerical' is unsupported. Use 'score' instead. 
" @@ -1102,44 +933,44 @@ def test_submit_evaluation_numerical_value_raises_unsupported_warning(LLMObs, mo ) -def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_numerical_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", value="high" ) - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [ mock.call("value must be an integer or float for a score metric."), ] ) -def test_submit_evaluation_incorrect_score_value_type_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_incorrect_score_value_type_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="score", value="high" ) - mock_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") + mock_llmobs_logs.warning.assert_called_once_with("value must be an integer or float for a score metric.") -def test_submit_evaluation_invalid_tags_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_tags_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", tags=["invalid"], ) - mock_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") + mock_llmobs_logs.warning.assert_called_once_with("tags must be a dictionary of string key-value pairs.") -def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): - LLMObs.submit_evaluation( +def test_submit_evaluation_invalid_metadata_raises_warning(llmobs, mock_llmobs_logs): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", value="high", metadata=1, ) - mock_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") + mock_llmobs_logs.warning.assert_called_once_with("metadata must be json serializable dictionary.") @pytest.mark.parametrize( @@ -1147,9 +978,9 @@ def test_submit_evaluation_invalid_metadata_raises_warning(LLMObs, mock_logs): [dict(_llmobs_ml_app="test_app_name")], ) def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( - LLMObs, mock_logs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_logs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1157,8 +988,10 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( tags={1: 2, "foo": "bar"}, ml_app="dummy", ) - mock_logs.warning.assert_called_once_with("Failed to parse tags. Tags for evaluation metrics must be strings.") - mock_logs.reset_mock() + mock_llmobs_logs.warning.assert_called_once_with( + "Failed to parse tags. Tags for evaluation metrics must be strings." 
+ ) + mock_llmobs_logs.reset_mock() mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( ml_app="dummy", @@ -1176,8 +1009,8 @@ def test_submit_evaluation_non_string_tags_raises_warning_but_still_submits( "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_tags(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1202,8 +1035,8 @@ def test_submit_evaluation_metric_tags(LLMObs, mock_llmobs_eval_metric_writer): "ddtrace_global_config", [dict(ddtrace="1.2.3", env="test_env", service="test_service", _llmobs_ml_app="test_app_name")], ) -def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_metric_with_metadata_enqueues_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1225,7 +1058,7 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) ) mock_llmobs_eval_metric_writer.reset() - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1247,8 +1080,8 @@ def test_submit_evaluation_metric_with_metadata_enqueues_metric(LLMObs, mock_llm ) -def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_categorical_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="toxicity", metric_type="categorical", @@ -1266,9 +1099,9 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="toxicity", metric_type="categorical", value="high", @@ -1286,8 +1119,8 @@ def test_submit_evaluation_enqueues_writer_with_categorical_metric(LLMObs, mock_ ) -def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs_eval_metric_writer): - LLMObs.submit_evaluation( +def test_submit_evaluation_enqueues_writer_with_score_metric(llmobs, mock_llmobs_eval_metric_writer): + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="sentiment", metric_type="score", @@ -1300,9 +1133,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" + with llmobs.llm(model_name="test_model", name="test_llm_call", 
model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="sentiment", metric_type="score", value=0.9, ml_app="dummy" ) mock_llmobs_eval_metric_writer.enqueue.assert_called_with( _expected_llmobs_eval_metric_event( @@ -1317,9 +1150,9 @@ def test_submit_evaluation_enqueues_writer_with_score_metric(LLMObs, mock_llmobs def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metric( - LLMObs, mock_llmobs_eval_metric_writer + llmobs, mock_llmobs_eval_metric_writer ): - LLMObs.submit_evaluation( + llmobs.submit_evaluation( span_context={"span_id": "123", "trace_id": "456"}, label="token_count", metric_type="numerical", @@ -1332,9 +1165,9 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr ) ) mock_llmobs_eval_metric_writer.reset_mock() - with LLMObs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: - LLMObs.submit_evaluation( - span_context=LLMObs.export_span(span), + with llmobs.llm(model_name="test_model", name="test_llm_call", model_provider="test_provider") as span: + llmobs.submit_evaluation( + span_context=llmobs.export_span(span), label="token_count", metric_type="numerical", value=35, @@ -1362,18 +1195,18 @@ def test_flush_calls_periodic_agentless( def test_flush_does_not_call_periodic_when_llmobs_is_disabled( - LLMObs, + llmobs, mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, + mock_llmobs_logs, disabled_llmobs, ): - LLMObs.flush() + llmobs.flush() mock_llmobs_span_writer.periodic.assert_not_called() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] ) @@ -1383,113 +1216,113 @@ def test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( mock_llmobs_span_agentless_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, - mock_logs, + mock_llmobs_logs, disabled_llmobs, ): AgentlessLLMObs.flush() mock_llmobs_span_agentless_writer.periodic.assert_not_called() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() - mock_logs.warning.assert_has_calls( + mock_llmobs_logs.warning.assert_has_calls( [mock.call("flushing when LLMObs is disabled. No spans or evaluation metrics will be sent.")] ) -def test_inject_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with( +def test_inject_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with( "LLMObs.inject_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be injected." 
) assert headers == {} -def test_inject_distributed_headers_not_dict_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers("not a dictionary", span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") +def test_inject_distributed_headers_not_dict_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers("not a dictionary", span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == "not a dictionary" - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(123, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(123, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers == 123 - mock_logs.reset_mock() - headers = LLMObs.inject_distributed_headers(None, span=None) - mock_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") + mock_llmobs_logs.reset_mock() + headers = llmobs.inject_distributed_headers(None, span=None) + mock_llmobs_logs.warning.assert_called_once_with("request_headers must be a dictionary of string key-value pairs.") assert headers is None -def test_inject_distributed_headers_no_active_span_logs_warning(LLMObs, mock_logs): - headers = LLMObs.inject_distributed_headers({}, span=None) - mock_logs.warning.assert_called_once_with("No span provided and no currently active span found.") +def test_inject_distributed_headers_no_active_span_logs_warning(llmobs, mock_llmobs_logs): + headers = llmobs.inject_distributed_headers({}, span=None) + mock_llmobs_logs.warning.assert_called_once_with("No span provided and no currently active span found.") assert headers == {} -def test_inject_distributed_headers_span_calls_httppropagator_inject(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_span_calls_httppropagator_inject(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.propagation.http.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=span) + llmobs.inject_distributed_headers({}, span=span) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_inject_distributed_headers_current_active_span_injected(LLMObs, mock_logs): - span = LLMObs._instance.tracer.trace("test_span") +def test_inject_distributed_headers_current_active_span_injected(llmobs, mock_llmobs_logs): + span = llmobs._instance.tracer.trace("test_span") with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.inject") as mock_inject: - LLMObs.inject_distributed_headers({}, span=None) + llmobs.inject_distributed_headers({}, span=None) assert mock_inject.call_count == 1 mock_inject.assert_called_once_with(span.context, {}) -def test_activate_distributed_headers_llmobs_disabled_does_nothing(LLMObs, mock_logs): - LLMObs.disable() - LLMObs.activate_distributed_headers({}) - mock_logs.warning.assert_called_once_with( +def test_activate_distributed_headers_llmobs_disabled_does_nothing(llmobs, mock_llmobs_logs): + llmobs.disable() + llmobs.activate_distributed_headers({}) + mock_llmobs_logs.warning.assert_called_once_with( 
"LLMObs.activate_distributed_headers() called when LLMObs is not enabled. " "Distributed context will not be activated." ) -def test_activate_distributed_headers_calls_httppropagator_extract(LLMObs, mock_logs): +def test_activate_distributed_headers_calls_httppropagator_extract(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_extract.assert_called_once_with({}) -def test_activate_distributed_headers_no_trace_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_trace_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(span_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_span_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_span_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: mock_extract.return_value = Context(trace_id="123", meta={PROPAGATED_PARENT_ID_KEY: "123"}) - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract trace ID or span ID from request headers.") -def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(LLMObs, mock_logs): +def test_activate_distributed_headers_no_llmobs_parent_id_does_nothing(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456") mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 - mock_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") + mock_llmobs_logs.warning.assert_called_once_with("Failed to extract LLMObs parent ID from request headers.") mock_activate.assert_called_once_with(dummy_context) -def test_activate_distributed_headers_activates_context(LLMObs, mock_logs): +def test_activate_distributed_headers_activates_context(llmobs, mock_llmobs_logs): with mock.patch("ddtrace.llmobs._llmobs.HTTPPropagator.extract") as mock_extract: dummy_context = Context(trace_id="123", span_id="456", meta={PROPAGATED_PARENT_ID_KEY: "789"}) mock_extract.return_value = dummy_context with mock.patch("ddtrace.llmobs.LLMObs._instance.tracer.context_provider.activate") as mock_activate: - LLMObs.activate_distributed_headers({}) + llmobs.activate_distributed_headers({}) assert mock_extract.call_count == 1 mock_activate.assert_called_once_with(dummy_context) @@ -1504,16 +1337,10 @@ def test_llmobs_fork_recreates_and_restarts_span_writer(): if pid: # parent assert 
llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._llmobs_span_writer == original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._llmobs_span_writer != original_span_writer - assert ( - llmobs_service._instance._trace_processor._span_writer == llmobs_service._instance._llmobs_span_writer - ) assert llmobs_service._instance._llmobs_span_writer.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1559,18 +1386,10 @@ def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluato if pid: # parent assert llmobs_service._instance.tracer._pid == original_pid assert llmobs_service._instance._evaluator_runner == original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING else: # child assert llmobs_service._instance.tracer._pid != original_pid assert llmobs_service._instance._evaluator_runner != original_evaluator_runner - assert ( - llmobs_service._instance._trace_processor._evaluator_runner - == llmobs_service._instance._evaluator_runner - ) assert llmobs_service._instance._evaluator_runner.status == ServiceStatus.RUNNING llmobs_service.disable() os._exit(12) @@ -1700,46 +1519,46 @@ def test_llmobs_fork_disabled_then_enabled(monkeypatch): svc.disable() -def test_llmobs_with_evaluator_runner(LLMObs, mock_llmobs_evaluator_runner): - with LLMObs.llm(model_name="test_model"): +def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): + with llmobs.llm(model_name="test_model"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 1 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 1 -def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): +def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, LLMObs): - with LLMObs.workflow(name="test"): +def test_llmobs_with_evaluation_runner_does_not_enqueue_non_llm_spans(mock_llmobs_evaluator_runner, llmobs): + with llmobs.workflow(name="test"): pass - with LLMObs.agent(name="test"): + with llmobs.agent(name="test"): pass - with LLMObs.task(name="test"): + with llmobs.task(name="test"): pass - with LLMObs.embedding(model_name="test"): + with llmobs.embedding(model_name="test"): pass - with LLMObs.retrieval(name="test"): + with llmobs.retrieval(name="test"): pass - with LLMObs.tool(name="test"): + with llmobs.tool(name="test"): pass time.sleep(0.1) - assert LLMObs._instance._evaluator_runner.enqueue.call_count == 0 + assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 -def test_annotation_context_modifies_span_tags(LLMObs): - with 
LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_modifies_span_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -def test_annotation_context_modifies_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1747,80 +1566,80 @@ def test_annotation_context_modifies_prompt(LLMObs): } -def test_annotation_context_modifies_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +def test_annotation_context_modifies_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -def test_annotation_context_finished_context_does_not_modify_tags(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar"}): +def test_annotation_context_finished_context_does_not_modify_tags(llmobs): + with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -def test_annotation_context_finished_context_does_not_modify_prompt(LLMObs): - with LLMObs.annotation_context(prompt={"template": "test_template"}): +def test_annotation_context_finished_context_does_not_modify_prompt(llmobs): + with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -def test_annotation_context_finished_context_does_not_modify_name(LLMObs): - with LLMObs.annotation_context(name="test_agent_override"): +def test_annotation_context_finished_context_does_not_modify_name(llmobs): + with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -def test_annotation_context_nested(LLMObs): - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested(llmobs): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -def test_annotation_context_nested_overrides_name(LLMObs): - with LLMObs.annotation_context(name="unexpected"): - with LLMObs.annotation_context(name="expected"): - with LLMObs.agent(name="test_agent") as span: +def test_annotation_context_nested_overrides_name(llmobs): + with llmobs.annotation_context(name="unexpected"): + with llmobs.annotation_context(name="expected"): + with llmobs.agent(name="test_agent") as span: assert span.name == "expected" -def 
test_annotation_context_nested_maintains_trace_structure(LLMObs, mock_llmobs_span_writer): +def test_annotation_context_nested_maintains_trace_structure(llmobs, llmobs_events): """This test makes sure starting/stopping annotation contexts do not modify the llmobs trace structure""" - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span") as parent_span: - with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.workflow(name="child_span") as child_span: + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span") as parent_span: + with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.workflow(name="child_span") as child_span: assert child_span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} assert parent_span._get_ctx_item(TAGS) == {"foo": "bar", "boo": "bar"} - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - parent_span, child_span = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] + assert len(llmobs_events) == 2 + parent_span, child_span = llmobs_events[1], llmobs_events[0] assert child_span["trace_id"] == parent_span["trace_id"] assert child_span["span_id"] != parent_span["span_id"] assert child_span["parent_id"] == parent_span["span_id"] assert parent_span["parent_id"] == "undefined" - mock_llmobs_span_writer.reset_mock() - with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - with LLMObs.agent(name="parent_span"): +def test_annotation_context_separate_traces_maintained(llmobs, llmobs_events): + with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + with llmobs.agent(name="parent_span"): pass - with LLMObs.workflow(name="child_span"): + with llmobs.workflow(name="child_span"): pass - assert len(mock_llmobs_span_writer.enqueue.call_args_list) == 2 - trace_one, trace_two = [span[0] for span, _ in mock_llmobs_span_writer.enqueue.call_args_list] - assert trace_one["trace_id"] != trace_two["trace_id"] - assert trace_one["span_id"] != trace_two["span_id"] - assert trace_two["parent_id"] == "undefined" - assert trace_one["parent_id"] == "undefined" + assert len(llmobs_events) == 2 + agent_span, workflow_span = llmobs_events[1], llmobs_events[0] + assert agent_span["trace_id"] != workflow_span["trace_id"] + assert agent_span["span_id"] != workflow_span["span_id"] + assert workflow_span["parent_id"] == "undefined" + assert agent_span["parent_id"] == "undefined" -def test_annotation_context_only_applies_to_local_context(LLMObs): +def test_annotation_context_only_applies_to_local_context(llmobs): """ tests that annotation contexts only apply to spans belonging to the same trace context and not globally to all spans. 
@@ -1836,8 +1655,8 @@ def test_annotation_context_only_applies_to_local_context(LLMObs): def context_one(): nonlocal agent_has_correct_name nonlocal agent_has_correct_tags - with LLMObs.annotation_context(name="expected_agent", tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: + with llmobs.annotation_context(name="expected_agent", tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: event.wait() agent_has_correct_tags = span._get_ctx_item(TAGS) == {"foo": "bar"} agent_has_correct_name = span.name == "expected_agent" @@ -1846,9 +1665,9 @@ def context_one(): def context_two(): nonlocal tool_has_correct_name nonlocal tool_does_not_have_tags - with LLMObs.agent(name="test_agent"): - with LLMObs.annotation_context(name="expected_tool"): - with LLMObs.tool(name="test_tool") as tool_span: + with llmobs.agent(name="test_agent"): + with llmobs.annotation_context(name="expected_tool"): + with llmobs.tool(name="test_tool") as tool_span: event.wait() tool_does_not_have_tags = tool_span._get_ctx_item(TAGS) is None tool_has_correct_name = tool_span.name == "expected_tool" @@ -1858,7 +1677,7 @@ def context_two(): thread_one.start() thread_two.start() - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" assert span._get_ctx_item(TAGS) is None @@ -1874,15 +1693,15 @@ def context_two(): assert tool_does_not_have_tags is True -async def test_annotation_context_async_modifies_span_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_modifies_span_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "bar"} -async def test_annotation_context_async_modifies_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) == { "template": "test_template", "_dd_context_variable_keys": ["context"], @@ -1890,37 +1709,37 @@ async def test_annotation_context_async_modifies_prompt(LLMObs): } -async def test_annotation_context_async_modifies_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): - with LLMObs.llm(name="test_agent", model_name="test") as span: +async def test_annotation_context_async_modifies_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): + with llmobs.llm(name="test_agent", model_name="test") as span: assert span.name == "test_agent_override" -async def test_annotation_context_async_finished_context_does_not_modify_tags(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar"}): +async def test_annotation_context_async_finished_context_does_not_modify_tags(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar"}): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) is None -async def test_annotation_context_async_finished_context_does_not_modify_prompt(LLMObs): - async with LLMObs.annotation_context(prompt={"template": "test_template"}): +async def 
test_annotation_context_async_finished_context_does_not_modify_prompt(llmobs): + async with llmobs.annotation_context(prompt={"template": "test_template"}): pass - with LLMObs.llm(name="test_agent", model_name="test") as span: + with llmobs.llm(name="test_agent", model_name="test") as span: assert span._get_ctx_item(INPUT_PROMPT) is None -async def test_annotation_context_finished_context_async_does_not_modify_name(LLMObs): - async with LLMObs.annotation_context(name="test_agent_override"): +async def test_annotation_context_finished_context_async_does_not_modify_name(llmobs): + async with llmobs.annotation_context(name="test_agent_override"): pass - with LLMObs.agent(name="test_agent") as span: + with llmobs.agent(name="test_agent") as span: assert span.name == "test_agent" -async def test_annotation_context_async_nested(LLMObs): - async with LLMObs.annotation_context(tags={"foo": "bar", "boo": "bar"}): - async with LLMObs.annotation_context(tags={"foo": "baz"}): - with LLMObs.agent(name="test_agent") as span: +async def test_annotation_context_async_nested(llmobs): + async with llmobs.annotation_context(tags={"foo": "bar", "boo": "bar"}): + async with llmobs.annotation_context(tags={"foo": "baz"}): + with llmobs.agent(name="test_agent") as span: assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} From cd23e8554a6853e7ffa89385d14be307c6f4d597 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Fri, 3 Jan 2025 10:30:04 -0500 Subject: [PATCH 04/12] Remove failing assertions due to an irrelevant side effect --- tests/llmobs/test_propagation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index d892c6b98a2..e6f5234c824 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -216,8 +216,6 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) - headers = json.loads(stdout.decode()) LLMObs.activate_distributed_headers(headers) with LLMObs.workflow("LLMObs span") as span: @@ -252,7 +250,6 @@ def test_activate_distributed_headers_propagate_llmobs_parent_id_complex(run_pyt env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) - assert stderr == b"", (stdout, stderr) headers = json.loads(stdout.decode()) LLMObs.activate_distributed_headers(headers) From cadae2ca875401a0805ee8a23498bd48d24c2f2e Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Fri, 3 Jan 2025 16:29:40 -0500 Subject: [PATCH 05/12] Fix ragas span tests to use events helper --- ...est_llmobs_ragas_faithfulness_evaluator.py | 32 +++++++++---------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index 1f78b538f24..f5f02602c8a 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -167,19 +167,18 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, L @pytest.mark.vcr_logs -def test_ragas_faithfulness_emits_traces(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) +def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): +"""Faithfulness evaluation should emit exactly 7 ragas spans: the root evaluation span plus spans for input extraction, statement creation (and its prompt), verdict creation (and its prompt), and score computation.""" + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - assert rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_count == 7 - calls = rf_evaluator.llmobs_service._instance._llmobs_span_writer.enqueue.call_args_list - - spans = [call[0][0] for call in calls] - + ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 # check name, io, span kinds match - assert spans == _expected_ragas_spans() + assert ragas_spans == _expected_ragas_spans() # verify the trace structure - root_span = spans[0] + root_span = ragas_spans[0] root_span_id = root_span["span_id"] assert root_span["parent_id"] == "undefined" assert root_span["meta"] is not None @@ -187,16 +186,15 @@ def test_ragas_faithfulness_emits_traces(ragas, LLMObs): assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) assert isinstance(root_span["meta"]["metadata"]["statements"], list) root_span_trace_id = root_span["trace_id"] - for child_span in spans[1:]: + for child_span in ragas_spans[1:]: assert child_span["trace_id"] == root_span_trace_id - assert spans[1]["parent_id"] == root_span_id # input extraction (task) - assert spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert spans[6]["parent_id"] == root_span_id # create score (task) - - assert spans[3]["parent_id"] == spans[2]["span_id"] # create statements prompt (task) - assert spans[5]["parent_id"] == spans[4]["span_id"] # create verdicts prompt (task) + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess): From e0da7f927e15b2f05a1f4316be21d2d62bfa0a08 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Fri, 3 Jan 2025 16:41:21 -0500 Subject: [PATCH 06/12] Add lockfile, fmt --- .riot/requirements/16562eb.txt | 32 ++++++++++++++++++++++++++++++++ tests/llmobs/test_propagation.py | 1 + 2 files changed, 33 insertions(+) create mode 100644 .riot/requirements/16562eb.txt diff --git a/.riot/requirements/16562eb.txt b/.riot/requirements/16562eb.txt new file mode 100644 index 00000000000..e2aac88c146 --- /dev/null +++ b/.riot/requirements/16562eb.txt @@ -0,0 +1,32 @@ +# +# This file is autogenerated by pip-compile with Python 3.7 +# by the following command: +# +# pip-compile --allow-unsafe --config=pyproject.toml --no-annotate --resolver=backtracking .riot/requirements/16562eb.in +# +attrs==24.2.0 +coverage[toml]==7.2.7 +exceptiongroup==1.2.2 +hypothesis==6.45.0 +idna==3.10 +importlib-metadata==6.7.0 +iniconfig==2.0.0 +mock==5.1.0 +multidict==6.0.5 +opentracing==2.4.0 +packaging==24.0 +pluggy==1.2.0 +pytest==7.4.4 +pytest-asyncio==0.21.1 +pytest-cov==4.1.0 +pytest-mock==3.11.1 +pyyaml==6.0.1 +six==1.17.0 +sortedcontainers==2.4.0 +tomli==2.0.1 +typing-extensions==4.7.1 +urllib3==1.26.20
+vcrpy==4.4.0 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.15.0 diff --git a/tests/llmobs/test_propagation.py b/tests/llmobs/test_propagation.py index e6f5234c824..d14b22d65d5 100644 --- a/tests/llmobs/test_propagation.py +++ b/tests/llmobs/test_propagation.py @@ -216,6 +216,7 @@ def test_activate_distributed_headers_propagate_correct_llmobs_parent_id_simple( env["DD_TRACE_ENABLED"] = "0" stdout, stderr, status, _ = run_python_code_in_subprocess(code=code, env=env) assert status == 0, (stdout, stderr) + headers = json.loads(stdout.decode()) LLMObs.activate_distributed_headers(headers) with LLMObs.workflow("LLMObs span") as span: From c2e59b4d4446eef82eeeab8e15778d4241f87369 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Fri, 3 Jan 2025 17:56:03 -0500 Subject: [PATCH 07/12] Skip ragas tests if ragas not available --- tests/llmobs/conftest.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index 7e4ff7021a1..3cd9d6055c7 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -198,16 +198,20 @@ def mock_ragas_dependencies_not_present(): @pytest.fixture def ragas(mock_llmobs_span_writer, mock_llmobs_eval_metric_writer): with override_global_config(dict(_dd_api_key="")): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") with override_env(dict(OPENAI_API_KEY=os.getenv("OPENAI_API_KEY", ""))): yield ragas @pytest.fixture def reset_ragas_faithfulness_llm(): - import ragas - + try: + import ragas + except ImportError: + pytest.skip("Ragas not installed") previous_llm = ragas.metrics.faithfulness.llm yield ragas.metrics.faithfulness.llm = previous_llm From 24b8c92d63b032e017b7a216b26b9ea212770bb3 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Mon, 6 Jan 2025 11:39:17 -0500 Subject: [PATCH 08/12] Skip ragas tests if ragas not available --- riotfile.py | 6 +- ...emits_traces_and_evaluations_on_exit.yaml} | 0 ...test_ragas_faithfulness_emits_traces.yaml} | 0 ...agas_faithfulness_submits_evaluation.yaml} | 0 ..._evaluation_on_span_with_custom_keys.yaml} | 0 ...on_on_span_with_question_in_messages.yaml} | 0 ...est_llmobs_ragas_faithfulness_evaluator.py | 406 +++++++++--------- tests/llmobs/test_llmobs_service.py | 4 + 8 files changed, 214 insertions(+), 202 deletions(-) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml} (100%) rename 
tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml} (100%) diff --git a/riotfile.py b/riotfile.py index f274b84bb0a..8f36a9ff80b 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2885,7 +2885,11 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, venvs=[ Venv(pys="3.7"), - Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), + Venv( + pys=select_pys(min_version="3.8"), + pkgs={"ragas": "==0.1.21", "langchain": latest}, + env={"RAGAS_AVAILABLE": "True"}, + ), ], ), Venv( diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git 
a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index f5f02602c8a..42a69a4d613 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -11,15 +11,11 @@ from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt -def _llm_span_without_io(): - return _expected_llmobs_llm_span_event(Span("dummy")) +RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) -def test_ragas_evaluator_init(ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs - assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness - assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() +def _llm_span_without_io(): + return _expected_llmobs_llm_span_event(Span("dummy")) def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, ragas): @@ -27,195 +23,203 @@ def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_raga RagasFaithfulnessEvaluator(LLMObs) -def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) - assert failure_msg == "fail_extract_faithfulness_inputs" - assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 - - -def test_ragas_faithfulness_has_modified_faithfulness_instance( - ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs -): - """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" - from ragas.llms import BaseRagasLLM - from ragas.metrics import faithfulness - - class FirstDummyLLM(BaseRagasLLM): - def __init__(self): - super().__init__() - - def generate_text(self) -> str: - return "dummy llm" - - def agenerate_text(self) -> str: - return "dummy llm" - - faithfulness.llm = FirstDummyLLM() - - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - - assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" - - class SecondDummyLLM(BaseRagasLLM): - def __init__(self): - super().__init__() - - def generate_text(self, statements) -> str: - raise ValueError("dummy_llm") - - def agenerate_text(self, statements) -> str: - raise ValueError("dummy_llm") - - faithfulness.llm = SecondDummyLLM() - - with pytest.raises(ValueError, match="dummy_llm"): - rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - - -@pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, 
mock_llmobs_submit_evaluation): - """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), +@pytest.mark.skipif(not RAGAS_AVAILABLE, reason="Tests require ragas to be available on user env") +class TestRagasFaithfulnessEvaluator: + def test_ragas_evaluator_init(self, ragas, LLMObs): + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + assert rf_evaluator.llmobs_service == LLMObs + assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness + assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() + + def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails( + self, ragas, mock_llmobs_submit_evaluation, LLMObs + ): + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) + assert failure_msg == "fail_extract_faithfulness_inputs" + assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 + + def test_ragas_faithfulness_has_modified_faithfulness_instance( + self, ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs + ): + """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" + from ragas.llms import BaseRagasLLM + from ragas.metrics import faithfulness + + class FirstDummyLLM(BaseRagasLLM): + def __init__(self): + super().__init__() + + def generate_text(self) -> str: + return "dummy llm" + + def agenerate_text(self) -> str: + return "dummy llm" + + faithfulness.llm = FirstDummyLLM() + + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + + assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" + + class SecondDummyLLM(BaseRagasLLM): + def __init__(self): + super().__init__() + + def generate_text(self, statements) -> str: + raise ValueError("dummy_llm") + + def agenerate_text(self, statements) -> str: + raise ValueError("dummy_llm") + + faithfulness.llm = SecondDummyLLM() + + with pytest.raises(ValueError, match="dummy_llm"): + rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) + + @pytest.mark.vcr_logs + def test_ragas_faithfulness_submits_evaluation(self, ragas, LLMObs, mock_llmobs_submit_evaluation): + """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + @pytest.mark.vcr_logs + def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( + self, ragas, LLMObs, mock_llmobs_submit_evaluation + ): + """Test that evaluation is submitted for a valid llm span where 
the last message content is the question""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _llm_span_with_expected_ragas_inputs_in_messages() + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + @pytest.mark.vcr_logs + def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys( + self, ragas, LLMObs, mock_llmobs_submit_evaluation + ): + """Test that evaluation is submitted for a valid llm span whose prompt variables use custom context and query keys""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _expected_llmobs_llm_span_event( + Span("dummy"), + prompt={ + "variables": { + "user_input": "Is france part of europe?", + "context_1": "hello, ", + "context_2": "france is ", + "context_3": "part of europe", }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - -@pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - ragas, LLMObs, mock_llmobs_submit_evaluation -): - """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _llm_span_with_expected_ragas_inputs_in_messages() - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - -@pytest.mark.vcr_logs -def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): - """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _expected_llmobs_llm_span_event( - Span("dummy"), - prompt={ - "variables": { - "user_input": "Is france part of europe?", - "context_1": "hello, ", - "context_2": "france is ", - "context_3": "part of europe", + "_dd_context_variable_keys": ["context_1", "context_2", "context_3"], + "_dd_query_variable_keys": ["user_input"], }, - "_dd_context_variable_keys": ["context_1", "context_2", "context_3"], - "_dd_query_variable_keys": ["user_input"], - }, - output_messages=[{"content": "France is indeed part of europe"}], - ) - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - -@pytest.mark.vcr_logs -def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): - """Faithfulness evaluation should emit exactly 7 ragas spans: the root evaluation span plus spans for input extraction, statement creation (and its prompt), verdict creation (and its prompt), and score computation.""" - rf_evaluator = RagasFaithfulnessEvaluator(llmobs) - rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] - ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) - assert len(ragas_spans) == 7 - # check name, io, span kinds match - assert ragas_spans == _expected_ragas_spans() - - # verify the trace structure - root_span = ragas_spans[0] - root_span_id = root_span["span_id"] - assert root_span["parent_id"] == "undefined" - assert root_span["meta"] is not None - assert root_span["meta"]["metadata"] is not None - assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) - assert isinstance(root_span["meta"]["metadata"]["statements"], list) - root_span_trace_id = root_span["trace_id"] - for child_span in ragas_spans[1:]: - assert child_span["trace_id"] == root_span_trace_id - - assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) - assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) - assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) - assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) - - def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( - self, mock_writer_logs, run_python_code_in_subprocess - ): - env = os.environ.copy() - pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] - if "PYTHONPATH" in env: - pypath.append(env["PYTHONPATH"]) - env.update( - { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", - "PYTHONPATH": ":".join(pypath), - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), - "DD_LLMOBS_ML_APP": "unnamed-ml-app", - "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", - "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", - } - ) - out, err, status, pid = run_python_code_in_subprocess( - """ + output_messages=[{"content": "France is indeed part of europe"}], + ) + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + @pytest.mark.vcr_logs + def test_ragas_faithfulness_emits_traces(self, ragas, llmobs, llmobs_events): + """Faithfulness evaluation should emit exactly 7 ragas spans: the root evaluation span plus spans for input extraction, statement creation (and its prompt), verdict creation (and its prompt), and score computation.""" + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) + ragas_spans = [event for event in 
llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 + # check name, io, span kinds match + assert ragas_spans == _expected_ragas_spans() + + # verify the trace structure + root_span = ragas_spans[0] + root_span_id = root_span["span_id"] + assert root_span["parent_id"] == "undefined" + assert root_span["meta"] is not None + assert root_span["meta"]["metadata"] is not None + assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) + assert isinstance(root_span["meta"]["metadata"]["statements"], list) + root_span_trace_id = root_span["trace_id"] + for child_span in ragas_spans[1:]: + assert child_span["trace_id"] == root_span_trace_id + + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) + + def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( + self, mock_writer_logs, run_python_code_in_subprocess + ): + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update( + { + "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), + "DD_SITE": "datad0g.com", + "PYTHONPATH": ":".join(pypath), + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), + "DD_LLMOBS_ML_APP": "unnamed-ml-app", + "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", + "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", + "DD_LLMOBS_AGENTLESS_ENABLED": "true", + } + ) + out, err, status, pid = run_python_code_in_subprocess( + """ import os import time import atexit @@ -226,7 +230,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" + "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__()) @@ -239,9 +243,9 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log ): LLMObs.enable() LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) -""", - env=env, - ) - assert status == 0, err - assert out == b"" - assert err == b"" + """, + env=env, + ) + assert status == 0, err + assert out == b"" + assert err == b"" diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 3689b7bac63..f550bf639ac 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -38,6 +38,9 @@ from tests.utils import override_global_config +RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) + + def run_llmobs_trace_filter(dummy_tracer): with dummy_tracer.trace("span1", span_type=SpanTypes.LLM) as span: span.set_tag_str(SPAN_KIND, "llm") @@ -1743,6 +1746,7 @@ async def test_annotation_context_async_nested(llmobs): assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} 
+@pytest.mark.skipif(not RAGAS_AVAILABLE, reason="Test requires ragas to be available on user env") def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): From 34f469ddcbe2e76f08f4ec20fd40f88320c55576 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Mon, 6 Jan 2025 15:39:42 -0500 Subject: [PATCH 09/12] Release note --- .../notes/fix-llmobs-processor-4afd715a84323d32.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml diff --git a/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml new file mode 100644 index 00000000000..5912a415022 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-processor-4afd715a84323d32.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + LLM Observability: Resolves an issue where configuring custom trace filters/processors on the tracer would disable LLM Observability. + Note that if LLM Observability is enabled in agentless mode, writing APM traces must be explicitly disabled by setting `DD_TRACE_ENABLED=0`.
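For illustration, a minimal sketch of the agentless setup this release note prescribes — not part of the patch series itself. The ml_app and workflow names below are placeholders, and the environment variables are the same switches the subprocess tests in this series set (DD_TRACE_ENABLED, DD_LLMOBS_AGENTLESS_ENABLED); LLMObs.enable() and LLMObs.workflow() are the public entry points already exercised by these tests:

    # Launch the process with APM trace writing off and agentless LLMObs on, e.g.:
    #   DD_TRACE_ENABLED=0 DD_LLMOBS_AGENTLESS_ENABLED=true DD_API_KEY=<api-key> python app.py
    from ddtrace.llmobs import LLMObs

    LLMObs.enable(ml_app="my-ml-app")  # "my-ml-app" is a placeholder name
    with LLMObs.workflow("example-workflow"):
        pass  # LLMObs span events are still submitted; no APM traces are written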
From b20211e1c799b19ab5a95fd93d343b131dd684b5 Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Mon, 6 Jan 2025 16:09:11 -0500 Subject: [PATCH 10/12] Skip all ragas tests if ragas unavailable --- riotfile.py | 6 +- ...emits_traces_and_evaluations_on_exit.yaml} | 0 ...test_ragas_faithfulness_emits_traces.yaml} | 0 ...agas_faithfulness_submits_evaluation.yaml} | 0 ..._evaluation_on_span_with_custom_keys.yaml} | 0 ...on_on_span_with_question_in_messages.yaml} | 0 ...est_llmobs_ragas_faithfulness_evaluator.py | 405 +++++++++--------- tests/llmobs/test_llmobs_service.py | 2 +- 8 files changed, 204 insertions(+), 209 deletions(-) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml} (100%) rename tests/llmobs/llmobs_cassettes/{tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml => tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml} (100%) diff --git a/riotfile.py b/riotfile.py index 8f36a9ff80b..f274b84bb0a 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2885,11 +2885,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT pkgs={"vcrpy": latest, "pytest-asyncio": "==0.21.1"}, venvs=[ Venv(pys="3.7"), - Venv( - pys=select_pys(min_version="3.8"), - pkgs={"ragas": "==0.1.21", "langchain": latest}, - env={"RAGAS_AVAILABLE": "True"}, - ), + Venv(pys=select_pys(min_version="3.8"), pkgs={"ragas": "==0.1.21", "langchain": latest}), ], ), Venv( diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_emits_traces.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_emits_traces.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml similarity index 100% rename from tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys.yaml diff --git a/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml b/tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml similarity index 100% rename from 
tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml rename to tests/llmobs/llmobs_cassettes/tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages.yaml diff --git a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index 42a69a4d613..7309d911b31 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -11,215 +11,214 @@ from tests.llmobs._utils import _llm_span_with_expected_ragas_inputs_in_prompt -RAGAS_AVAILABLE = os.getenv("RAGAS_AVAILABLE", False) +pytest.importorskip("ragas", reason="Tests require ragas to be available on user env") def _llm_span_without_io(): return _expected_llmobs_llm_span_event(Span("dummy")) +def test_ragas_evaluator_init(ragas, LLMObs): + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + assert rf_evaluator.llmobs_service == LLMObs + assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness + assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() + + def test_ragas_faithfulness_throws_if_dependencies_not_present(LLMObs, mock_ragas_dependencies_not_present, ragas): with pytest.raises(NotImplementedError, match="Failed to load dependencies for `ragas_faithfulness` evaluator"): RagasFaithfulnessEvaluator(LLMObs) -@pytest.mark.skipif(not RAGAS_AVAILABLE, reason="Tests require ragas to be available on user env") -class TestRagasFaithfulnessEvaluator: - def test_ragas_evaluator_init(self, ragas, LLMObs): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - assert rf_evaluator.llmobs_service == LLMObs - assert rf_evaluator.ragas_faithfulness_instance == ragas.metrics.faithfulness - assert rf_evaluator.ragas_faithfulness_instance.llm == ragas.llms.llm_factory() - - def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails( - self, ragas, mock_llmobs_submit_evaluation, LLMObs - ): - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) - assert failure_msg == "fail_extract_faithfulness_inputs" - assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 - - def test_ragas_faithfulness_has_modified_faithfulness_instance( - self, ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs - ): - """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" - from ragas.llms import BaseRagasLLM - from ragas.metrics import faithfulness - - class FirstDummyLLM(BaseRagasLLM): - def __init__(self): - super().__init__() - - def generate_text(self) -> str: - return "dummy llm" - - def agenerate_text(self) -> str: - return "dummy llm" - - faithfulness.llm = FirstDummyLLM() - - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - - assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" - - class SecondDummyLLM(BaseRagasLLM): - def __init__(self): - super().__init__() - - def generate_text(self, statements) -> str: - raise ValueError("dummy_llm") - - def agenerate_text(self, statements) -> str: - raise ValueError("dummy_llm") - - faithfulness.llm = SecondDummyLLM() - - with pytest.raises(ValueError, match="dummy_llm"): - rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - - 
@pytest.mark.vcr_logs - def test_ragas_faithfulness_submits_evaluation(self, ragas, LLMObs, mock_llmobs_submit_evaluation): - """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", - }, - ) - ] - ) - - @pytest.mark.vcr_logs - def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( - self, ragas, LLMObs, mock_llmobs_submit_evaluation - ): - """Test that evaluation is submitted for a valid llm span where the last message content is the question""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _llm_span_with_expected_ragas_inputs_in_messages() - rf_evaluator.run_and_submit_evaluation(llm_span) - rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( - [ - mock.call( - span_context={ - "span_id": llm_span.get("span_id"), - "trace_id": llm_span.get("trace_id"), - }, - label=RagasFaithfulnessEvaluator.LABEL, - metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, - value=1.0, - metadata={ - "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, - "_dd.faithfulness_disagreements": mock.ANY, - "_dd.evaluation_kind": "faithfulness", }, - ) - ] - ) - - @pytest.mark.vcr_logs - def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys( - self, ragas, LLMObs, mock_llmobs_submit_evaluation - ): - """Test that evaluation is submitted for a valid llm span whose prompt variables use custom context and query keys""" - rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) - llm_span = _expected_llmobs_llm_span_event( - Span("dummy"), - prompt={ - "variables": { - "user_input": "Is france part of europe?", - "context_1": "hello, ", - "context_2": "france is ", - "context_3": "part of europe", +def test_ragas_faithfulness_returns_none_if_inputs_extraction_fails(ragas, mock_llmobs_submit_evaluation, LLMObs): + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + failure_msg, _ = rf_evaluator.evaluate(_llm_span_without_io()) + assert failure_msg == "fail_extract_faithfulness_inputs" + assert rf_evaluator.llmobs_service.submit_evaluation.call_count == 0 + + +def test_ragas_faithfulness_has_modified_faithfulness_instance( + ragas, mock_llmobs_submit_evaluation, reset_ragas_faithfulness_llm, LLMObs +): + """Faithfulness instance used in ragas evaluator should match the global ragas faithfulness instance""" + from ragas.llms import BaseRagasLLM + from ragas.metrics import faithfulness + + class FirstDummyLLM(BaseRagasLLM): + def __init__(self): + super().__init__() + + def generate_text(self) -> str: + return "dummy llm" + + def agenerate_text(self) -> str: + return "dummy llm" + + faithfulness.llm = FirstDummyLLM() + + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + + assert rf_evaluator.ragas_faithfulness_instance.llm.generate_text() == "dummy llm" + + class SecondDummyLLM(BaseRagasLLM): + def __init__(self): + super().__init__() + + def generate_text(self, statements) -> str: + raise ValueError("dummy_llm") + + def agenerate_text(self, statements) -> str: + raise ValueError("dummy_llm") + + faithfulness.llm = SecondDummyLLM() + + with pytest.raises(ValueError, match="dummy_llm"): + rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation(ragas, LLMObs, mock_llmobs_submit_evaluation): + """Test that evaluation is submitted for a valid llm span where question is in the prompt variables""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _llm_span_with_expected_ragas_inputs_in_prompt() + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation_on_span_with_question_in_messages( + ragas, LLMObs, mock_llmobs_submit_evaluation +): + """Test that evaluation is submitted for a valid llm span where the last message content is the question""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _llm_span_with_expected_ragas_inputs_in_messages() + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", }, + ) + ] + ) + + +@pytest.mark.vcr_logs +def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, LLMObs, mock_llmobs_submit_evaluation): + """Test that evaluation is submitted for a valid llm span whose prompt variables use custom context and query keys""" + rf_evaluator = RagasFaithfulnessEvaluator(LLMObs) + llm_span = _expected_llmobs_llm_span_event( + Span("dummy"), + prompt={ + "variables": { + "user_input": "Is france part of europe?", + "context_1": "hello, ", + "context_2": "france is ", + "context_3": "part of europe", }, + "_dd_context_variable_keys": ["context_1", "context_2", "context_3"], + "_dd_query_variable_keys": ["user_input"], }, + output_messages=[{"content": "France is indeed part of europe"}], + ) + rf_evaluator.run_and_submit_evaluation(llm_span) + rf_evaluator.llmobs_service.submit_evaluation.assert_has_calls( + [ + mock.call( + span_context={ + "span_id": llm_span.get("span_id"), + "trace_id": llm_span.get("trace_id"), + }, + label=RagasFaithfulnessEvaluator.LABEL, + metric_type=RagasFaithfulnessEvaluator.METRIC_TYPE, + value=1.0, + metadata={ + "_dd.evaluation_span": {"span_id": mock.ANY, "trace_id": mock.ANY}, + "_dd.faithfulness_disagreements": mock.ANY, + "_dd.evaluation_kind": "faithfulness", + }, + ) + ] + ) + + @pytest.mark.vcr_logs - def test_ragas_faithfulness_emits_traces(self, ragas, llmobs, llmobs_events): - """Faithfulness evaluation should emit exactly 7 ragas spans: the root evaluation span plus spans for input extraction, statement creation (and its prompt), verdict creation (and its prompt), and score computation.""" - rf_evaluator = RagasFaithfulnessEvaluator(llmobs) - rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) - ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] - ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) - assert len(ragas_spans) == 7 - # check name, io, span kinds match - assert ragas_spans == _expected_ragas_spans() - - # verify the trace structure - root_span = ragas_spans[0] - root_span_id = root_span["span_id"] - assert root_span["parent_id"] == "undefined" - assert root_span["meta"] is not None - assert root_span["meta"]["metadata"] is not None - assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) - assert isinstance(root_span["meta"]["metadata"]["statements"], list) - root_span_trace_id = root_span["trace_id"] - for child_span in ragas_spans[1:]: - assert child_span["trace_id"] == root_span_trace_id - - assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) - assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) - assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) - assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) - assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) - assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) - - def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( - self, mock_writer_logs, run_python_code_in_subprocess - ): - env = os.environ.copy() - pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] - if "PYTHONPATH" in env: - pypath.append(env["PYTHONPATH"]) - env.update( - { - "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), - "DD_SITE": "datad0g.com", - "PYTHONPATH": ":".join(pypath), - "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), - "DD_LLMOBS_ML_APP": "unnamed-ml-app", - "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", - "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", - "DD_LLMOBS_AGENTLESS_ENABLED": "true", - } - ) - out, err, status, pid = run_python_code_in_subprocess( - """ +def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): + """Faithfulness evaluation should emit exactly 7 ragas spans: the root evaluation span plus spans for input extraction, statement creation (and its prompt), verdict creation (and its prompt), and score computation.""" + rf_evaluator = RagasFaithfulnessEvaluator(llmobs) + rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) + ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] + ragas_spans = sorted(ragas_spans, key=lambda d: d["start_ns"]) + assert len(ragas_spans) == 7 + # check name, io, span kinds match + assert ragas_spans == _expected_ragas_spans() + + # verify the trace structure + root_span = ragas_spans[0] + root_span_id = root_span["span_id"] + assert root_span["parent_id"] == 
"undefined" + assert root_span["meta"] is not None + assert root_span["meta"]["metadata"] is not None + assert isinstance(root_span["meta"]["metadata"]["faithfulness_list"], list) + assert isinstance(root_span["meta"]["metadata"]["statements"], list) + root_span_trace_id = root_span["trace_id"] + for child_span in ragas_spans[1:]: + assert child_span["trace_id"] == root_span_trace_id + + assert ragas_spans[1]["parent_id"] == root_span_id # input extraction (task) + assert ragas_spans[2]["parent_id"] == root_span_id # create statements (workflow) + assert ragas_spans[4]["parent_id"] == root_span_id # create verdicts (workflow) + assert ragas_spans[6]["parent_id"] == root_span_id # create score (task) + assert ragas_spans[3]["parent_id"] == ragas_spans[2]["span_id"] # create statements prompt (task) + assert ragas_spans[5]["parent_id"] == ragas_spans[4]["span_id"] # create verdicts prompt (task) + + +def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_logs, run_python_code_in_subprocess): + env = os.environ.copy() + pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] + if "PYTHONPATH" in env: + pypath.append(env["PYTHONPATH"]) + env.update( + { + "DD_API_KEY": os.getenv("DD_API_KEY", "dummy-api-key"), + "DD_SITE": "datad0g.com", + "PYTHONPATH": ":".join(pypath), + "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"), + "DD_LLMOBS_ML_APP": "unnamed-ml-app", + "_DD_LLMOBS_EVALUATOR_INTERVAL": "5", + "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness", + "DD_LLMOBS_AGENTLESS_ENABLED": "true", + } + ) + out, err, status, pid = run_python_code_in_subprocess( + """ import os import time import atexit @@ -230,7 +229,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( from tests.llmobs._utils import logs_vcr ctx = logs_vcr.use_cassette( - "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.TestRagasFaithfulnessEvaluator.emits_traces_and_evaluations_on_exit.yaml" + "tests.llmobs.test_llmobs_ragas_faithfulness_evaluator.emits_traces_and_evaluations_on_exit.yaml" ) ctx.__enter__() atexit.register(lambda: ctx.__exit__()) @@ -243,9 +242,9 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit( ): LLMObs.enable() LLMObs._instance._evaluator_runner.enqueue(_llm_span_with_expected_ragas_inputs_in_messages(), None) - """, - env=env, - ) - assert status == 0, err - assert out == b"" - assert err == b"" +""", + env=env, + ) + assert status == 0, err + assert out == b"" + assert err == b"" diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index f550bf639ac..2e1d5e6035f 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1746,8 +1746,8 @@ async def test_annotation_context_async_nested(llmobs): assert span._get_ctx_item(TAGS) == {"foo": "baz", "boo": "bar"} -@pytest.mark.skipif(not RAGAS_AVAILABLE, reason="Test requires ragas to be available on user env") def test_service_enable_starts_evaluator_runner_when_evaluators_exist(): + pytest.importorskip("ragas") with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")): with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")): dummy_tracer = DummyTracer() From 2f40a2dd2e569ccdf731256391b2e53de3227e8b Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Tue, 7 Jan 2025 16:31:21 -0500 Subject: [PATCH 11/12] remove draft comment --- tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py index 7309d911b31..ec8e181e527 100644 --- a/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py +++ b/tests/llmobs/test_llmobs_ragas_faithfulness_evaluator.py @@ -171,7 +171,6 @@ def test_ragas_faithfulness_submits_evaluation_on_span_with_custom_keys(ragas, L @pytest.mark.vcr_logs def test_ragas_faithfulness_emits_traces(ragas, llmobs, llmobs_events): - """Why are we asserting only 7 spans caught?""" rf_evaluator = RagasFaithfulnessEvaluator(llmobs) rf_evaluator.evaluate(_llm_span_with_expected_ragas_inputs_in_prompt()) ragas_spans = [event for event in llmobs_events if event["name"].startswith("dd-ragas.")] From 047e1475a3b1ad86f836e75e041b5b51532e9f0f Mon Sep 17 00:00:00 2001 From: Yun Kim Date: Wed, 8 Jan 2025 17:55:06 -0500 Subject: [PATCH 12/12] Cleanup more tests that use unnecessary fixtures/mocks --- tests/llmobs/conftest.py | 23 ++----------------- tests/llmobs/test_llmobs_service.py | 8 +++---- tests/llmobs/test_llmobs_span_agent_writer.py | 3 ++- .../test_llmobs_span_agentless_writer.py | 15 +++++------- 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/tests/llmobs/conftest.py b/tests/llmobs/conftest.py index 3cd9d6055c7..15cffe5faa9 100644 --- a/tests/llmobs/conftest.py +++ b/tests/llmobs/conftest.py @@ -41,16 +41,6 @@ def mock_llmobs_span_writer(): patcher.stop() -@pytest.fixture -def mock_llmobs_span_agentless_writer(): - patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsSpanWriter") - LLMObsSpanWriterMock = patcher.start() - m = mock.MagicMock() - LLMObsSpanWriterMock.return_value = m - yield m - patcher.stop() - - @pytest.fixture def mock_llmobs_eval_metric_writer(): patcher = mock.patch("ddtrace.llmobs._llmobs.LLMObsEvalMetricWriter") @@ -85,10 +75,7 @@ def mock_llmobs_submit_evaluation(): def mock_http_writer_send_payload_response(): with mock.patch( "ddtrace.internal.writer.HTTPWriter._send_payload", - return_value=Response( - status=200, - body="{}", - ), + return_value=Response(status=200, body="{}"), ): yield @@ -123,12 +110,6 @@ def mock_evaluator_sampler_logs(): yield m -@pytest.fixture -def mock_http_writer_logs(): - with mock.patch("ddtrace.internal.writer.writer.log") as m: - yield m - - @pytest.fixture def mock_llmobs_logs(): with mock.patch("ddtrace.llmobs._llmobs.log") as m: @@ -161,7 +142,7 @@ def LLMObs( @pytest.fixture def AgentlessLLMObs( - mock_llmobs_span_agentless_writer, + mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, ddtrace_global_config, diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 2e1d5e6035f..2ba5754019f 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -1189,10 +1189,10 @@ def test_submit_evaluation_with_numerical_metric_enqueues_writer_with_score_metr def test_flush_calls_periodic_agentless( - AgentlessLLMObs, mock_llmobs_span_agentless_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner + AgentlessLLMObs, mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner ): AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_called_once() + mock_llmobs_span_writer.periodic.assert_called_once() mock_llmobs_eval_metric_writer.periodic.assert_called_once() mock_llmobs_evaluator_runner.periodic.assert_called_once() @@ -1216,14 +1216,14 @@ def test_flush_does_not_call_periodic_when_llmobs_is_disabled( def 
test_flush_does_not_call_periodic_when_llmobs_is_disabled_agentless( AgentlessLLMObs, - mock_llmobs_span_agentless_writer, + mock_llmobs_span_writer, mock_llmobs_eval_metric_writer, mock_llmobs_evaluator_runner, mock_llmobs_logs, disabled_llmobs, ): AgentlessLLMObs.flush() - mock_llmobs_span_agentless_writer.periodic.assert_not_called() + mock_llmobs_span_writer.periodic.assert_not_called() mock_llmobs_eval_metric_writer.periodic.assert_not_called() mock_llmobs_evaluator_runner.periodic.assert_not_called() mock_llmobs_logs.warning.assert_has_calls( diff --git a/tests/llmobs/test_llmobs_span_agent_writer.py b/tests/llmobs/test_llmobs_span_agent_writer.py index 76fe0f21aef..d16bb9f0e2c 100644 --- a/tests/llmobs/test_llmobs_span_agent_writer.py +++ b/tests/llmobs/test_llmobs_span_agent_writer.py @@ -44,7 +44,8 @@ def test_flush_queue_when_event_cause_queue_to_exceed_payload_limit( [ mock.call("flushing queue because queuing next event will exceed EVP payload limit"), mock.call("encode %d LLMObs span events to be sent", 5), - ] + ], + any_order=True, ) diff --git a/tests/llmobs/test_llmobs_span_agentless_writer.py b/tests/llmobs/test_llmobs_span_agentless_writer.py index 4882f3553d8..4a54faf130d 100644 --- a/tests/llmobs/test_llmobs_span_agentless_writer.py +++ b/tests/llmobs/test_llmobs_span_agentless_writer.py @@ -75,26 +75,25 @@ def test_truncating_oversized_events(mock_writer_logs, mock_http_writer_send_pay ) -def test_send_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_chat_completion_event(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) llmobs_span_writer.start() llmobs_span_writer.enqueue(_chat_completion_event()) llmobs_span_writer.periodic() mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() +@mock.patch("ddtrace.internal.writer.writer.log") def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put_response_forbidden): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=1, timeout=1) @@ -109,7 +108,7 @@ def test_send_completion_bad_api_key(mock_http_writer_logs, mock_http_writer_put ) -def test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_timed_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -122,10 +121,9 @@ def 
test_send_timed_events(mock_writer_logs, mock_http_writer_logs, mock_http_wr llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 1)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http_writer_send_payload_response): +def test_send_multiple_events(mock_writer_logs, mock_http_writer_send_payload_response): with override_global_config(dict(_dd_site=DATADOG_SITE, _dd_api_key="foobar.baz")): llmobs_span_writer = LLMObsSpanWriter(is_agentless=True, interval=0.01, timeout=1) llmobs_span_writer.start() @@ -135,10 +133,9 @@ def test_send_multiple_events(mock_writer_logs, mock_http_writer_logs, mock_http llmobs_span_writer.enqueue(_chat_completion_event()) time.sleep(0.1) mock_writer_logs.debug.assert_has_calls([mock.call("encode %d LLMObs span events to be sent", 2)]) - mock_http_writer_logs.error.assert_not_called() -def test_send_on_exit(mock_writer_logs, run_python_code_in_subprocess): +def test_send_on_exit(run_python_code_in_subprocess): env = os.environ.copy() pypath = [os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))] if "PYTHONPATH" in env: