
Commit e806495

feat(llmobs): add summary evaluator functionality for datasets & experiments (#14629)
- changes the `ExperimentResult` class to hold summary eval results alongside the per-row experiment results
- adds the ability to define and run summary evaluators

Example: given this script

```python
#!/usr/bin/env python
# coding: utf-8
import os
import math
import random

from dotenv import load_dotenv

# Load environment variables from the .env file.
load_dotenv(override=True)

from ddtrace.llmobs import LLMObs

LLMObs.enable(
    api_key=os.getenv("DD_API_KEY"),
    app_key=os.getenv("DD_APPLICATION_KEY"),
    project_name="Onboarding",
    ml_app="Onboarding-ML-App",
)

dataset = LLMObs.create_dataset_from_csv(
    csv_path="./data/taskmaster.csv",
    dataset_name="taskmaster-mini-314",
    input_data_columns=["prompt", "topics"],
    expected_output_columns=["labels"],
)
dataset.as_dataframe()


def return_hello(input_data, config):
    return "hello"


def return_2(input_data, output_data, expected_output):
    return 2


def sum_of_rows_times_2_and_hellos(inputs, outputs, expected_outputs, evaluators_results):
    return sum(evaluators_results["return_2"]) + len(outputs)


experiment = LLMObs.experiment(
    name="taskmaster-experiment",
    dataset=dataset,
    task=return_hello,
    evaluators=[return_2],
    summary_evaluators=[sum_of_rows_times_2_and_hellos],
)
results = experiment.run(jobs=50, raise_errors=True)
print(experiment.url)
```

the run shows up at https://dddev.datadoghq.com/llm/experiments/889df25f-95b8-4149-9ba0-756c1afcc5f6, where the summary evaluator reports 3 × the number of records (2 per row from `return_2` plus the row count) — the correct result.

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
1 parent 8fbf836 commit e806495

8 files changed (+504 −31 lines)

ddtrace/llmobs/_experiment.py

Lines changed: 110 additions & 20 deletions

```diff
@@ -82,7 +82,7 @@ class EvaluationResult(TypedDict):
     evaluations: Dict[str, Dict[str, JSONType]]


-class ExperimentResult(TypedDict):
+class ExperimentRowResult(TypedDict):
     idx: int
     record_id: Optional[str]
     span_id: str
@@ -96,6 +96,11 @@ class ExperimentResult(TypedDict):
     error: Dict[str, Optional[str]]


+class ExperimentResult(TypedDict):
+    summary_evaluations: Dict[str, Dict[str, JSONType]]
+    rows: List[ExperimentRowResult]
+
+
 class Dataset:
     name: str
     description: str
@@ -304,11 +309,19 @@ def __init__(
         tags: Optional[Dict[str, str]] = None,
         config: Optional[ExperimentConfigType] = None,
         _llmobs_instance: Optional["LLMObs"] = None,
+        summary_evaluators: Optional[
+            List[
+                Callable[
+                    [List[DatasetRecordInputType], List[JSONType], List[JSONType], Dict[str, List[JSONType]]], JSONType
+                ]
+            ]
+        ] = None,
     ) -> None:
         self.name = name
         self._task = task
         self._dataset = dataset
         self._evaluators = evaluators
+        self._summary_evaluators = summary_evaluators or []
         self._description = description
         self._tags: Dict[str, str] = tags or {}
         self._tags["ddtrace.version"] = str(ddtrace.__version__)
@@ -327,21 +340,12 @@ def __init__(
         self._id: Optional[str] = None
         self._run_name: Optional[str] = None

-    def run(
-        self, jobs: int = 1, raise_errors: bool = False, sample_size: Optional[int] = None
-    ) -> List[ExperimentResult]:
-        if not self._llmobs_instance:
+    def run(self, jobs: int = 1, raise_errors: bool = False, sample_size: Optional[int] = None) -> ExperimentResult:
+        if not self._llmobs_instance or not self._llmobs_instance.enabled:
             raise ValueError(
                 "LLMObs is not enabled. Ensure LLM Observability is enabled via `LLMObs.enable(...)` "
                 "and create the experiment via `LLMObs.experiment(...)` before running the experiment."
             )
-        if not self._llmobs_instance.enabled:
-            logger.warning(
-                "Skipping experiment as LLMObs is not enabled. "
-                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` "
-                "or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application."
-            )
-            return []

         project = self._llmobs_instance._dne_client.project_create_or_get(self._project_name)
         self._project_id = project.get("_id", "")
@@ -360,11 +364,13 @@ def run(
         self._run_name = experiment_run_name
         task_results = self._run_task(jobs, raise_errors, sample_size)
         evaluations = self._run_evaluators(task_results, raise_errors=raise_errors)
-        experiment_results = self._merge_results(task_results, evaluations)
+        summary_evals = self._run_summary_evaluators(task_results, evaluations, raise_errors)
+        experiment_results = self._merge_results(task_results, evaluations, summary_evals)
         experiment_evals = self._generate_metrics_from_exp_results(experiment_results)
         self._llmobs_instance._dne_client.experiment_eval_post(
             self._id, experiment_evals, convert_tags_dict_to_list(self._tags)
         )
+
         return experiment_results

     @property
@@ -476,17 +482,64 @@ def _run_evaluators(self, task_results: List[TaskResult], raise_errors: bool = F
             evaluations.append(evaluation)
         return evaluations

+    def _run_summary_evaluators(
+        self, task_results: List[TaskResult], eval_results: List[EvaluationResult], raise_errors: bool = False
+    ) -> List[EvaluationResult]:
+        evaluations: List[EvaluationResult] = []
+        inputs: List[DatasetRecordInputType] = []
+        outputs: List[JSONType] = []
+        expected_outputs: List[JSONType] = []
+        evals_dict = {}
+
+        # name of evaluator (not summary evaluator) -> list of eval results ordered by index of the list of task results
+        # this is being computed so that the user can use the evaluation results in its original form
+        eval_results_by_name: dict[str, List[JSONType]] = {}
+        for idx, task_result in enumerate(task_results):
+            outputs.append(task_result["output"])
+            record: DatasetRecord = self._dataset[idx]
+            inputs.append(record["input_data"])
+            expected_outputs.append(record["expected_output"])
+
+            eval_result_at_idx_by_name = eval_results[idx]["evaluations"]
+            for name, eval_value in eval_result_at_idx_by_name.items():
+                if name not in eval_results_by_name:
+                    eval_results_by_name[name] = []
+
+                eval_results_by_name[name].append(eval_value.get("value"))
+
+        for idx, summary_evaluator in enumerate(self._summary_evaluators):
+            eval_result: JSONType = None
+            eval_err: JSONType = None
+
+            try:
+                eval_result = summary_evaluator(inputs, outputs, expected_outputs, eval_results_by_name)
+            except Exception as e:
+                exc_type, exc_value, exc_tb = sys.exc_info()
+                exc_type_name = type(e).__name__ if exc_type is not None else "Unknown Exception"
+                exc_stack = "".join(traceback.format_exception(exc_type, exc_value, exc_tb))
+                eval_err = {"message": str(exc_value), "type": exc_type_name, "stack": exc_stack}
+                if raise_errors:
+                    raise RuntimeError(f"Summary evaluator {summary_evaluator.__name__} failed") from e
+            evals_dict[summary_evaluator.__name__] = {"value": eval_result, "error": eval_err}
+            evaluation: EvaluationResult = {"idx": idx, "evaluations": evals_dict}
+            evaluations.append(evaluation)
+
+        return evaluations
+
     def _merge_results(
-        self, task_results: List[TaskResult], evaluations: List[EvaluationResult]
-    ) -> List[ExperimentResult]:
+        self,
+        task_results: List[TaskResult],
+        evaluations: List[EvaluationResult],
+        summary_evaluations: Optional[List[EvaluationResult]],
+    ) -> ExperimentResult:
         experiment_results = []
         for idx, task_result in enumerate(task_results):
             output_data = task_result["output"]
             metadata: Dict[str, JSONType] = {"tags": cast(List[JSONType], convert_tags_dict_to_list(self._tags))}
             metadata.update(task_result.get("metadata") or {})
             record: DatasetRecord = self._dataset[idx]
             evals = evaluations[idx]["evaluations"]
-            exp_result: ExperimentResult = {
+            exp_result: ExperimentRowResult = {
                 "idx": idx,
                 "span_id": task_result.get("span_id", ""),
                 "trace_id": task_result.get("trace_id", ""),
@@ -500,10 +553,28 @@ def _merge_results(
                 "error": task_result["error"],
             }
             experiment_results.append(exp_result)
-        return experiment_results
+
+        summary_evals: Dict[str, Dict[str, JSONType]] = {}
+        if summary_evaluations:
+            for summary_evaluation in summary_evaluations:
+                for name, eval_data in summary_evaluation["evaluations"].items():
+                    summary_evals[name] = eval_data
+
+        result: ExperimentResult = {
+            "summary_evaluations": summary_evals,
+            "rows": experiment_results,
+        }
+        return result

     def _generate_metric_from_evaluation(
-        self, eval_name: str, eval_value: JSONType, err: JSONType, span_id: str, trace_id: str, timestamp_ns: int
+        self,
+        eval_name: str,
+        eval_value: JSONType,
+        err: JSONType,
+        span_id: str,
+        trace_id: str,
+        timestamp_ns: int,
+        source: str = "custom",
     ) -> "LLMObsExperimentEvalMetricEvent":
         metric_type = None
         if eval_value is None:
@@ -516,6 +587,7 @@ def _generate_metric_from_evaluation(
             metric_type = "categorical"
             eval_value = str(eval_value).lower()
         return {
+            "metric_source": source,
             "span_id": span_id,
             "trace_id": trace_id,
             "timestamp_ms": int(timestamp_ns / 1e6),
@@ -528,14 +600,18 @@ def _generate_metric_from_evaluation(
         }

     def _generate_metrics_from_exp_results(
-        self, experiment_results: List[ExperimentResult]
+        self, experiment_result: ExperimentResult
     ) -> List["LLMObsExperimentEvalMetricEvent"]:
         eval_metrics = []
-        for exp_result in experiment_results:
+        latest_timestamp: int = 0
+        for exp_result in experiment_result["rows"]:
             evaluations = exp_result.get("evaluations") or {}
             span_id = exp_result.get("span_id", "")
             trace_id = exp_result.get("trace_id", "")
             timestamp_ns = cast(int, exp_result.get("timestamp", 0))
+            if timestamp_ns > latest_timestamp:
+                latest_timestamp = timestamp_ns
+
             for eval_name, eval_data in evaluations.items():
                 if not eval_data:
                     continue
@@ -544,6 +620,20 @@ def _generate_metrics_from_exp_results(
                     eval_name, eval_value, eval_data.get("error"), span_id, trace_id, timestamp_ns
                 )
                 eval_metrics.append(eval_metric)
+
+        for name, summary_eval_data in experiment_result.get("summary_evaluations", {}).items():
+            if not summary_eval_data:
+                continue
+            eval_metric = self._generate_metric_from_evaluation(
+                name,
+                summary_eval_data.get("value"),
+                summary_eval_data.get("error"),
+                "",
+                "",
+                latest_timestamp,
+                source="summary",
+            )
+            eval_metrics.append(eval_metric)
         return eval_metrics
```
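For orientation, here is a minimal sketch of how the new pieces fit together from a caller's point of view: per-row evaluators feed `evaluators_results`, the summary evaluator runs once over the whole run, and `run()` now returns the new `ExperimentResult` mapping instead of a list. The evaluator names, dataset, and task below are hypothetical illustrations, not part of this diff.

```python
from typing import List


def exact_match(input_data, output_data, expected_output):
    # Per-row evaluator: called once per dataset record.
    return output_data == expected_output


def accuracy(inputs, outputs, expected_outputs, evaluators_results):
    # Summary evaluator: called once per experiment, after all rows have run.
    # `evaluators_results` maps each per-row evaluator's name to its values,
    # ordered by row index (built in _run_summary_evaluators above).
    matches: List[bool] = evaluators_results["exact_match"]
    return sum(matches) / len(matches) if matches else 0.0


# experiment = LLMObs.experiment(
#     name="my-experiment",            # hypothetical names
#     dataset=dataset,
#     task=my_task,
#     evaluators=[exact_match],
#     summary_evaluators=[accuracy],
# )
# result = experiment.run()            # now returns ExperimentResult, not a list
# result["rows"]                       # List[ExperimentRowResult], one per record
# result["summary_evaluations"]        # {"accuracy": {"value": ..., "error": None}}
```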

ddtrace/llmobs/_llmobs.py

Lines changed: 23 additions & 3 deletions

```diff
@@ -752,6 +752,13 @@ def experiment(
         description: str = "",
         tags: Optional[Dict[str, str]] = None,
         config: Optional[ExperimentConfigType] = None,
+        summary_evaluators: Optional[
+            List[
+                Callable[
+                    [List[DatasetRecordInputType], List[JSONType], List[JSONType], Dict[str, List[JSONType]]], JSONType
+                ]
+            ]
+        ] = None,
     ) -> Experiment:
         """Initializes an Experiment to run a task on a Dataset and evaluators.

@@ -777,9 +784,21 @@ def experiment(
         for evaluator in evaluators:
             sig = inspect.signature(evaluator)
             params = sig.parameters
-            required_params = ("input_data", "output_data", "expected_output")
-            if not all(param in params for param in required_params):
-                raise TypeError("Evaluator function must have parameters {}.".format(required_params))
+            evaluator_required_params = ("input_data", "output_data", "expected_output")
+            if not all(param in params for param in evaluator_required_params):
+                raise TypeError("Evaluator function must have parameters {}.".format(evaluator_required_params))
+
+        if summary_evaluators and not all(callable(summary_evaluator) for summary_evaluator in summary_evaluators):
+            raise TypeError("Summary evaluators must be a list of callable functions.")
+        if summary_evaluators:
+            for summary_evaluator in summary_evaluators:
+                sig = inspect.signature(summary_evaluator)
+                params = sig.parameters
+                summary_evaluator_required_params = ("inputs", "outputs", "expected_outputs", "evaluators_results")
+                if not all(param in params for param in summary_evaluator_required_params):
+                    raise TypeError(
+                        "Summary evaluator function must have parameters {}.".format(summary_evaluator_required_params)
+                    )
         return Experiment(
             name,
             task,
@@ -790,6 +809,7 @@ def experiment(
             description=description,
             config=config,
             _llmobs_instance=cls._instance,
+            summary_evaluators=summary_evaluators,
         )

     @classmethod
```
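Worth noting about the validation above: summary evaluators are checked by parameter name via `inspect.signature`, so the callable must declare `inputs`, `outputs`, `expected_outputs`, and `evaluators_results` exactly. A small sketch (function names are hypothetical):

```python
# Accepted: parameter names match the required ones exactly.
def pass_rate(inputs, outputs, expected_outputs, evaluators_results):
    return 1.0


# Rejected at LLMObs.experiment(...) time: same arity, wrong names, so a
# TypeError("Summary evaluator function must have parameters ...") is raised.
def pass_rate_wrong_names(rows_in, rows_out, expected, per_row_evals):
    return 1.0
```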

ddtrace/llmobs/_writer.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -87,6 +87,7 @@ class LLMObsEvaluationMetricEvent(TypedDict, total=False):


 class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
+    metric_source: str
     span_id: str
     trace_id: str
     timestamp_ms: int
```
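The new `metric_source` field distinguishes per-row metrics (`"custom"`, the default) from experiment-level summary metrics (`"summary"`). Based on the `_generate_metric_from_evaluation` change and the recorded requests below, a summary metric event would look roughly like this (values are illustrative, mirroring the test fixture):

```python
# Illustrative summary-metric payload; mirrors the dummy_summary_evaluator entry in
# the cassette below rather than real data.
summary_metric = {
    "metric_source": "summary",  # new field; per-row metrics default to "custom"
    "span_id": "",               # summary metrics are experiment-level, so no span ID
    "trace_id": "",              # ...and no trace ID
    "timestamp_ms": 1234,        # the latest per-row timestamp is reused
    "metric_type": "score",
    "label": "dummy_summary_evaluator",
    "score_value": 4,
    "error": None,
    "tags": ["ddtrace.version:1.2.3", "experiment_id:9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"],
}
```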
Lines changed: 49 additions & 0 deletions

```yaml
interactions:
- request:
    body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
      "metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
      "timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
      0, "error": null, "tags": ["ddtrace.version:1.2.3", "experiment_id:9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"],
      "experiment_id": "9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"}], "tags": ["ddtrace.version:1.2.3",
      "experiment_id:9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"]}}}'
    headers:
      Accept:
      - '*/*'
      ? !!python/object/apply:multidict._multidict.istr
        - Accept-Encoding
      : - identity
      Connection:
      - keep-alive
      Content-Length:
      - '494'
      ? !!python/object/apply:multidict._multidict.istr
        - Content-Type
      : - application/json
      User-Agent:
      - python-requests/2.32.3
    method: POST
    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8/events
  response:
    body:
      string: ''
    headers:
      content-length:
      - '0'
      content-security-policy:
      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
      content-type:
      - application/vnd.api+json
      date:
      - Thu, 18 Sep 2025 15:03:41 GMT
      strict-transport-security:
      - max-age=31536000; includeSubDomains; preload
      vary:
      - Accept-Encoding
      x-content-type-options:
      - nosniff
      x-frame-options:
      - SAMEORIGIN
    status:
      code: 202
      message: Accepted
version: 1
```
Lines changed: 53 additions & 0 deletions

```yaml
interactions:
- request:
    body: '{"data": {"type": "experiments", "attributes": {"scope": "experiments",
      "metrics": [{"metric_source": "custom", "span_id": "123", "trace_id": "456",
      "timestamp_ms": 1234, "metric_type": "score", "label": "dummy_evaluator", "score_value":
      0, "error": null, "tags": ["ddtrace.version:1.2.3", "experiment_id:9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"],
      "experiment_id": "9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"}, {"metric_source":
      "summary", "span_id": "", "trace_id": "", "timestamp_ms": 1234, "metric_type":
      "score", "label": "dummy_summary_evaluator", "score_value": 4, "error": null,
      "tags": ["ddtrace.version:1.2.3", "experiment_id:9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"],
      "experiment_id": "9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"}], "tags": ["ddtrace.version:1.2.3",
      "experiment_id:9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8"]}}}'
    headers:
      Accept:
      - '*/*'
      ? !!python/object/apply:multidict._multidict.istr
        - Accept-Encoding
      : - identity
      Connection:
      - keep-alive
      Content-Length:
      - '816'
      ? !!python/object/apply:multidict._multidict.istr
        - Content-Type
      : - application/json
      User-Agent:
      - python-requests/2.32.3
    method: POST
    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/experiments/9e046fc7-cf3f-4f01-b5ed-e5e7746fefa8/events
  response:
    body:
      string: ''
    headers:
      content-length:
      - '0'
      content-security-policy:
      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
      content-type:
      - application/vnd.api+json
      date:
      - Thu, 18 Sep 2025 21:00:41 GMT
      strict-transport-security:
      - max-age=31536000; includeSubDomains; preload
      vary:
      - Accept-Encoding
      x-content-type-options:
      - nosniff
      x-frame-options:
      - SAMEORIGIN
    status:
      code: 202
      message: Accepted
version: 1
```

0 commit comments
