Commit 1026069

zhanghaotong (Zhang Haotong) and Shunkang authored
[None][feat] Add opentelemetry tracing (#5897)
Signed-off-by: Zhang Haotong <[email protected]>
Signed-off-by: zhanghaotong <[email protected]>
Signed-off-by: Shunkang <[email protected]>
Co-authored-by: Zhang Haotong <[email protected]>
Co-authored-by: Shunkang <[email protected]>
1 parent ce0d761 commit 1026069

File tree

17 files changed: +591 -35 lines

examples/opentelemetry/README.md

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
# OpenTelemetry Integration Guide

This guide explains how to set up OpenTelemetry tracing in TensorRT-LLM to monitor and debug your LLM inference services.

## Install OpenTelemetry

Install the required OpenTelemetry packages:

```bash
pip install \
  'opentelemetry-sdk' \
  'opentelemetry-api' \
  'opentelemetry-exporter-otlp' \
  'opentelemetry-semantic-conventions-ai'
```

## Start Jaeger

You can start Jaeger with Docker:

```bash
docker run --rm --name jaeger \
  -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
  -p 6831:6831/udp \
  -p 6832:6832/udp \
  -p 5778:5778 \
  -p 16686:16686 \
  -p 4317:4317 \
  -p 4318:4318 \
  -p 14250:14250 \
  -p 14268:14268 \
  -p 14269:14269 \
  -p 9411:9411 \
  jaegertracing/all-in-one:1.57.0
```

Or run the jaeger-all-in-one(.exe) executable from [the binary distribution archives](https://www.jaegertracing.io/download/):

```bash
jaeger-all-in-one --collector.zipkin.host-port=:9411
```

## Set up environment variables and run TensorRT-LLM

Set up the environment variables:

```bash
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=grpc
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="trt-server"
```

Then run TensorRT-LLM with OpenTelemetry, and make sure to set `return_perf_metrics` to true in the model configuration:

```bash
trtllm-serve models/Qwen3-8B/ --otlp_traces_endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```

## Send requests and find traces in Jaeger

You can send a request to the server and view the traces in the [Jaeger UI](http://localhost:16686/).
The traces should be visible under the service name "trt-server".
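
For example, a completion request like the following should produce a trace. This is a minimal sketch that assumes the server listens on its default port 8000 and that the `model` field matches the path passed to `trtllm-serve`; adjust both to your setup:

```bash
# Hypothetical example request; host, port and model name are assumptions.
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "models/Qwen3-8B/",
        "prompt": "Hello, my name is",
        "max_tokens": 16
      }'
```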

## Configuration for Disaggregated Serving

For disaggregated serving scenarios, the configuration for the context (ctx) and generation (gen) servers remains the same as for a standalone server. For the proxy, you can configure it as follows:

```yaml
# disagg_config.yaml
hostname: 127.0.0.1
port: 8000
backend: pytorch
context_servers:
  num_instances: 1
  urls:
    - "127.0.0.1:8001"
generation_servers:
  num_instances: 1
  urls:
    - "127.0.0.1:8002"
otlp_config:
  otlp_traces_endpoint: "grpc://0.0.0.0:4317"
```

tensorrt_llm/_utils.py

Lines changed: 17 additions & 1 deletion
@@ -28,13 +28,14 @@
 from enum import EnumMeta
 from functools import lru_cache, partial, wraps
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import Any, Callable, Dict, List, Optional, Sequence, Union

 import numpy as np
 import nvtx
 from mpi4py import MPI
 from mpi4py.util import pkl5
 from packaging import version
+from typing_extensions import ParamSpec

 # isort: off
 import torch
@@ -1155,6 +1156,21 @@ def set_prometheus_multiproc_dir() -> object:
         f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}")


+P = ParamSpec("P")
+
+
+# From: https://stackoverflow.com/a/4104188/2749989
+def run_once(f: Callable[P, None]) -> Callable[P, None]:
+
+    def wrapper(*args: P.args, **kwargs: P.kwargs) -> None:
+        if not wrapper.has_run:  # type: ignore[attr-defined]
+            wrapper.has_run = True  # type: ignore[attr-defined]
+            return f(*args, **kwargs)
+
+    wrapper.has_run = False  # type: ignore[attr-defined]
+    return wrapper
+
+
 TORCH_PYBIND11_ABI = None

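For reference, a small usage sketch of the new `run_once` helper; the decorated function here is a made-up example, not part of the change:

```python
# Illustrative sketch: `warn_once` is a hypothetical function used only to
# demonstrate the decorator added in tensorrt_llm/_utils.py.
from tensorrt_llm._utils import run_once


@run_once
def warn_once() -> None:
    print("This warning is emitted only once.")


warn_once()  # prints the warning
warn_once()  # no-op: the wrapped function runs at most once per process
```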
tensorrt_llm/commands/serve.py

Lines changed: 9 additions & 2 deletions
@@ -91,6 +91,7 @@ def get_llm_args(model: str,
                  trust_remote_code: bool = False,
                  reasoning_parser: Optional[str] = None,
                  fail_fast_on_attention_window_too_large: bool = False,
+                 otlp_traces_endpoint: Optional[str] = None,
                  enable_chunked_prefill: bool = False,
                  **llm_args_extra_dict: Any):

@@ -134,6 +135,7 @@ def get_llm_args(model: str,
         "reasoning_parser": reasoning_parser,
         "fail_fast_on_attention_window_too_large":
         fail_fast_on_attention_window_too_large,
+        "otlp_traces_endpoint": otlp_traces_endpoint,
         "enable_chunked_prefill": enable_chunked_prefill,
     }

@@ -322,6 +324,10 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
     help=
     "Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache."
 )
+@click.option("--otlp_traces_endpoint",
+              type=str,
+              default=None,
+              help="Target URL to which OpenTelemetry traces will be sent.")
 @click.option("--disagg_cluster_uri",
               type=str,
               default=None,
@@ -344,8 +350,8 @@ def serve(
         extra_llm_api_options: Optional[str], reasoning_parser: Optional[str],
         metadata_server_config_file: Optional[str], server_role: Optional[str],
         fail_fast_on_attention_window_too_large: bool,
-        enable_chunked_prefill: bool, disagg_cluster_uri: Optional[str],
-        media_io_kwargs: Optional[str]):
+        otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
+        disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str]):
     """Running an OpenAI API compatible server

     MODEL: model name | HF checkpoint path | TensorRT engine path
@@ -371,6 +377,7 @@ def serve(
         reasoning_parser=reasoning_parser,
         fail_fast_on_attention_window_too_large=
         fail_fast_on_attention_window_too_large,
+        otlp_traces_endpoint=otlp_traces_endpoint,
         enable_chunked_prefill=enable_chunked_prefill)

     llm_args_extra_dict = {}

tensorrt_llm/executor/base_worker.py

Lines changed: 9 additions & 1 deletion
@@ -886,7 +886,15 @@ def _get_metrics_dict(
             req_perf_metrics.timing_metrics.first_scheduled_time.
             total_seconds(),
             RequestEventTiming.LAST_TOKEN_TIME:
-            req_perf_metrics.timing_metrics.last_token_time.total_seconds()
+            req_perf_metrics.timing_metrics.last_token_time.total_seconds(),
+            RequestEventTiming.KV_CACHE_TRANSFER_START:
+            req_perf_metrics.timing_metrics.kv_cache_transfer_start.
+            total_seconds(),
+            RequestEventTiming.KV_CACHE_TRANSFER_END:
+            req_perf_metrics.timing_metrics.kv_cache_transfer_end.
+            total_seconds(),
+            RequestEventTiming.KV_CACHE_SIZE:
+            req_perf_metrics.timing_metrics.kv_cache_size,
         }
         return metrics_dict

tensorrt_llm/executor/executor.py

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,7 @@
 import signal
 import traceback
 from abc import ABC, abstractmethod
+from collections.abc import Mapping
 from pathlib import Path
 from queue import Queue
 from typing import (TYPE_CHECKING, AsyncIterable, Dict, Generator, List,
@@ -123,6 +124,7 @@ def generate_async(
         streaming: bool = False,
         kv_cache_retention_config: Optional[KvCacheRetentionConfig] = None,
         disaggregated_params: Optional[DisaggregatedParams] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
         postproc_params: Optional[PostprocParams] = None,
         multimodal_params: Optional[MultimodalParams] = None,
         scheduling_params: Optional[SchedulingParams] = None,
@@ -150,6 +152,7 @@ def generate_async(
             streaming=streaming,
             kv_cache_retention_config=kv_cache_retention_config,
             disaggregated_params=disaggregated_params,
+            trace_headers=trace_headers,
             multimodal_params=multimodal_params,
             scheduling_params=scheduling_params,
             cache_salt_id=cache_salt_id,

tensorrt_llm/executor/request.py

Lines changed: 3 additions & 0 deletions
@@ -1,4 +1,5 @@
 import os
+from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import List, Optional, Union

@@ -94,6 +95,7 @@ def __init__(
         streaming: bool = False,
         kv_cache_retention_config: Optional[KvCacheRetentionConfig] = None,
         disaggregated_params: Optional[DisaggregatedParams] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
         postproc_params: Optional[PostprocParams] = None,
         multimodal_params: Optional[MultimodalParams] = None,
         scheduling_params: Optional[SchedulingParams] = None,
@@ -123,6 +125,7 @@ def __init__(
         self.kv_cache_retention_config = kv_cache_retention_config
         self.id: Optional[int] = None
         self.disaggregated_params = disaggregated_params
+        self.trace_headers = trace_headers
         self.scheduling_params = scheduling_params
         self.cache_salt_id = cache_salt_id
         self.arrival_time = arrival_time
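
The `trace_headers` mapping added above lets a caller hand W3C trace-context headers to the request so the server-side span created in `do_tracing()` joins the caller's trace. A minimal client-side sketch of how such a mapping can be produced with the standard OpenTelemetry propagator (the tracer name is illustrative, and the wiring of the carrier into `generate_async` is assumed rather than shown by this diff):

```python
from opentelemetry import trace
from opentelemetry.trace.propagation.tracecontext import \
    TraceContextTextMapPropagator

# Start (or reuse) a client-side span and serialize its context into the
# standard W3C "traceparent" header.
tracer = trace.get_tracer("my-client")  # illustrative tracer name
carrier: dict[str, str] = {}
with tracer.start_as_current_span("client_request"):
    TraceContextTextMapPropagator().inject(carrier)

# `carrier` now holds something like {"traceparent": "00-<trace-id>-<span-id>-01"}
# and can be passed as the trace_headers mapping on the generation request.
```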

tensorrt_llm/executor/result.py

Lines changed: 111 additions & 2 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import threading
+import time
 import weakref
 from dataclasses import dataclass, field
 from queue import Empty, Queue
@@ -11,6 +12,8 @@
 import torch
 import torch.nn.functional as F

+from tensorrt_llm.llmapi import tracing
+
 try:
     import ray
 except ModuleNotFoundError:
@@ -268,6 +271,7 @@ def __init__(self,
         self.avg_decoded_tokens_per_iter: Optional[float] = None
         self._done = False
         self.metrics_dict = {}
+        self.trace_headers: Optional[dict[str, str]] = None

         if ray_queue is not None:
             if has_event_loop():
@@ -436,6 +440,7 @@ def _handle_sequence(self,
                 raise ValueError(
                     f"Unknown finish reason: {finish_reasons[src_idx]}")
         self.record_stats(output, req_perf_metrics_dict)
+        self.do_tracing(output, req_perf_metrics_dict)

     @print_traceback_on_error
     @nvtx_range_debug("handle_response",
@@ -472,7 +477,7 @@ def _handle_response(self,
             self._outputs[0].disaggregated_params = disaggregated_params

         if response.metrics:
-            self.metrics_dict = response.metrics
+            self.metrics_dict.update(response.metrics)

         if response.error:
             if self._background_error_handler is not None and (
@@ -570,7 +575,110 @@ def record_stats(self,
                 stats, len(output.token_ids), self.sampling_params.n > 1)
             if processed_metrics_stat:
                 metrics_stats.update(processed_metrics_stat)
-        self.metrics_dict = metrics_stats
+        self.metrics_dict.update(metrics_stats)
+
+    def do_tracing(
+        self,
+        output: CompletionOutput,
+        req_perf_metrics_dict: Optional[dict[str, float]] = None,
+    ) -> None:
+        """Perform distributed tracing for the generation request.
+
+        Args:
+            output (CompletionOutput): The output of the generation result.
+            req_perf_metrics_dict (Optional[dict[str, float]]): Request performance metrics. Defaults to None.
+        """
+        if not tracing.global_otlp_tracer():
+            return
+
+        metrics_dict = self.metrics_dict
+        if not metrics_dict or not req_perf_metrics_dict:
+            # Insufficient request metrics available; trace generation aborted.
+            tracing.insufficient_request_metrics_warning()
+            return
+
+        trace_context = tracing.extract_trace_context(self.trace_headers)
+        sampling_params = self.sampling_params
+
+        # Since arrival_time and other timing metrics are based on different time origins,
+        # we need to apply corrections to align them with absolute timestamps
+        time_correction = time.time() - time.monotonic()
+        arrival_time = req_perf_metrics_dict.get(
+            RequestEventTiming.ARRIVAL_TIME, 0)
+
+        with tracing.global_otlp_tracer().start_as_current_span(
+                "llm_request",
+                kind=tracing.SpanKind.SERVER,
+                context=trace_context,
+                start_time=int((arrival_time + time_correction) * 1e9),
+        ) as span:

+            def safe_set_attr(span, attr, value):
+                if value is not None:
+                    span.set_attribute(attr, value)
+
+            safe_set_attr(span,
+                          tracing.SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
+                          sampling_params.temperature)
+            safe_set_attr(span, tracing.SpanAttributes.GEN_AI_REQUEST_TOP_P,
+                          sampling_params.top_p)
+            safe_set_attr(span, tracing.SpanAttributes.GEN_AI_REQUEST_TOP_K,
+                          sampling_params.top_k)
+            safe_set_attr(
+                span,
+                tracing.SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+                sampling_params.max_tokens,
+            )
+            safe_set_attr(span, tracing.SpanAttributes.GEN_AI_REQUEST_N,
+                          sampling_params.n)
+            safe_set_attr(span, tracing.SpanAttributes.GEN_AI_REQUEST_ID,
+                          self.id)
+            if prompt_token_ids := getattr(self, "prompt_token_ids", None):
+                safe_set_attr(span,
+                              tracing.SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+                              len(prompt_token_ids))
+            safe_set_attr(span,
+                          tracing.SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+                          output.length)
+            safe_set_attr(
+                span, tracing.SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+                metrics_dict.get(MetricNames.TTFT, -1))
+            safe_set_attr(span, tracing.SpanAttributes.GEN_AI_LATENCY_E2E,
+                          metrics_dict.get(MetricNames.E2E, -1))
+            safe_set_attr(span,
+                          tracing.SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+                          metrics_dict.get(MetricNames.REQUEST_QUEUE_TIME, -1))
+            safe_set_attr(
+                span, tracing.SpanAttributes.GEN_AI_RESPONSE_FINISH_REASONS,
+                json.dumps([output.finish_reason])
+                if output.finish_reason else None)
+            safe_set_attr(
+                span,
+                tracing.SpanAttributes.GEN_AI_LATENCY_KV_CACHE_TRANSFER_TIME,
+                req_perf_metrics_dict.get(
+                    RequestEventTiming.KV_CACHE_TRANSFER_END, 0.0) -
+                req_perf_metrics_dict.get(
+                    RequestEventTiming.KV_CACHE_TRANSFER_START, 0.0))
+
+            if req_perf_metrics_dict.get(
+                    RequestEventTiming.KV_CACHE_TRANSFER_START,
+                    0) and req_perf_metrics_dict.get(
+                        RequestEventTiming.KV_CACHE_TRANSFER_END, 0):
+                tracing.add_event(
+                    tracing.SpanEvents.KV_CACHE_TRANSFER_START,
+                    timestamp=int((req_perf_metrics_dict.get(
+                        RequestEventTiming.KV_CACHE_TRANSFER_START, 0.0) +
+                                   time_correction) * 1e9))
+                tracing.add_event(
+                    tracing.SpanEvents.KV_CACHE_TRANSFER_END,
+                    attributes={
+                        "kv_cache_size":
+                        req_perf_metrics_dict.get(
+                            RequestEventTiming.KV_CACHE_SIZE, 0)
+                    },
+                    timestamp=int((req_perf_metrics_dict.get(
                        RequestEventTiming.KV_CACHE_TRANSFER_END, 0.0) +
+                                   time_correction) * 1e9))


 class DetokenizedGenerationResultBase(GenerationResultBase):
@@ -688,6 +796,7 @@ def __init__(
         self.disaggregated_params = disaggregated_params
         # minimal sampling params needed for logprob calculation
         self._logprob_params = logprob_params
+        self.trace_headers = generation_request.trace_headers

         # for aborting the request
         self._executor: Optional[weakref.ReferenceType[
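
The `time_correction` arithmetic in `do_tracing` aligns monotonic-clock request timings with the wall-clock nanosecond timestamps that OpenTelemetry spans expect. A standalone sketch of the same idea (variable names here are illustrative):

```python
import time

# Request timing metrics (arrival, first token, KV-cache transfer, ...) are
# recorded against the monotonic clock; span timestamps must be wall-clock
# nanoseconds since the epoch, so add the offset between the two clocks.
time_correction = time.time() - time.monotonic()

arrival_time_monotonic = time.monotonic()  # stand-in for a recorded metric
arrival_time_ns = int((arrival_time_monotonic + time_correction) * 1e9)
# arrival_time_ns is now suitable for start_as_current_span(start_time=...).
```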
