
Commit a14402e

ncybul and Yun-Kim authored
feat(llmobs): [MLOB-4258] add support for OpenAI server-side MCP calls (#15057)
## Description

This PR adds support for server-side MCP calls made via the OpenAI Responses API. In the Responses API, LLMs can invoke MCP tools on behalf of the client: the model asks the provided MCP server to list its available tools and then calls the relevant tool. Previously, our support for these interactions was poor: we did not capture any tool calls, tool results, or tool spans. This PR improves that by:

1. Capturing the `McpCall` output item and parsing it into a Tool Call and a Tool Result on the currently active LLM span.
2. Generating a Tool span to represent the server-side tool invocation. _(Note that this tool span is a child of the active LLM span, since it technically happens within the LLM operation.)_
3. Adding any tools returned from the MCP server to the LLM's available tools field.

## Manual Testing

I manually tested my changes with the following script:

```python
import asyncio

from agents import set_default_openai_client
from agents.tracing import set_tracing_disabled
from dd_internal_authentication.client import JWTDDToolAuthClientTokenManager
from openai import AsyncOpenAI

from ddtrace.llmobs import LLMObs

LLMObs.enable(ml_app="nicole-test", site="datadoghq.com")
set_tracing_disabled(True)

token = JWTDDToolAuthClientTokenManager.instance(
    name="rapid-ai-platform", datacenter="us1.staging.dog"
).get_token("rapid-ai-platform")

ai_gateway_client = AsyncOpenAI(
    base_url="https://ai-gateway.us1.staging.dog/v1",
    default_headers={"source": "nicole-test", "org-id": "2"},
    api_key=token,
)
set_default_openai_client(ai_gateway_client)


async def main():
    resp = await ai_gateway_client.responses.create(
        model="gpt-5",
        tools=[
            {
                "type": "mcp",
                "server_label": "dice_roller",
                "server_description": "Public dice-roller MCP server for testing.",
                "server_url": "https://dice-rolling-mcp.vercel.app/mcp",  # or use the FastMCP URL
                "require_approval": "never",
            },
        ],
        input="Roll 2d4+1",
    )
    print(resp)


if __name__ == "__main__":
    asyncio.run(main())
```

### Before

Before, our experience for these server-side MCP use cases was very poor. Running the script, I get this [trace](https://app.datadoghq.com/llm/traces?query=%40ml_app%3Anicole-test%20%40event_type%3Aspan%20%40parent_id%3Aundefined&agg_m=count&agg_m_source=base&agg_t=count&fromUser=false&sp=%5B%7B%22p%22%3A%7B%22eventId%22%3A%22AwAAAZo7t33VDIDW2gAAABhBWm83dDMzVkFBRHU0Q2I1ZDNYT0FBQUEAAAAkZjE5YTNiYmEtMGM3NC00NzVlLWFlZDQtY2ExZmVmYWU2ZDRkAABNvg%22%7D%2C%22i%22%3A%22llm-obs-panel%22%7D%5D&spanId=2741126858136455689&start=1761936196353&end=1761939796353&paused=false), which does not parse the server-side MCP calls:

<img width="2668" height="1304" alt="image" src="https://github.com/user-attachments/assets/51426be1-d670-45ad-b2f2-d3dab8a85f52" />

### After

With the changes in this PR, the [trace](https://app.datadoghq.com/llm/traces?query=%40ml_app%3Anicole-test%20%40event_type%3Aspan%20%40parent_id%3Aundefined&agg_m=count&agg_m_source=base&agg_t=count&fromUser=true&sp=%5B%7B%22p%22%3A%7B%22eventId%22%3A%22AwAAAZqSG1GZuwlyswAAABhBWnFTRzFHWkFBQy13SDVmUXVhNkFBQUEAAAAkZjE5YTkyMWMtODg0Mi00M2YwLWJlYmQtNjM5MTdiNjkzNTI0AAAckw%22%7D%2C%22i%22%3A%22llm-obs-panel%22%7D%5D&spanId=11798342076467341016&start=1763387074443&end=1763387974443&paused=false) looks much cleaner and correctly parses all the information related to the MCP usage.

#### Tool Calls and Tool Results are highlighted:

<img width="1988" height="442" alt="image" src="https://github.com/user-attachments/assets/8db2cdbf-425a-426b-ab85-3e5d13dd439e" />

#### Available Tools from the MCP server are captured:

<img width="2030" height="882" alt="image" src="https://github.com/user-attachments/assets/e7153191-b852-484a-817d-b5d24c687c2b" />

#### A separate tool span is emitted:

<img width="1376" height="612" alt="image" src="https://github.com/user-attachments/assets/7ceaa977-61ff-4917-af53-6a97babf86e5" />

I also tried this out with more than one tool call in this [trace](https://app.datadoghq.com/llm/traces?query=%40ml_app%3Anicole-test%20%40event_type%3Aspan%20%40parent_id%3Aundefined&agg_m=count&agg_m_source=base&agg_t=count&fromUser=false&sp=%5B%7B%22p%22%3A%7B%22eventId%22%3A%22AwAAAZqSHLq7ndIu6wAAABhBWnFTSExxN0FBRG1Md0RzUlVWbUFBQUEAAAAkZjE5YTkyMWMtZjcxMi00ZGU5LTg4ODItN2Q5NTdlNGU2MTliAAACpA%22%7D%2C%22i%22%3A%22llm-obs-panel%22%7D%5D&spanId=4722629383297745650&start=1763387152732&end=1763388052732&paused=false).

---------

Co-authored-by: Yun Kim <[email protected]>
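For context, a server-side MCP call surfaces in the Responses API output as an `mcp_call` item carrying `id`, `name`, `arguments`, and `output` fields (the fields this PR reads). The sketch below uses a plain dict with made-up values to show how one such item maps to both a Tool Call and a Tool Result on the active LLM span:

```python
# Illustrative "mcp_call" output item; field values are invented,
# but the field names match what the integration reads.
mcp_call_item = {
    "type": "mcp_call",
    "id": "mcp_12345",
    "name": "roll_dice",
    "arguments": '{"notation": "2d4+1"}',
    "output": "Rolled 2d4+1: total 7",
}

# One mcp_call item yields both a tool call (the request side)...
tool_call = {
    "tool_id": mcp_call_item["id"],
    "name": mcp_call_item["name"],
    "arguments": mcp_call_item["arguments"],
    "type": "mcp_call",
}

# ...and a tool result (the response side), sharing the same tool_id
# so the two can be linked in the trace view.
tool_result = {
    "tool_id": mcp_call_item["id"],
    "name": mcp_call_item["name"],
    "result": mcp_call_item["output"],
    "type": "mcp_tool_result",
}
```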
1 parent 9eb932e commit a14402e

File tree

7 files changed: +346 −21 lines changed

ddtrace/contrib/internal/openai/_endpoint_hooks.py

Lines changed: 33 additions & 0 deletions

```diff
@@ -7,7 +7,10 @@
 from ddtrace.contrib.internal.openai.utils import _loop_handler
 from ddtrace.contrib.internal.openai.utils import _process_finished_stream
 from ddtrace.internal.utils.version import parse_version
+from ddtrace.llmobs._constants import OAI_HANDOFF_TOOL_ARG
 from ddtrace.llmobs._integrations.base_stream_handler import make_traced_stream
+from ddtrace.llmobs._utils import _get_attr
+from ddtrace.llmobs._utils import safe_load_json


 API_VERSION = "v1"
@@ -520,6 +523,7 @@ class _ResponseHook(_BaseCompletionHook):

     def _record_response(self, pin, integration, span, args, kwargs, resp, error):
         resp = super()._record_response(pin, integration, span, args, kwargs, resp, error)
+        self._trace_mcp_tool_usage(pin, integration, resp)
         if not resp:
             integration.llmobs_set_tags(span, args=[], kwargs=kwargs, response=resp, operation="response")
             return resp
@@ -528,6 +532,35 @@ def _record_response(self, pin, integration, span, args, kwargs, resp, error):
         integration.llmobs_set_tags(span, args=[], kwargs=kwargs, response=resp, operation="response")
         return resp

+    def _trace_mcp_tool_usage(self, pin, integration, resp):
+        """Detect and trace server-side MCP tool usage in the response."""
+        if not resp:
+            return
+
+        messages = _get_attr(resp, "output", [])
+
+        if messages and isinstance(messages, list):
+            for item in messages:
+                message_type = _get_attr(item, "type", "")
+                if message_type == "mcp_call":
+                    self._create_mcp_tool_span(item, integration, pin)
+
+    def _create_mcp_tool_span(self, item, integration, pin):
+        """Creates and submits a tool span to LLMObs to represent a server-side MCP tool call."""
+        with integration.trace(pin, "client_tool_call", submit_to_llmobs=True, kind="tool") as span:
+            tool_id = str(_get_attr(item, "id", ""))
+            tool_name = str(_get_attr(item, "name", ""))
+            raw_arguments = _get_attr(item, "arguments", OAI_HANDOFF_TOOL_ARG)
+            tool_arguments = safe_load_json(str(raw_arguments))
+            tool_output = str(_get_attr(item, "output", ""))
+            integration.llmobs_set_tags(
+                span,
+                args=[],
+                kwargs={"name": tool_name, "arguments": tool_arguments, "tool_id": tool_id},
+                response=tool_output,
+                operation="tool",
+            )


 class _ResponseParseHook(_ResponseHook):
     OPERATION_ID = "parseResponse"
```
ddtrace/llmobs/_integrations/openai.py

Lines changed: 34 additions & 2 deletions

```diff
@@ -10,10 +10,12 @@
 from ddtrace.llmobs._constants import CACHE_READ_INPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import INPUT_DOCUMENTS
 from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
+from ddtrace.llmobs._constants import INPUT_VALUE
 from ddtrace.llmobs._constants import METADATA
 from ddtrace.llmobs._constants import METRICS
 from ddtrace.llmobs._constants import MODEL_NAME
 from ddtrace.llmobs._constants import MODEL_PROVIDER
+from ddtrace.llmobs._constants import NAME
 from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import OUTPUT_VALUE
 from ddtrace.llmobs._constants import PROXY_REQUEST
@@ -27,12 +29,15 @@
 from ddtrace.llmobs._integrations.utils import openai_set_meta_tags_from_response
 from ddtrace.llmobs._integrations.utils import update_proxy_workflow_input_output_value
 from ddtrace.llmobs._utils import _get_attr
+from ddtrace.llmobs._utils import safe_json
 from ddtrace.llmobs.types import Document
 from ddtrace.trace import Span


 log = get_logger(__name__)

+OPENAI_LLM_OPERATIONS = ("completion", "chat", "response")
+

 class OpenAIIntegration(BaseLLMIntegration):
     _integration_name = "openai"
@@ -105,7 +110,11 @@ def _llmobs_set_tags(
     ) -> None:
         """Sets meta tags and metrics for span events to be sent to LLMObs."""
         span_kind = (
-            "workflow" if span._get_ctx_item(PROXY_REQUEST) else "embedding" if operation == "embedding" else "llm"
+            "workflow"
+            if span._get_ctx_item(PROXY_REQUEST)
+            else "llm"
+            if operation in OPENAI_LLM_OPERATIONS
+            else operation
         )
         model_name = span.get_tag("openai.response.model") or span.get_tag("openai.request.model")
@@ -121,7 +130,9 @@
         elif operation == "embedding":
             self._llmobs_set_meta_tags_from_embedding(span, kwargs, response)
         elif operation == "response":
-            openai_set_meta_tags_from_response(span, kwargs, response)
+            openai_set_meta_tags_from_response(span, kwargs, response, self)
+        elif operation == "tool":
+            self._llmobs_set_tags_from_tool(span, kwargs, response)
         update_proxy_workflow_input_output_value(span, span_kind)
         metrics = self._extract_llmobs_metrics_tags(span, response, span_kind, kwargs)
         span._set_ctx_items(
@@ -153,6 +164,27 @@ def _llmobs_set_meta_tags_from_embedding(span: Span, kwargs: Dict[str, Any], res
             return
         span._set_ctx_item(OUTPUT_VALUE, "[{} embedding(s) returned]".format(len(resp.data)))

+    @staticmethod
+    def _llmobs_set_tags_from_tool(span: Span, kwargs: Dict[str, Any], response: Any) -> None:
+        """Extract tool name, arguments, and output from the request and response to be submitted to LLMObs."""
+        tool_id = kwargs.get("tool_id", "unknown_tool_id")
+        tool_name = kwargs.get("name", "unknown_tool")
+        tool_arguments = kwargs.get("arguments")
+        tool_output = response
+
+        span_name = "MCP Client Tool Call: {}".format(tool_name)
+        span.name = span_name
+
+        span._set_ctx_items(
+            {
+                SPAN_KIND: "tool",
+                NAME: span_name,
+                INPUT_VALUE: safe_json(tool_arguments) if tool_arguments is not None else "",
+                OUTPUT_VALUE: safe_json(tool_output) if tool_output is not None else "",
+                METADATA: {"tool_id": tool_id},
+            }
+        )
+
     @staticmethod
     def _extract_llmobs_metrics_tags(
         span: Span, resp: Any, span_kind: str, kwargs: Dict[str, Any]
```
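The reworked `span_kind` expression generalizes the old two-way ternary: proxy requests stay workflows, the three LLM operations map to `"llm"`, and any other operation (such as `"embedding"` or the new `"tool"`) passes through as its own kind. A standalone sketch of that selection logic:

```python
# Mirrors the span_kind conditional from the diff above, with the
# span/ctx lookup replaced by a plain boolean for illustration.
OPENAI_LLM_OPERATIONS = ("completion", "chat", "response")


def resolve_span_kind(is_proxy_request: bool, operation: str) -> str:
    return (
        "workflow"
        if is_proxy_request
        else "llm"
        if operation in OPENAI_LLM_OPERATIONS
        else operation
    )
```

This is why the `operation == "embedding"` special case could be dropped: non-LLM operations now flow through unchanged, so adding `"tool"` required no extra branch in the kind selection.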

ddtrace/llmobs/_integrations/openai_agents.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -231,7 +231,7 @@ def _llmobs_set_response_attributes(self, span: Span, oai_span: OaiSpanAdapter)
             span._set_ctx_item(INPUT_MESSAGES, messages)

         if oai_span.response and oai_span.response.output:
-            messages, tool_call_outputs = oai_span.llmobs_output_messages()
+            messages, tool_call_outputs, _ = oai_span.llmobs_output_messages()

             for tool_call_output in tool_call_outputs:
                 core.dispatch(
```

ddtrace/llmobs/_integrations/utils.py

Lines changed: 59 additions & 18 deletions

```diff
@@ -628,32 +628,38 @@ def _openai_parse_input_response_messages(
     return processed, tool_call_ids


-def openai_get_output_messages_from_response(response: Optional[Any]) -> List[Message]:
+def openai_get_output_messages_from_response(
+    response: Optional[Any], integration: Any = None
+) -> Tuple[List[Message], List[ToolDefinition]]:
     """
-    Parses the output to openai responses api into a list of output messages
+    Parses the output to openai responses api into a list of output messages and a list of
+    MCP tool definitions returned from the MCP server.

     Args:
         response: An OpenAI response object or dictionary containing output messages

     Returns:
         - A list of processed messages
+        - A list of MCP tool definitions
     """
     if not response:
-        return []
+        return [], []

     messages = _get_attr(response, "output", [])
     if not messages:
-        return []
+        return [], []

-    processed_messages, _ = _openai_parse_output_response_messages(messages)
+    processed_messages, _, mcp_tool_definitions = _openai_parse_output_response_messages(messages, integration)

-    return processed_messages
+    return processed_messages, mcp_tool_definitions


-def _openai_parse_output_response_messages(messages: List[Any]) -> Tuple[List[Message], List[ToolCall]]:
+def _openai_parse_output_response_messages(
+    messages: List[Any], integration: Any = None
+) -> Tuple[List[Message], List[ToolCall], List[ToolDefinition]]:
     """
     Parses output messages from the openai responses api into a list of processed messages
-    and a list of tool call outputs.
+    and a list of tool call outputs and a list of MCP tool definitions.

     Args:
         messages: A list of output messages
@@ -664,6 +670,7 @@ def _openai_parse_output_response_messages(messages: List[Any]) -> Tuple[List[Me
     """
     processed: List[Message] = []
     tool_call_outputs: List[ToolCall] = []
+    mcp_tool_definitions: List[ToolDefinition] = []

     for item in messages:
         message: Message = Message()
@@ -707,12 +714,41 @@ def _openai_parse_output_response_messages(messages: List[Any]) -> Tuple[List[Me
                     "role": "assistant",
                 }
             )
+        elif message_type == "mcp_call":
+            call_id = str(_get_attr(item, "id", ""))
+            name = str(_get_attr(item, "name", ""))
+            raw_arguments = _get_attr(item, "arguments", OAI_HANDOFF_TOOL_ARG)
+            arguments = safe_load_json(str(raw_arguments))
+            output = str(_get_attr(item, "output", ""))
+            tool_call_info = ToolCall(
+                tool_id=call_id,
+                arguments=arguments,
+                name=name,
+                type=str(message_type),
+            )
+            tool_call_outputs.append(tool_call_info)
+            tool_result_info = ToolResult(
+                name=name,
+                result=output,
+                tool_id=call_id,
+                type="mcp_tool_result",
+            )
+            message.update(
+                {
+                    "tool_calls": [tool_call_info],
+                    "tool_results": [tool_result_info],
+                    "role": "assistant",
+                }
+            )
+        elif message_type == "mcp_list_tools":
+            mcp_tool_definitions.extend(_openai_get_tool_definitions(_get_attr(item, "tools", [])))
+            continue
         else:
             message.update({"content": str(item), "role": "assistant"})

         processed.append(message)

-    return processed, tool_call_outputs
+    return processed, tool_call_outputs, mcp_tool_definitions


 def openai_get_metadata_from_response(
@@ -802,7 +838,9 @@ def _extract_chat_template_from_instructions(
     return chat_template


-def openai_set_meta_tags_from_response(span: Span, kwargs: Dict[str, Any], response: Optional[Any]) -> None:
+def openai_set_meta_tags_from_response(
+    span: Span, kwargs: Dict[str, Any], response: Optional[Any], integration: Any = None
+) -> None:
     """Extract input/output tags from response and set them as temporary "_ml_obs.meta.*" tags."""
     input_data = kwargs.get("input", [])
@@ -851,11 +889,11 @@ def openai_set_meta_tags_from_response(span: Span, kwargs: Dict[str, Any], respo
     metadata = span._get_ctx_item(METADATA) or {}
     metadata.update(openai_get_metadata_from_response(response))
     span._set_ctx_item(METADATA, metadata)
-    output_messages: List[Message] = openai_get_output_messages_from_response(response)
+    output_messages, mcp_tool_definitions = openai_get_output_messages_from_response(response, integration)
     span._set_ctx_item(OUTPUT_MESSAGES, output_messages)
     tools = _openai_get_tool_definitions(kwargs.get("tools") or [])
-    if tools:
-        span._set_ctx_item(TOOL_DEFINITIONS, tools)
+    if mcp_tool_definitions or tools:
+        span._set_ctx_item(TOOL_DEFINITIONS, tools + mcp_tool_definitions)


 def _openai_get_tool_definitions(tools: List[Any]) -> List[ToolDefinition]:
@@ -878,12 +916,14 @@ def _openai_get_tool_definitions(tools: List[Any]) -> List[ToolDefinition]:
                 schema=_get_attr(custom_tool, "format", {}),  # format is a dict
             )
         # chat API function access and response API tool access
-        # only handles FunctionToolParam and CustomToolParam for response API for now
+        # only handles FunctionToolParam, CustomToolParam and McpListToolsTool for response API for now
         else:
             tool_definition = ToolDefinition(
                 name=str(_get_attr(tool, "name", "")),
                 description=str(_get_attr(tool, "description", "")),
-                schema=_get_attr(tool, "parameters", {}) or _get_attr(tool, "format", {}),
+                schema=_get_attr(tool, "parameters", {})
+                or _get_attr(tool, "format", {})
+                or _get_attr(tool, "input_schema", {}),
             )
         if not any(tool_definition.values()):
             continue
@@ -1198,19 +1238,20 @@ def llmobs_input_messages(self) -> Tuple[List[Message], List[str]]:
         """
         return _openai_parse_input_response_messages(self.input, self.response_system_instructions)

-    def llmobs_output_messages(self) -> Tuple[List[Message], List[ToolCall]]:
+    def llmobs_output_messages(self) -> Tuple[List[Message], List[ToolCall], List[ToolDefinition]]:
         """Returns processed output messages for LLM Obs LLM spans.

         Returns:
             - A list of processed messages
             - A list of tool calls for span linking purposes
+            - A list of MCP tool definitions
         """
         if not self.response or not self.response.output:
-            return [], []
+            return [], [], []

         messages: List[Any] = self.response.output
         if not messages:
-            return [], []
+            return [], [], []

         if not isinstance(messages, list):
             messages = [messages]
```
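The parser above now routes three item kinds differently: `mcp_call` items become a message carrying both a tool call and a tool result, `mcp_list_tools` items contribute only tool definitions (no message is emitted, hence the `continue`), and everything else falls through to a plain assistant message. A simplified stand-in for that dispatch, operating on dicts instead of the real `ToolCall`/`ToolResult`/`ToolDefinition` types:

```python
def parse_output_items(items):
    # Simplified sketch of _openai_parse_output_response_messages:
    # split Responses API output items into messages, MCP tool calls,
    # and tool definitions advertised by the MCP server.
    messages, tool_calls, tool_defs = [], [], []
    for item in items:
        kind = item.get("type", "")
        if kind == "mcp_call":
            call = {"tool_id": item.get("id", ""), "name": item.get("name", "")}
            tool_calls.append(call)
            messages.append({"role": "assistant", "tool_calls": [call]})
        elif kind == "mcp_list_tools":
            # Tool listings only contribute definitions; no message is emitted.
            tool_defs.extend(item.get("tools", []))
        else:
            messages.append({"role": "assistant", "content": str(item)})
    return messages, tool_calls, tool_defs


msgs, calls, defs = parse_output_items(
    [
        {"type": "mcp_list_tools", "tools": [{"name": "roll"}]},
        {"type": "mcp_call", "id": "c1", "name": "roll"},
    ]
)
```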
Lines changed: 4 additions & 0 deletions

```diff
@@ -0,0 +1,4 @@
+features:
+  - |
+    openai, LLM Observability: This introduces support for capturing server-side MCP tool calls invoked via the OpenAI Responses API as a separate span.
```
4+
