
Commit 4325345

Enable streaming usage metrics for OpenAI providers
Inject stream_options for telemetry, add completion streaming metrics, fix params mutation, remove duplicate provider logic. Add unit tests.
1 parent b6ce242 commit 4325345

5 files changed: +216 −44

src/llama_stack/core/routers/inference.py

Lines changed: 45 additions & 13 deletions
@@ -185,9 +185,12 @@ async def openai_completion(
         params.model = provider_resource_id
 
         if params.stream:
-            return await provider.openai_completion(params)
-            # TODO: Metrics do NOT work with openai_completion stream=True due to the fact
-            # that we do not return an AsyncIterator, our tests expect a stream of chunks we cannot intercept currently.
+            response_stream = await provider.openai_completion(params)
+            return self.wrap_completion_stream_with_metrics(
+                response=response_stream,
+                fully_qualified_model_id=request_model_id,
+                provider_id=provider.__provider_id__,
+            )
 
         response = await provider.openai_completion(params)
         response.model = request_model_id
@@ -412,16 +415,17 @@ async def stream_tokens_and_compute_metrics_openai_chat(
                 completion_text += "".join(choice_data["content_parts"])
 
             # Add metrics to the chunk
-            if self.telemetry_enabled and hasattr(chunk, "usage") and chunk.usage:
-                metrics = self._construct_metrics(
-                    prompt_tokens=chunk.usage.prompt_tokens,
-                    completion_tokens=chunk.usage.completion_tokens,
-                    total_tokens=chunk.usage.total_tokens,
-                    fully_qualified_model_id=fully_qualified_model_id,
-                    provider_id=provider_id,
-                )
-                for metric in metrics:
-                    enqueue_event(metric)
+            if self.telemetry_enabled:
+                if hasattr(chunk, "usage") and chunk.usage:
+                    metrics = self._construct_metrics(
+                        prompt_tokens=chunk.usage.prompt_tokens,
+                        completion_tokens=chunk.usage.completion_tokens,
+                        total_tokens=chunk.usage.total_tokens,
+                        fully_qualified_model_id=fully_qualified_model_id,
+                        provider_id=provider_id,
+                    )
+                    for metric in metrics:
+                        enqueue_event(metric)
 
             yield chunk
     finally:
@@ -471,3 +475,31 @@ async def stream_tokens_and_compute_metrics_openai_chat(
            )
            logger.debug(f"InferenceRouter.completion_response: {final_response}")
            asyncio.create_task(self.store.store_chat_completion(final_response, messages))
+
+    async def wrap_completion_stream_with_metrics(
+        self,
+        response: AsyncIterator,
+        fully_qualified_model_id: str,
+        provider_id: str,
+    ) -> AsyncIterator:
+        """Stream OpenAI completion chunks and compute metrics on final chunk."""
+
+        async for chunk in response:
+            if hasattr(chunk, "model"):
+                chunk.model = fully_qualified_model_id
+
+            if getattr(chunk, "choices", None) and any(c.finish_reason for c in chunk.choices):
+                if self.telemetry_enabled:
+                    if getattr(chunk, "usage", None):
+                        usage = chunk.usage
+                        metrics = self._construct_metrics(
+                            prompt_tokens=usage.prompt_tokens,
+                            completion_tokens=usage.completion_tokens,
+                            total_tokens=usage.total_tokens,
+                            fully_qualified_model_id=fully_qualified_model_id,
+                            provider_id=provider_id,
+                        )
+                        for metric in metrics:
+                            enqueue_event(metric)
+
+            yield chunk
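
Since wrap_completion_stream_with_metrics is an async generator, the router returns it without awaiting and the caller simply iterates it: chunks pass through with the fully qualified model id rewritten, and metrics are enqueued when a chunk with a finish_reason also carries usage. A rough consumption sketch, with placeholder ids and an assumed router/provider pair (illustration only, not code from this commit):

from collections.abc import AsyncIterator


async def consume_wrapped_stream(router, provider, params) -> str:
    """Illustrative only: mirrors what InferenceRouter.openai_completion now does when params.stream is True."""
    stream: AsyncIterator = await provider.openai_completion(params)  # raw provider chunks
    wrapped = router.wrap_completion_stream_with_metrics(
        response=stream,
        fully_qualified_model_id="openai/gpt-4o-mini",  # placeholder id
        provider_id="openai",  # placeholder id
    )
    text = ""
    async for chunk in wrapped:  # usage metrics are enqueued as a side effect on the final chunk
        for choice in getattr(chunk, "choices", None) or []:
            text += getattr(choice, "text", "") or ""
    return text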

src/llama_stack/providers/remote/inference/runpod/runpod.py

Lines changed: 0 additions & 19 deletions
@@ -4,14 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 
-from collections.abc import AsyncIterator
-
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
-from llama_stack_api import (
-    OpenAIChatCompletion,
-    OpenAIChatCompletionChunk,
-    OpenAIChatCompletionRequestWithExtraBody,
-)
 
 from .config import RunpodImplConfig
 
@@ -29,15 +22,3 @@ class RunpodInferenceAdapter(OpenAIMixin):
     def get_base_url(self) -> str:
         """Get base URL for OpenAI client."""
         return str(self.config.base_url)
-
-    async def openai_chat_completion(
-        self,
-        params: OpenAIChatCompletionRequestWithExtraBody,
-    ) -> OpenAIChatCompletion | AsyncIterator[OpenAIChatCompletionChunk]:
-        """Override to add RunPod-specific stream_options requirement."""
-        params = params.model_copy()
-
-        if params.stream and not params.stream_options:
-            params.stream_options = {"include_usage": True}
-
-        return await super().openai_chat_completion(params)

src/llama_stack/providers/remote/inference/watsonx/watsonx.py

Lines changed: 1 addition & 11 deletions
@@ -10,7 +10,6 @@
 import litellm
 import requests
 
-from llama_stack.core.telemetry.tracing import get_current_span
 from llama_stack.log import get_logger
 from llama_stack.providers.remote.inference.watsonx.config import WatsonXConfig
 from llama_stack.providers.utils.inference.litellm_openai_mixin import LiteLLMOpenAIMixin
@@ -56,15 +55,6 @@ async def openai_chat_completion(
         Override parent method to add timeout and inject usage object when missing.
         This works around a LiteLLM defect where usage block is sometimes dropped.
         """
-
-        # Add usage tracking for streaming when telemetry is active
-        stream_options = params.stream_options
-        if params.stream and get_current_span() is not None:
-            if stream_options is None:
-                stream_options = {"include_usage": True}
-            elif "include_usage" not in stream_options:
-                stream_options = {**stream_options, "include_usage": True}
-
         model_obj = await self.model_store.get_model(params.model)
 
         request_params = await prepare_openai_completion_params(
@@ -84,7 +74,7 @@
             seed=params.seed,
             stop=params.stop,
             stream=params.stream,
-            stream_options=stream_options,
+            stream_options=params.stream_options,
             temperature=params.temperature,
             tool_choice=params.tool_choice,
             tools=params.tools,

src/llama_stack/providers/utils/inference/openai_mixin.py

Lines changed: 20 additions & 0 deletions
@@ -258,6 +258,16 @@ async def openai_completion(
         """
         Direct OpenAI completion API call.
         """
+        from llama_stack.core.telemetry.tracing import get_current_span
+
+        # inject if streaming AND telemetry active
+        if params.stream and get_current_span() is not None:
+            params = params.model_copy()
+            if params.stream_options is None:
+                params.stream_options = {"include_usage": True}
+            elif "include_usage" not in params.stream_options:
+                params.stream_options = {**params.stream_options, "include_usage": True}
+
         # TODO: fix openai_completion to return type compatible with OpenAI's API response
         completion_kwargs = await prepare_openai_completion_params(
             model=await self._get_provider_model_id(params.model),
@@ -292,6 +302,16 @@ async def openai_chat_completion(
         """
         Direct OpenAI chat completion API call.
         """
+        from llama_stack.core.telemetry.tracing import get_current_span
+
+        # inject if streaming AND telemetry active
+        if params.stream and get_current_span() is not None:
+            params = params.model_copy()
+            if params.stream_options is None:
+                params.stream_options = {"include_usage": True}
+            elif "include_usage" not in params.stream_options:
+                params.stream_options = {**params.stream_options, "include_usage": True}
+
         messages = params.messages
 
         if self.download_images:
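
Both methods apply the same three-way merge. Pulled out into a standalone helper purely for illustration (the helper name is hypothetical; in the mixin the logic is inlined and params.model_copy() is called first so the caller's request object is never mutated), the rule behaves like this:

def inject_include_usage(stream: bool, telemetry_active: bool, stream_options: dict | None) -> dict | None:
    """Sketch of the merge rule: only touch stream_options when streaming with telemetry active."""
    if not (stream and telemetry_active):
        return stream_options
    if stream_options is None:
        return {"include_usage": True}
    if "include_usage" not in stream_options:
        return {**stream_options, "include_usage": True}
    return stream_options  # an explicit include_usage value, even False, is left untouched


# Worked cases, matching the unit tests added below:
assert inject_include_usage(True, True, None) == {"include_usage": True}
assert inject_include_usage(True, True, {"other_option": True}) == {"other_option": True, "include_usage": True}
assert inject_include_usage(True, True, {"include_usage": False}) == {"include_usage": False}
assert inject_include_usage(True, False, None) is None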

tests/unit/providers/utils/inference/test_openai_mixin.py

Lines changed: 150 additions & 1 deletion
@@ -15,7 +15,13 @@
 from llama_stack.core.request_headers import request_provider_data_context
 from llama_stack.providers.utils.inference.model_registry import RemoteInferenceProviderConfig
 from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
-from llama_stack_api import Model, ModelType, OpenAIChatCompletionRequestWithExtraBody, OpenAIUserMessageParam
+from llama_stack_api import (
+    Model,
+    ModelType,
+    OpenAIChatCompletionRequestWithExtraBody,
+    OpenAICompletionRequestWithExtraBody,
+    OpenAIUserMessageParam,
+)
 
 
 class OpenAIMixinImpl(OpenAIMixin):
@@ -834,3 +840,146 @@ def test_error_message_includes_correct_field_names(self, mixin_with_provider_da
         error_message = str(exc_info.value)
         assert "test_api_key" in error_message
         assert "x-llamastack-provider-data" in error_message
+
+
+class TestOpenAIMixinStreamingMetrics:
+    """Test cases for streaming metrics injection in OpenAIMixin"""
+
+    async def test_openai_chat_completion_streaming_metrics_injection(self, mixin, mock_client_context):
+        """Test that stream_options={"include_usage": True} is injected when streaming and telemetry is enabled"""
+
+        params = OpenAIChatCompletionRequestWithExtraBody(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            stream=True,
+            stream_options=None,
+        )
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())
+
+        with mock_client_context(mixin, mock_client):
+            with patch("llama_stack.core.telemetry.tracing.get_current_span") as mock_get_span:
+                mock_get_span.return_value = MagicMock()
+
+                with patch(
+                    "llama_stack.providers.utils.inference.openai_mixin.prepare_openai_completion_params"
+                ) as mock_prepare:
+                    mock_prepare.return_value = {"model": "test-model"}
+
+                    await mixin.openai_chat_completion(params)
+
+                    call_kwargs = mock_prepare.call_args.kwargs
+                    assert call_kwargs["stream_options"] == {"include_usage": True}
+
+                    assert params.stream_options is None
+
+    async def test_openai_chat_completion_streaming_no_telemetry(self, mixin, mock_client_context):
+        """Test that stream_options is NOT injected when telemetry is disabled"""
+
+        params = OpenAIChatCompletionRequestWithExtraBody(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            stream=True,
+            stream_options=None,
+        )
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())
+
+        with mock_client_context(mixin, mock_client):
+            with patch("llama_stack.core.telemetry.tracing.get_current_span") as mock_get_span:
+                mock_get_span.return_value = None
+
+                with patch(
+                    "llama_stack.providers.utils.inference.openai_mixin.prepare_openai_completion_params"
+                ) as mock_prepare:
+                    mock_prepare.return_value = {"model": "test-model"}
+
+                    await mixin.openai_chat_completion(params)
+
+                    call_kwargs = mock_prepare.call_args.kwargs
+                    assert call_kwargs["stream_options"] is None
+
+    async def test_openai_completion_streaming_metrics_injection(self, mixin, mock_client_context):
+        """Test that stream_options={"include_usage": True} is injected for legacy completion"""
+
+        params = OpenAICompletionRequestWithExtraBody(
+            model="test-model",
+            prompt="hello",
+            stream=True,
+            stream_options=None,
+        )
+
+        mock_client = MagicMock()
+        mock_client.completions.create = AsyncMock(return_value=MagicMock())
+
+        with mock_client_context(mixin, mock_client):
+            with patch("llama_stack.core.telemetry.tracing.get_current_span") as mock_get_span:
+                mock_get_span.return_value = MagicMock()
+
+                with patch(
+                    "llama_stack.providers.utils.inference.openai_mixin.prepare_openai_completion_params"
+                ) as mock_prepare:
+                    mock_prepare.return_value = {"model": "test-model"}
+
+                    await mixin.openai_completion(params)
+
+                    call_kwargs = mock_prepare.call_args.kwargs
+                    assert call_kwargs["stream_options"] == {"include_usage": True}
+                    assert params.stream_options is None
+
+    async def test_preserves_existing_stream_options(self, mixin, mock_client_context):
+        """Test that existing stream_options are preserved and merged"""
+
+        params = OpenAIChatCompletionRequestWithExtraBody(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            stream=True,
+            stream_options={"include_usage": False},
+        )
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())
+
+        with mock_client_context(mixin, mock_client):
+            with patch("llama_stack.core.telemetry.tracing.get_current_span") as mock_get_span:
+                mock_get_span.return_value = MagicMock()
+
+                with patch(
+                    "llama_stack.providers.utils.inference.openai_mixin.prepare_openai_completion_params"
+                ) as mock_prepare:
+                    mock_prepare.return_value = {"model": "test-model"}
+
+                    await mixin.openai_chat_completion(params)
+
+                    call_kwargs = mock_prepare.call_args.kwargs
+                    # It should stay False because it was present
+                    assert call_kwargs["stream_options"] == {"include_usage": False}
+
+    async def test_merges_existing_stream_options(self, mixin, mock_client_context):
+        """Test that existing stream_options are merged"""
+
+        params = OpenAIChatCompletionRequestWithExtraBody(
+            model="test-model",
+            messages=[{"role": "user", "content": "hello"}],
+            stream=True,
+            stream_options={"other_option": True},
+        )
+
+        mock_client = MagicMock()
+        mock_client.chat.completions.create = AsyncMock(return_value=MagicMock())
+
+        with mock_client_context(mixin, mock_client):
+            with patch("llama_stack.core.telemetry.tracing.get_current_span") as mock_get_span:
+                mock_get_span.return_value = MagicMock()
+
+                with patch(
+                    "llama_stack.providers.utils.inference.openai_mixin.prepare_openai_completion_params"
+                ) as mock_prepare:
+                    mock_prepare.return_value = {"model": "test-model"}
+
+                    await mixin.openai_chat_completion(params)
+
+                    call_kwargs = mock_prepare.call_args.kwargs
+                    assert call_kwargs["stream_options"] == {"other_option": True, "include_usage": True}
