
Commit ef753bc

feat: add Prompts API to Responses API
1 parent 8422bd1 commit ef753bc

10 files changed, +138 -2 lines changed

docs/_static/llama-stack-spec.html

Lines changed: 12 additions & 0 deletions
@@ -8962,6 +8962,10 @@
         "type": "string",
         "description": "The underlying LLM used for completions."
       },
+      "prompt_id": {
+        "type": "string",
+        "description": "Prompt ID that refers to system prompt created by user and reused in current response."
+      },
       "instructions": {
         "type": "string"
       },
@@ -9066,6 +9070,10 @@
         "type": "string",
         "description": "(Optional) ID of the previous response in a conversation"
       },
+      "prompt_id": {
+        "type": "string",
+        "description": "(Optional) Reusable prompt created by user"
+      },
       "status": {
         "type": "string",
         "description": "Current status of the response generation"
@@ -13277,6 +13285,10 @@
         "type": "string",
         "description": "(Optional) ID of the previous response in a conversation"
       },
+      "prompt_id": {
+        "type": "string",
+        "description": "(Optional) Reusable prompt created by user"
+      },
       "status": {
         "type": "string",
         "description": "Current status of the response generation"

docs/_static/llama-stack-spec.yaml

Lines changed: 13 additions & 0 deletions
@@ -6509,6 +6509,11 @@ components:
       model:
         type: string
         description: The underlying LLM used for completions.
+      prompt_id:
+        type: string
+        description: >-
+          Prompt ID that refers to system prompt created by user and reused in current
+          response.
       instructions:
         type: string
       previous_response_id:
@@ -6598,6 +6603,10 @@ components:
         type: string
         description: >-
           (Optional) ID of the previous response in a conversation
+      prompt_id:
+        type: string
+        description: >-
+          (Optional) Reusable prompt created by user
       status:
         type: string
         description: >-
@@ -9862,6 +9871,10 @@ components:
         type: string
         description: >-
           (Optional) ID of the previous response in a conversation
+      prompt_id:
+        type: string
+        description: >-
+          (Optional) Reusable prompt created by user
       status:
         type: string
         description: >-
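
Note: put together, the schema additions above correspond to a request body along these lines. This is an illustrative sketch only; the field names come from the spec diff, while the concrete values are borrowed from the new unit test at the bottom of this commit.

# Hypothetical Responses API request payload using the new "prompt_id" field.
# Values are illustrative and are not defined anywhere in this commit's docs.
payload = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "input": "What is the capital of Ireland?",
    "prompt_id": "pmpt_1234567890abcdef1234567890abcdef1234567890abcdef",
}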

llama_stack/apis/agents/agents.py

Lines changed: 2 additions & 1 deletion
@@ -699,6 +699,7 @@ async def create_openai_response(
         self,
         input: str | list[OpenAIResponseInput],
         model: str,
+        prompt_id: str | None = None,
         instructions: str | None = None,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -710,9 +711,9 @@
         max_infer_iters: int | None = 10,  # this is an extension to the OpenAI API
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a new OpenAI response.
-
         :param input: Input message(s) to create the response.
         :param model: The underlying LLM used for completions.
+        :param prompt_id: Prompt ID that refers to system prompt created by user and reused in current response.
         :param previous_response_id: (Optional) if specified, the new response will be a continuation of the previous response. This can be used to easily fork-off new responses from existing responses.
         :param include: (Optional) Additional fields to include in the response.
         :returns: An OpenAIResponseObject.
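
Note: a minimal sketch of how a caller might exercise the new parameter once an Agents implementation is in hand; `agents_api` and the ID values are placeholders, not names defined by this commit.

# Hedged usage sketch (not part of the diff): reuse a stored prompt by ID.
response = await agents_api.create_openai_response(
    input="What is the capital of Ireland?",
    model="meta-llama/Llama-3.1-8B-Instruct",
    prompt_id="pmpt_1234567890abcdef1234567890abcdef1234567890abcdef",
)
print(response.prompt_id)  # echoed back on the resulting OpenAIResponseObject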

llama_stack/apis/agents/openai_responses.py

Lines changed: 2 additions & 0 deletions
@@ -330,6 +330,7 @@ class OpenAIResponseObject(BaseModel):
     :param object: Object type identifier, always "response"
     :param output: List of generated output items (messages, tool calls, etc.)
     :param parallel_tool_calls: Whether tool calls can be executed in parallel
+    :param prompt_id: (Optional) Reusable prompt created by user
     :param previous_response_id: (Optional) ID of the previous response in a conversation
     :param status: Current status of the response generation
     :param temperature: (Optional) Sampling temperature used for generation
@@ -347,6 +348,7 @@
     output: list[OpenAIResponseOutput]
     parallel_tool_calls: bool = False
     previous_response_id: str | None = None
+    prompt_id: str | None = None
     status: str
     temperature: float | None = None
     # Default to text format to avoid breaking the loading of old responses

llama_stack/core/stack.py

Lines changed: 4 additions & 0 deletions
@@ -314,6 +314,10 @@ def add_internal_implementations(impls: dict[Api, Any], run_config: StackRunConf
     )
     impls[Api.prompts] = prompts_impl
 
+    # Set prompts API on agents provider if it exists
+    if Api.agents in impls and hasattr(impls[Api.agents], "set_prompts_api"):
+        impls[Api.agents].set_prompts_api(prompts_impl)
+
 
 # Produces a stack of providers for the given run config. Not all APIs may be
 # asked for in the run config.

llama_stack/providers/inline/agents/meta_reference/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ async def get_provider_impl(config: MetaReferenceAgentsImplConfig, deps: dict[Ap
         deps[Api.safety],
         deps[Api.tool_runtime],
         deps[Api.tool_groups],
+        None,  # prompts_api will be set later when available
         policy,
     )
     await impl.initialize()
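
Note: taken together with the stack.py hunk above, the wiring is two-phase. The agents provider is constructed before the internal Prompts implementation exists, so it receives None here, and the stack injects the real implementation afterwards via set_prompts_api once impls[Api.prompts] has been registered.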

llama_stack/providers/inline/agents/meta_reference/agents.py

Lines changed: 11 additions & 0 deletions
@@ -37,6 +37,7 @@
     ToolResponseMessage,
     UserMessage,
 )
+from llama_stack.apis.prompts import Prompts
 from llama_stack.apis.safety import Safety
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
@@ -63,6 +64,7 @@ def __init__(
         safety_api: Safety,
         tool_runtime_api: ToolRuntime,
         tool_groups_api: ToolGroups,
+        prompts_api: Prompts | None,
         policy: list[AccessRule],
     ):
         self.config = config
@@ -71,6 +73,7 @@ def __init__(
         self.safety_api = safety_api
         self.tool_runtime_api = tool_runtime_api
         self.tool_groups_api = tool_groups_api
+        self.prompts_api = prompts_api
 
         self.in_memory_store = InmemoryKVStoreImpl()
         self.openai_responses_impl: OpenAIResponsesImpl | None = None
@@ -86,8 +89,14 @@ async def initialize(self) -> None:
             tool_runtime_api=self.tool_runtime_api,
             responses_store=self.responses_store,
             vector_io_api=self.vector_io_api,
+            prompts_api=self.prompts_api,
         )
 
+    def set_prompts_api(self, prompts_api: Prompts) -> None:
+        self.prompts_api = prompts_api
+        if hasattr(self, "openai_responses_impl") and self.openai_responses_impl:
+            self.openai_responses_impl.prompts_api = prompts_api
+
     async def create_agent(
         self,
         agent_config: AgentConfig,
@@ -320,6 +329,7 @@ async def create_openai_response(
         self,
         input: str | list[OpenAIResponseInput],
         model: str,
+        prompt_id: str | None = None,
         instructions: str | None = None,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -333,6 +343,7 @@ async def create_openai_response(
         return await self.openai_responses_impl.create_openai_response(
             input,
             model,
+            prompt_id,
             instructions,
             previous_response_id,
             store,

llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

Lines changed: 27 additions & 0 deletions
@@ -26,8 +26,10 @@
 )
 from llama_stack.apis.inference import (
     Inference,
+    OpenAIMessageParam,
     OpenAISystemMessageParam,
 )
+from llama_stack.apis.prompts import Prompts
 from llama_stack.apis.tools import ToolGroups, ToolRuntime
 from llama_stack.apis.vector_io import VectorIO
 from llama_stack.log import get_logger
@@ -57,12 +59,14 @@ def __init__(
         tool_runtime_api: ToolRuntime,
         responses_store: ResponsesStore,
         vector_io_api: VectorIO,  # VectorIO
+        prompts_api: Prompts,
     ):
         self.inference_api = inference_api
         self.tool_groups_api = tool_groups_api
         self.tool_runtime_api = tool_runtime_api
         self.responses_store = responses_store
         self.vector_io_api = vector_io_api
+        self.prompts_api = prompts_api
         self.tool_executor = ToolExecutor(
             tool_groups_api=tool_groups_api,
             tool_runtime_api=tool_runtime_api,
@@ -97,6 +101,22 @@ async def _prepend_instructions(self, messages, instructions):
         if instructions:
             messages.insert(0, OpenAISystemMessageParam(content=instructions))
 
+    async def _prepend_prompt(self, messages: list[OpenAIMessageParam], prompt_id: str) -> str:
+        if not prompt_id:
+            return None
+
+        try:
+            # Check if prompt exists in Llama Stack and retrieve it
+            prompt = await self.prompts_api.get_prompt(prompt_id)
+            if prompt and prompt.prompt:
+                messages.insert(0, OpenAISystemMessageParam(content=prompt.prompt))
+                logger.info(f"Prompt {prompt_id} found")
+            return prompt_id
+        except ValueError:
+            # Prompt not found, skip prepending
+            logger.warning(f"Prompt {prompt_id} not found, skipping prompt prepending")
+            return None
+
     async def get_openai_response(
         self,
         response_id: str,
@@ -171,6 +191,7 @@ async def create_openai_response(
         self,
         input: str | list[OpenAIResponseInput],
         model: str,
+        prompt_id: str | None = None,
         instructions: str | None = None,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -187,6 +208,7 @@
         stream_gen = self._create_streaming_response(
             input=input,
             model=model,
+            prompt_id=prompt_id,
             instructions=instructions,
             previous_response_id=previous_response_id,
             store=store,
@@ -215,6 +237,7 @@ async def _create_streaming_response(
         self,
         input: str | list[OpenAIResponseInput],
         model: str,
+        prompt_id: str | None = None,
         instructions: str | None = None,
         previous_response_id: str | None = None,
         store: bool | None = True,
@@ -226,6 +249,9 @@
         # Input preprocessing
         input = await self._prepend_previous_response(input, previous_response_id)
         messages = await convert_response_input_to_chat_messages(input)
+
+        # Prepend reusable prompt (if provided)
+        prompt_id = await self._prepend_prompt(messages, prompt_id)
         await self._prepend_instructions(messages, instructions)
 
         # Structured outputs
@@ -248,6 +274,7 @@
             ctx=ctx,
             response_id=response_id,
             created_at=created_at,
+            prompt_id=prompt_id,
             text=text,
             max_infer_iters=max_infer_iters,
             tool_executor=self.tool_executor,
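
Note on ordering: `_prepend_prompt` runs before `_prepend_instructions`, and both insert at index 0, so when a request carries both a stored prompt and instructions the instructions message ends up ahead of the prompt message. A self-contained illustration of that insertion behaviour (plain strings stand in for OpenAISystemMessageParam objects):

# Illustration only: two successive insert(0, ...) calls reverse the prepend order.
messages = ["<user input>"]
messages.insert(0, "<stored prompt text>")  # _prepend_prompt
messages.insert(0, "<instructions>")        # _prepend_instructions
assert messages == ["<instructions>", "<stored prompt text>", "<user input>"]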

llama_stack/providers/inline/agents/meta_reference/responses/streaming.py

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,7 @@ def __init__(
         ctx: ChatCompletionContext,
         response_id: str,
         created_at: int,
+        prompt_id: str | None,
         text: OpenAIResponseText,
         max_infer_iters: int,
         tool_executor,  # Will be the tool execution logic from the main class
@@ -65,6 +66,7 @@ def __init__(
         self.ctx = ctx
         self.response_id = response_id
         self.created_at = created_at
+        self.prompt_id = prompt_id
         self.text = text
         self.max_infer_iters = max_infer_iters
         self.tool_executor = tool_executor
@@ -83,6 +85,7 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
             object="response",
             status="in_progress",
             output=output_messages.copy(),
+            prompt_id=self.prompt_id,
             text=self.text,
         )
 
@@ -157,6 +160,7 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
             model=self.ctx.model,
             object="response",
             status="completed",
+            prompt_id=self.prompt_id,
             text=self.text,
             output=output_messages,
         )

tests/unit/providers/agents/meta_reference/test_openai_responses.py

Lines changed: 62 additions & 1 deletion
@@ -40,6 +40,7 @@
     OpenAIResponseFormatText,
     OpenAIUserMessageParam,
 )
+from llama_stack.apis.prompts import Prompt
 from llama_stack.apis.tools.tools import Tool, ToolGroups, ToolInvocationResult, ToolParameter, ToolRuntime
 from llama_stack.core.access_control.access_control import default_policy
 from llama_stack.providers.inline.agents.meta_reference.responses.openai_responses import (
@@ -79,16 +80,28 @@ def mock_vector_io_api():
     return vector_io_api
 
 
+@pytest.fixture
+def mock_prompts_api():
+    prompts_api = AsyncMock()
+    return prompts_api
+
+
 @pytest.fixture
 def openai_responses_impl(
-    mock_inference_api, mock_tool_groups_api, mock_tool_runtime_api, mock_responses_store, mock_vector_io_api
+    mock_inference_api,
+    mock_tool_groups_api,
+    mock_tool_runtime_api,
+    mock_responses_store,
+    mock_vector_io_api,
+    mock_prompts_api,
 ):
     return OpenAIResponsesImpl(
         inference_api=mock_inference_api,
         tool_groups_api=mock_tool_groups_api,
         tool_runtime_api=mock_tool_runtime_api,
         responses_store=mock_responses_store,
         vector_io_api=mock_vector_io_api,
+        prompts_api=mock_prompts_api,
     )
 
 
@@ -868,3 +881,51 @@ async def test_create_openai_response_with_invalid_text_format(openai_responses_
         model=model,
         text=OpenAIResponseText(format={"type": "invalid"}),
     )
+
+
+async def test_create_openai_response_with_prompt(openai_responses_impl, mock_inference_api, mock_prompts_api):
+    """Test creating an OpenAI response with a prompt."""
+    # Setup
+    input_text = "What is the capital of Ireland?"
+    model = "meta-llama/Llama-3.1-8B-Instruct"
+    prompt_id = "pmpt_1234567890abcdef1234567890abcdef1234567890abcdef"
+    prompt = Prompt(
+        prompt="You are a helpful {{ geography }} assistant. Always provide accurate information.",
+        prompt_id=prompt_id,
+        version=1,
+        variables=["geography"],
+        is_default=True,
+    )
+
+    # Mock the prompts API to return the prompt
+    mock_prompts_api.get_prompt.return_value = prompt
+    mock_inference_api.openai_chat_completion.return_value = fake_stream()
+
+    # Execute
+    result = await openai_responses_impl.create_openai_response(
+        input=input_text,
+        model=model,
+        prompt_id=prompt_id,
+    )
+
+    # Verify
+    mock_prompts_api.get_prompt.assert_called_once_with(prompt_id)
+    mock_inference_api.openai_chat_completion.assert_called_once()
+    call_args = mock_inference_api.openai_chat_completion.call_args
+    sent_messages = call_args.kwargs["messages"]
+    assert len(sent_messages) == 2
+
+    # Check that prompt was prepended as a system message
+    system_messages = [msg for msg in sent_messages if msg.role == "system"]
+    assert len(system_messages) == 1
+    assert system_messages[0].content == prompt.prompt
+
+    # Test that user input exists
+    user_messages = [msg for msg in sent_messages if msg.role == "user"]
+    assert len(user_messages) == 1
+    assert user_messages[0].content == input_text
+
+    # Verify the response
+    assert result.model == model
+    assert result.status == "completed"
+    assert result.prompt_id == prompt_id
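
Note: the new test can be run on its own with an invocation along these lines (assumed command, not part of the diff): pytest tests/unit/providers/agents/meta_reference/test_openai_responses.py -k test_create_openai_response_with_prompt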
