From 958d0dc515992f07691154635045dfd4f17a5f69 Mon Sep 17 00:00:00 2001 From: Anastas Stoyanovsky Date: Wed, 19 Nov 2025 14:33:54 -0500 Subject: [PATCH] Pass parallel_tool_calls directly and document intended usage in integration test Signed-off-by: Anastas Stoyanovsky --- docs/docs/providers/agents/index.mdx | 4 +- docs/docs/providers/batches/index.mdx | 24 +-- docs/docs/providers/eval/index.mdx | 4 +- docs/docs/providers/files/index.mdx | 4 +- docs/docs/providers/inference/index.mdx | 20 +-- docs/docs/providers/safety/index.mdx | 4 +- .../meta_reference/responses/streaming.py | 1 + .../agents/test_openai_responses.py | 166 ------------------ 8 files changed, 31 insertions(+), 196 deletions(-) diff --git a/docs/docs/providers/agents/index.mdx b/docs/docs/providers/agents/index.mdx index 200a3b9caa..1f7e0c7888 100644 --- a/docs/docs/providers/agents/index.mdx +++ b/docs/docs/providers/agents/index.mdx @@ -2,7 +2,7 @@ description: | Agents - APIs for creating and interacting with agentic systems. + APIs for creating and interacting with agentic systems. sidebar_label: Agents title: Agents --- @@ -13,6 +13,6 @@ title: Agents Agents - APIs for creating and interacting with agentic systems. +APIs for creating and interacting with agentic systems. This section contains documentation for all available providers for the **agents** API. diff --git a/docs/docs/providers/batches/index.mdx b/docs/docs/providers/batches/index.mdx index 18fd499459..23b7df14b7 100644 --- a/docs/docs/providers/batches/index.mdx +++ b/docs/docs/providers/batches/index.mdx @@ -1,15 +1,15 @@ --- description: | The Batches API enables efficient processing of multiple requests in a single operation, - particularly useful for processing large datasets, batch evaluation workflows, and - cost-effective inference at scale. + particularly useful for processing large datasets, batch evaluation workflows, and + cost-effective inference at scale. - The API is designed to allow use of openai client libraries for seamless integration. + The API is designed to allow use of openai client libraries for seamless integration. - This API provides the following extensions: - - idempotent batch creation + This API provides the following extensions: + - idempotent batch creation - Note: This API is currently under active development and may undergo changes. + Note: This API is currently under active development and may undergo changes. sidebar_label: Batches title: Batches --- @@ -19,14 +19,14 @@ title: Batches ## Overview The Batches API enables efficient processing of multiple requests in a single operation, - particularly useful for processing large datasets, batch evaluation workflows, and - cost-effective inference at scale. +particularly useful for processing large datasets, batch evaluation workflows, and +cost-effective inference at scale. - The API is designed to allow use of openai client libraries for seamless integration. +The API is designed to allow use of openai client libraries for seamless integration. - This API provides the following extensions: - - idempotent batch creation +This API provides the following extensions: + - idempotent batch creation - Note: This API is currently under active development and may undergo changes. +Note: This API is currently under active development and may undergo changes. This section contains documentation for all available providers for the **batches** API. 
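The Batches API summarized above is designed to be driven with the stock openai client. A minimal sketch of the intended call pattern, assuming a Llama Stack server at a placeholder base URL and a pre-built `requests.jsonl` of chat-completion requests (both hypothetical):

```python
from openai import OpenAI

# Placeholder base URL for a locally running, OpenAI-compatible Llama Stack server.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Upload the JSONL payload, then create a batch over the chat-completions endpoint.
batch_input = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=batch_input.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(batch.id, batch.status)  # e.g. "batch_...", "validating"
```

Each line of the JSONL file carries a `custom_id`, `method`, `url`, and `body`, mirroring the OpenAI batch input format.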
diff --git a/docs/docs/providers/eval/index.mdx b/docs/docs/providers/eval/index.mdx index 3543db2461..a6e35d6118 100644 --- a/docs/docs/providers/eval/index.mdx +++ b/docs/docs/providers/eval/index.mdx @@ -2,7 +2,7 @@ description: | Evaluations - Llama Stack Evaluation API for running evaluations on model and agent candidates. + Llama Stack Evaluation API for running evaluations on model and agent candidates. sidebar_label: Eval title: Eval --- @@ -13,6 +13,6 @@ title: Eval Evaluations - Llama Stack Evaluation API for running evaluations on model and agent candidates. +Llama Stack Evaluation API for running evaluations on model and agent candidates. This section contains documentation for all available providers for the **eval** API. diff --git a/docs/docs/providers/files/index.mdx b/docs/docs/providers/files/index.mdx index 0b28e9aee4..0540c5c3e0 100644 --- a/docs/docs/providers/files/index.mdx +++ b/docs/docs/providers/files/index.mdx @@ -2,7 +2,7 @@ description: | Files - This API is used to upload documents that can be used with other Llama Stack APIs. + This API is used to upload documents that can be used with other Llama Stack APIs. sidebar_label: Files title: Files --- @@ -13,6 +13,6 @@ title: Files Files - This API is used to upload documents that can be used with other Llama Stack APIs. +This API is used to upload documents that can be used with other Llama Stack APIs. This section contains documentation for all available providers for the **files** API. diff --git a/docs/docs/providers/inference/index.mdx b/docs/docs/providers/inference/index.mdx index e2d94bfafd..ad050e5013 100644 --- a/docs/docs/providers/inference/index.mdx +++ b/docs/docs/providers/inference/index.mdx @@ -2,12 +2,12 @@ description: | Inference - Llama Stack Inference API for generating completions, chat completions, and embeddings. + Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Three kinds of models are supported: - - LLM models: these models generate "raw" and "chat" (conversational) completions. - - Embedding models: these models generate embeddings to be used for semantic search. - - Rerank models: these models reorder the documents based on their relevance to a query. + This API provides the raw interface to the underlying models. Three kinds of models are supported: + - LLM models: these models generate "raw" and "chat" (conversational) completions. + - Embedding models: these models generate embeddings to be used for semantic search. + - Rerank models: these models reorder the documents based on their relevance to a query. sidebar_label: Inference title: Inference --- @@ -18,11 +18,11 @@ title: Inference Inference - Llama Stack Inference API for generating completions, chat completions, and embeddings. +Llama Stack Inference API for generating completions, chat completions, and embeddings. - This API provides the raw interface to the underlying models. Three kinds of models are supported: - - LLM models: these models generate "raw" and "chat" (conversational) completions. - - Embedding models: these models generate embeddings to be used for semantic search. - - Rerank models: these models reorder the documents based on their relevance to a query. +This API provides the raw interface to the underlying models. Three kinds of models are supported: +- LLM models: these models generate "raw" and "chat" (conversational) completions. 
+- Embedding models: these models generate embeddings to be used for semantic search. +- Rerank models: these models reorder the documents based on their relevance to a query. This section contains documentation for all available providers for the **inference** API. diff --git a/docs/docs/providers/safety/index.mdx b/docs/docs/providers/safety/index.mdx index 0c13de28c6..e7205f4ada 100644 --- a/docs/docs/providers/safety/index.mdx +++ b/docs/docs/providers/safety/index.mdx @@ -2,7 +2,7 @@ description: | Safety - OpenAI-compatible Moderations API. + OpenAI-compatible Moderations API. sidebar_label: Safety title: Safety --- @@ -13,6 +13,6 @@ title: Safety Safety - OpenAI-compatible Moderations API. +OpenAI-compatible Moderations API. This section contains documentation for all available providers for the **safety** API. diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py index cdbd872443..5591693454 100644 --- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py @@ -242,6 +242,7 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]: messages=messages, # Pydantic models are dict-compatible but mypy treats them as distinct types tools=self.ctx.chat_tools, # type: ignore[arg-type] + parallel_tool_calls=self.parallel_tool_calls, stream=True, temperature=self.ctx.temperature, response_format=response_format, diff --git a/tests/integration/agents/test_openai_responses.py b/tests/integration/agents/test_openai_responses.py index 057cee774a..d413d52016 100644 --- a/tests/integration/agents/test_openai_responses.py +++ b/tests/integration/agents/test_openai_responses.py @@ -516,169 +516,3 @@ def test_response_with_instructions(openai_client, client_with_models, text_mode # Verify instructions from previous response was not carried over to the next response assert response_with_instructions2.instructions == instructions2 - - -@pytest.mark.skip(reason="Tool calling is not reliable.") -def test_max_tool_calls_with_function_tools(openai_client, client_with_models, text_model_id): - """Test handling of max_tool_calls with function tools in responses.""" - if isinstance(client_with_models, LlamaStackAsLibraryClient): - pytest.skip("OpenAI responses are not supported when testing with library client yet.") - - client = openai_client - max_tool_calls = 1 - - tools = [ - { - "type": "function", - "name": "get_weather", - "description": "Get weather information for a specified location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city name (e.g., 'New York', 'London')", - }, - }, - }, - }, - { - "type": "function", - "name": "get_time", - "description": "Get current time for a specified location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city name (e.g., 'New York', 'London')", - }, - }, - }, - }, - ] - - # First create a response that triggers function tools - response = client.responses.create( - model=text_model_id, - input="Can you tell me the weather in Paris and the current time?", - tools=tools, - stream=False, - max_tool_calls=max_tool_calls, - ) - - # Verify we got two function calls and that the max_tool_calls do not affect function tools - assert len(response.output) == 2 - assert response.output[0].type == 
"function_call" - assert response.output[0].name == "get_weather" - assert response.output[0].status == "completed" - assert response.output[1].type == "function_call" - assert response.output[1].name == "get_time" - assert response.output[0].status == "completed" - - # Verify we have a valid max_tool_calls field - assert response.max_tool_calls == max_tool_calls - - -def test_max_tool_calls_invalid(openai_client, client_with_models, text_model_id): - """Test handling of invalid max_tool_calls in responses.""" - if isinstance(client_with_models, LlamaStackAsLibraryClient): - pytest.skip("OpenAI responses are not supported when testing with library client yet.") - - client = openai_client - - input = "Search for today's top technology news." - invalid_max_tool_calls = 0 - tools = [ - {"type": "web_search"}, - ] - - # Create a response with an invalid max_tool_calls value i.e. 0 - # Handle ValueError from LLS and BadRequestError from OpenAI client - with pytest.raises((ValueError, BadRequestError)) as excinfo: - client.responses.create( - model=text_model_id, - input=input, - tools=tools, - stream=False, - max_tool_calls=invalid_max_tool_calls, - ) - - error_message = str(excinfo.value) - assert f"Invalid max_tool_calls={invalid_max_tool_calls}; should be >= 1" in error_message, ( - f"Expected error message about invalid max_tool_calls, got: {error_message}" - ) - - -def test_max_tool_calls_with_builtin_tools(openai_client, client_with_models, text_model_id): - """Test handling of max_tool_calls with built-in tools in responses.""" - if isinstance(client_with_models, LlamaStackAsLibraryClient): - pytest.skip("OpenAI responses are not supported when testing with library client yet.") - - client = openai_client - - input = "Search for today's top technology and a positive news story. You MUST make exactly two separate web search calls." 
- max_tool_calls = [1, 5] - tools = [ - {"type": "web_search"}, - ] - - # First create a response that triggers web_search tools without max_tool_calls - response = client.responses.create( - model=text_model_id, - input=input, - tools=tools, - stream=False, - ) - - # Verify we got two web search calls followed by a message - assert len(response.output) == 3 - assert response.output[0].type == "web_search_call" - assert response.output[0].status == "completed" - assert response.output[1].type == "web_search_call" - assert response.output[1].status == "completed" - assert response.output[2].type == "message" - assert response.output[2].status == "completed" - assert response.output[2].role == "assistant" - - # Next create a response that triggers web_search tools with max_tool_calls set to 1 - response_2 = client.responses.create( - model=text_model_id, - input=input, - tools=tools, - stream=False, - max_tool_calls=max_tool_calls[0], - ) - - # Verify we got one web search tool call followed by a message - assert len(response_2.output) == 2 - assert response_2.output[0].type == "web_search_call" - assert response_2.output[0].status == "completed" - assert response_2.output[1].type == "message" - assert response_2.output[1].status == "completed" - assert response_2.output[1].role == "assistant" - - # Verify we have a valid max_tool_calls field - assert response_2.max_tool_calls == max_tool_calls[0] - - # Finally create a response that triggers web_search tools with max_tool_calls set to 5 - response_3 = client.responses.create( - model=text_model_id, - input=input, - tools=tools, - stream=False, - max_tool_calls=max_tool_calls[1], - ) - - # Verify we got two web search calls followed by a message - assert len(response_3.output) == 3 - assert response_3.output[0].type == "web_search_call" - assert response_3.output[0].status == "completed" - assert response_3.output[1].type == "web_search_call" - assert response_3.output[1].status == "completed" - assert response_3.output[2].type == "message" - assert response_3.output[2].status == "completed" - assert response_3.output[2].role == "assistant" - - # Verify we have a valid max_tool_calls field - assert response_3.max_tool_calls == max_tool_calls[1]
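The one-line change in streaming.py above forwards the request's `parallel_tool_calls` flag to the underlying chat completion rather than silently dropping it. A minimal sketch of a request that exercises the flag end to end, assuming the standard openai client, a placeholder base URL and model id, and a hypothetical `get_weather` function tool mirroring the removed tests:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholder URL

response = client.responses.create(
    model="meta-llama/Llama-3.3-70B-Instruct",  # placeholder model id
    input="What's the weather in Paris, and what time is it in London?",
    tools=[
        {
            "type": "function",
            "name": "get_weather",  # hypothetical tool
            "description": "Get weather information for a specified location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        },
    ],
    parallel_tool_calls=False,  # now forwarded to the underlying chat completion
)
print([item.type for item in response.output])
```

With `parallel_tool_calls=False`, an OpenAI-compatible backend should emit at most one function call per model turn; the default (`True`) permits several calls in a single turn.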
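For the inference docs touched above, the same OpenAI-compatible surface covers the first two of the three model kinds. A quick sketch, with placeholder model ids:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")  # placeholder URL

# Chat (conversational) completion from an LLM model.
chat = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",  # placeholder model id
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(chat.choices[0].message.content)

# Embeddings from an embedding model, to be used for semantic search.
emb = client.embeddings.create(
    model="all-MiniLM-L6-v2",  # placeholder embedding model id
    input=["vectors for semantic search"],
)
print(len(emb.data[0].embedding))
```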