diff --git a/cookbook/models/openai/.gitignore b/cookbook/models/.gitignore
similarity index 100%
rename from cookbook/models/openai/.gitignore
rename to cookbook/models/.gitignore
diff --git a/cookbook/models/aws/bedrock/image_agent_bytes.py b/cookbook/models/aws/bedrock/image_agent_bytes.py
index 4573532d3c..3e96a2a610 100644
--- a/cookbook/models/aws/bedrock/image_agent_bytes.py
+++ b/cookbook/models/aws/bedrock/image_agent_bytes.py
@@ -4,6 +4,7 @@
 from agno.media import Image
 from agno.models.aws import AwsBedrock
 from agno.tools.duckduckgo import DuckDuckGoTools
+from agno.utils.media import download_image
 
 agent = Agent(
     model=AwsBedrock(id="amazon.nova-pro-v1:0"),
@@ -13,9 +14,10 @@
 image_path = Path(__file__).parent.joinpath("sample.jpg")
 
+download_image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg", save_path=str(image_path))
+
 # Read the image file content as bytes
-with open(image_path, "rb") as img_file:
-    image_bytes = img_file.read()
+image_bytes = image_path.read_bytes()
 
 agent.print_response(
     "Tell me about this image and give me the latest news about it.",
diff --git a/cookbook/models/azure/ai_foundry/image_agent_bytes.py b/cookbook/models/azure/ai_foundry/image_agent_bytes.py
index 1932321023..66f58c3d5e 100644
--- a/cookbook/models/azure/ai_foundry/image_agent_bytes.py
+++ b/cookbook/models/azure/ai_foundry/image_agent_bytes.py
@@ -3,6 +3,7 @@
 from agno.agent import Agent
 from agno.media import Image
 from agno.models.azure import AzureAIFoundry
+from agno.utils.media import download_image
 
 agent = Agent(
     model=AzureAIFoundry(id="Llama-3.2-11B-Vision-Instruct"),
@@ -11,9 +12,10 @@
 image_path = Path(__file__).parent.joinpath("sample.jpg")
 
+download_image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg", save_path=str(image_path))
+
 # Read the image file content as bytes
-with open(image_path, "rb") as img_file:
-    image_bytes = img_file.read()
+image_bytes = image_path.read_bytes()
 
 agent.print_response(
     "Tell me about this image.",
diff --git a/cookbook/models/cohere/image_agent.py b/cookbook/models/cohere/image_agent.py
new file mode 100644
index 0000000000..9086ae0b9d
--- /dev/null
+++ b/cookbook/models/cohere/image_agent.py
@@ -0,0 +1,18 @@
+from agno.agent import Agent
+from agno.media import Image
+from agno.models.cohere import Cohere
+
+agent = Agent(
+    model=Cohere(id="c4ai-aya-vision-8b"),
+    markdown=True,
+)
+
+agent.print_response(
+    "Tell me about this image.",
+    images=[
+        Image(
+            url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg"
+        )
+    ],
+    stream=True,
+)
diff --git a/cookbook/models/cohere/image_agent_bytes.py b/cookbook/models/cohere/image_agent_bytes.py
new file mode 100644
index 0000000000..680d98fa4b
--- /dev/null
+++ b/cookbook/models/cohere/image_agent_bytes.py
@@ -0,0 +1,26 @@
+from pathlib import Path
+
+from agno.agent import Agent
+from agno.media import Image
+from agno.models.cohere.chat import Cohere
+from agno.utils.media import download_image
+
+agent = Agent(
+    model=Cohere(id="c4ai-aya-vision-8b"),
+    markdown=True,
+)
+
+image_path = Path(__file__).parent.joinpath("sample.jpg")
+
+download_image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg", save_path=str(image_path))
+
+# Read the image file content as bytes
+image_bytes = image_path.read_bytes()
+
+agent.print_response(
+    "Tell me about this image.",
+    images=[
+        Image(content=image_bytes),
+    ],
+    stream=True,
+)
diff --git a/cookbook/models/cohere/image_agent_local_file.py b/cookbook/models/cohere/image_agent_local_file.py
new file mode 100644
index 0000000000..3b449cee90
--- /dev/null
+++ b/cookbook/models/cohere/image_agent_local_file.py
@@ -0,0 +1,23 @@
+from pathlib import Path
+
+from agno.agent import Agent
+from agno.media import Image
+from agno.models.cohere.chat import Cohere
+from agno.utils.media import download_image
+
+agent = Agent(
+    model=Cohere(id="c4ai-aya-vision-8b"),
+    markdown=True,
+)
+
+image_path = Path(__file__).parent.joinpath("sample.jpg")
+
+download_image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg", save_path=str(image_path))
+
+agent.print_response(
+    "Tell me about this image.",
+    images=[
+        Image(filepath=image_path),
+    ],
+    stream=True,
+)
diff --git a/cookbook/models/ibm/watsonx/image_agent_bytes.py b/cookbook/models/ibm/watsonx/image_agent_bytes.py
index 606be91522..768ffb94c8 100644
--- a/cookbook/models/ibm/watsonx/image_agent_bytes.py
+++ b/cookbook/models/ibm/watsonx/image_agent_bytes.py
@@ -3,7 +3,6 @@
 from agno.agent import Agent
 from agno.media import Image
 from agno.models.ibm import WatsonX
-from agno.tools.duckduckgo import DuckDuckGoTools
 
 agent = Agent(
     model=WatsonX(id="meta-llama/llama-3-2-11b-vision-instruct"),
@@ -13,8 +12,7 @@
 image_path = Path(__file__).parent.joinpath("sample.jpg")
 
 # Read the image file content as bytes
-with open(image_path, "rb") as img_file:
-    image_bytes = img_file.read()
+image_bytes = image_path.read_bytes()
 
 agent.print_response(
     "Tell me about this image and and give me the latest news about it.",
diff --git a/cookbook/models/openai/image_agent_bytes.py b/cookbook/models/openai/image_agent_bytes.py
index f703ef1318..a359eebf71 100644
--- a/cookbook/models/openai/image_agent_bytes.py
+++ b/cookbook/models/openai/image_agent_bytes.py
@@ -4,6 +4,7 @@
 from agno.media import Image
 from agno.models.openai import OpenAIChat
 from agno.tools.duckduckgo import DuckDuckGoTools
+from agno.utils.media import download_image
 
 agent = Agent(
     model=OpenAIChat(id="gpt-4o"),
@@ -13,9 +14,10 @@
 image_path = Path(__file__).parent.joinpath("sample.jpg")
 
+download_image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg", save_path=str(image_path))
+
 # Read the image file content as bytes
-with open(image_path, "rb") as img_file:
-    image_bytes = img_file.read()
+image_bytes = image_path.read_bytes()
 
 agent.print_response(
     "Tell me about this image and give me the latest news about it.",
diff --git a/cookbook/models/together/image_agent_bytes.py b/cookbook/models/together/image_agent_bytes.py
index 4f24d678b4..82c3f35e31 100644
--- a/cookbook/models/together/image_agent_bytes.py
+++ b/cookbook/models/together/image_agent_bytes.py
@@ -12,8 +12,7 @@
 image_path = Path(__file__).parent.joinpath("sample.jpg")
 
 # Read the image file content as bytes
-with open(image_path, "rb") as img_file:
-    image_bytes = img_file.read()
+image_bytes = image_path.read_bytes()
 
 agent.print_response(
     "Tell me about this image",
diff --git a/cookbook/models/xai/image_agent_bytes.py b/cookbook/models/xai/image_agent_bytes.py
index 7f7c9f1da4..232236ceb3 100644
--- a/cookbook/models/xai/image_agent_bytes.py
+++ b/cookbook/models/xai/image_agent_bytes.py
@@ -4,6 +4,7 @@
 from agno.media import Image
 from agno.models.xai import xAI
 from agno.tools.duckduckgo import DuckDuckGoTools
+from agno.utils.media import download_image
 
 agent = Agent(
     model=xAI(id="grok-2-vision-latest"),
@@ -13,9 +14,10 @@
 image_path = Path(__file__).parent.joinpath("sample.jpg")
 
+download_image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg", save_path=str(image_path))
+
 # Read the image file content as bytes
-with open(image_path, "rb") as img_file:
-    image_bytes = img_file.read()
+image_bytes = image_path.read_bytes()
 
 agent.print_response(
     "Tell me about this image and give me the latest news about it.",
diff --git a/libs/agno/agno/models/cohere/chat.py b/libs/agno/agno/models/cohere/chat.py
index 375ebee64d..b3ac145d4a 100644
--- a/libs/agno/agno/models/cohere/chat.py
+++ b/libs/agno/agno/models/cohere/chat.py
@@ -1,8 +1,10 @@
 from dataclasses import dataclass
 from os import getenv
-from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Tuple
+from pathlib import Path
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Sequence, Tuple
 
 from agno.exceptions import ModelProviderError
+from agno.media import Image
 from agno.models.base import MessageData, Model
 from agno.models.message import Message
 from agno.models.response import ModelResponse
@@ -17,6 +19,78 @@
     raise ImportError("`cohere` not installed. Please install using `pip install cohere`")
 
 
+def _format_images_for_message(message: Message, images: Sequence[Image]) -> List[Dict[str, Any]]:
+    """
+    Format images into the format expected by Cohere.
+    """
+
+    # Create a default message content with text
+    message_content_with_image: List[Dict[str, Any]] = [{"type": "text", "text": message.content}]
+
+    # Add images to the message content
+    for image in images:
+        try:
+            if image.content is not None:
+                image_content = image.content
+            elif image.url is not None:
+                image_content = image.image_url_content
+            elif image.filepath is not None:
+                if isinstance(image.filepath, Path):
+                    image_content = image.filepath.read_bytes()
+                else:
+                    with open(image.filepath, "rb") as f:
+                        image_content = f.read()
+            else:
+                logger.warning(f"Unsupported image format: {image}")
+                continue
+
+            if image_content is not None:
+                import base64
+
+                base64_image = base64.b64encode(image_content).decode("utf-8")
+                image_url = f"data:image/jpeg;base64,{base64_image}"
+                image_payload = {"type": "image_url", "image_url": {"url": image_url}}
+                message_content_with_image.append(image_payload)
+
+        except Exception as e:
+            logger.error(f"Failed to process image: {str(e)}")
+
+    # Return the text content together with any formatted images
+    return message_content_with_image
+
+
+def _format_messages(messages: List[Message]) -> List[Dict[str, Any]]:
+    """
+    Format messages for the Cohere API.
+
+    Args:
+        messages (List[Message]): The list of messages.
+
+    Returns:
+        List[Dict[str, Any]]: The formatted messages.
+ """ + formatted_messages = [] + for message in messages: + message_dict = { + "role": message.role, + "content": message.content, + "name": message.name, + "tool_call_id": message.tool_call_id, + "tool_calls": message.tool_calls, + } + + if message.images is not None and len(message.images) > 0: + # Ignore non-string message content + if isinstance(message.content, str): + message_content_with_image = _format_images_for_message(message=message, images=message.images) + if len(message_content_with_image) > 1: + message_dict["content"] = message_content_with_image + + message_dict = {k: v for k, v in message_dict.items() if v is not None} + formatted_messages.append(message_dict) + return formatted_messages + + @dataclass class Cohere(Model): id: str = "command-r-plus" @@ -116,29 +190,6 @@ def request_kwargs(self) -> Dict[str, Any]: _request_params.update(self.request_params) return _request_params - def _format_messages(self, messages: List[Message]) -> List[Dict[str, Any]]: - """ - Format messages for the Cohere API. - - Args: - messages (List[Message]): The list of messages. - - Returns: - List[Dict[str, Any]]: The formatted messages. - """ - formatted_messages = [] - for message in messages: - message_dict = { - "role": message.role, - "content": message.content, - "name": message.name, - "tool_call_id": message.tool_call_id, - "tool_calls": message.tool_calls, - } - message_dict = {k: v for k, v in message_dict.items() if v is not None} - formatted_messages.append(message_dict) - return formatted_messages - def invoke(self, messages: List[Message]) -> ChatResponse: """ Invoke a non-streamed chat response from the Cohere API. @@ -153,7 +204,7 @@ def invoke(self, messages: List[Message]) -> ChatResponse: request_kwargs = self.request_kwargs try: - return self.get_client().chat(model=self.id, messages=self._format_messages(messages), **request_kwargs) # type: ignore + return self.get_client().chat(model=self.id, messages=_format_messages(messages), **request_kwargs) # type: ignore except Exception as e: logger.error(f"Unexpected error calling Cohere API: {str(e)}") raise ModelProviderError(message=str(e), model_name=self.name, model_id=self.id) from e @@ -173,7 +224,7 @@ def invoke_stream(self, messages: List[Message]) -> Iterator[StreamedChatRespons try: return self.get_client().chat_stream( model=self.id, - messages=self._format_messages(messages), # type: ignore + messages=_format_messages(messages), # type: ignore **request_kwargs, ) except Exception as e: @@ -195,7 +246,7 @@ async def ainvoke(self, messages: List[Message]) -> ChatResponse: try: return await self.get_async_client().chat( model=self.id, - messages=self._format_messages(messages), # type: ignore + messages=_format_messages(messages), # type: ignore **request_kwargs, ) except Exception as e: @@ -217,7 +268,7 @@ async def ainvoke_stream(self, messages: List[Message]) -> AsyncIterator[Streame try: async for response in self.get_async_client().chat_stream( model=self.id, - messages=self._format_messages(messages), # type: ignore + messages=_format_messages(messages), # type: ignore **request_kwargs, ): yield response diff --git a/libs/agno/agno/utils/media.py b/libs/agno/agno/utils/media.py index 5f8fd0cd0e..6eb56db98e 100644 --- a/libs/agno/agno/utils/media.py +++ b/libs/agno/agno/utils/media.py @@ -3,7 +3,7 @@ import requests -def download_image(url, save_path): +def download_image(url: str, save_path: str) -> bool: """ Downloads an image from the specified URL and saves it to the given local path. 
     Parameters:
diff --git a/libs/agno/pyproject.toml b/libs/agno/pyproject.toml
index 23422d00b5..ce08959fc1 100644
--- a/libs/agno/pyproject.toml
+++ b/libs/agno/pyproject.toml
@@ -75,7 +75,7 @@ postgres = ["psycopg-binary", "psycopg"]
 # Dependencies for Vector databases
 pgvector = ["pgvector"]
 chromadb = ["chromadb"]
-lancedb = ["lancedb", "tantivy"]
+lancedb = ["lancedb==0.20.0", "tantivy"]
 qdrant = ["qdrant-client"]
 cassandra = ["cassio"]
 mongodb = ["pymongo[srv]"]
diff --git a/libs/agno/tests/integration/models/aws/bedrock/test_multimodal.py b/libs/agno/tests/integration/models/aws/bedrock/test_multimodal.py
index 659787c2bf..650bef0885 100644
--- a/libs/agno/tests/integration/models/aws/bedrock/test_multimodal.py
+++ b/libs/agno/tests/integration/models/aws/bedrock/test_multimodal.py
@@ -11,11 +11,10 @@ def test_image_input_bytes():
     """
     agent = Agent(model=AwsBedrock(id="amazon.nova-pro-v1:0"), markdown=True, telemetry=False, monitoring=False)
 
-    image_path = Path(__file__).parent.joinpath("../../sample_image.jpg")
+    image_path = Path(__file__).parent.parent.parent.joinpath("sample_image.jpg")
 
     # Read the image file content as bytes
-    with open(image_path, "rb") as img_file:
-        image_bytes = img_file.read()
+    image_bytes = image_path.read_bytes()
 
     response = agent.run(
         "Tell me about this image.",
diff --git a/libs/agno/tests/integration/models/cohere/test_multimodal.py b/libs/agno/tests/integration/models/cohere/test_multimodal.py
new file mode 100644
index 0000000000..663bb4c496
--- /dev/null
+++ b/libs/agno/tests/integration/models/cohere/test_multimodal.py
@@ -0,0 +1,78 @@
+from pathlib import Path
+
+import pytest
+
+from agno.agent.agent import Agent
+from agno.media import Image
+from agno.models.cohere.chat import Cohere
+from agno.tools.duckduckgo import DuckDuckGoTools
+
+
+def test_image_input():
+    agent = Agent(
+        model=Cohere(id="c4ai-aya-vision-8b"),
+        add_history_to_messages=True,
+        markdown=True,
+        telemetry=False,
+        monitoring=False,
+    )
+
+    response = agent.run(
+        "Tell me about this image.",
+        images=[Image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg")],
+    )
+
+    assert "golden" in response.content.lower()
+
+    # Just check it doesn't break on subsequent messages
+    response = agent.run("Where can I find more information?")
+    assert [message.role for message in response.messages] == ["system", "user", "assistant", "user", "assistant"]
+
+
+def test_image_input_bytes():
+    agent = Agent(model=Cohere(id="c4ai-aya-vision-8b"), telemetry=False, monitoring=False)
+
+    image_path = Path(__file__).parent.parent.joinpath("sample_image.jpg")
+
+    # Read the image file content as bytes
+    image_bytes = image_path.read_bytes()
+
+    response = agent.run(
+        "Tell me about this image.",
+        images=[Image(content=image_bytes)],
+    )
+
+    assert "golden" in response.content.lower()
+    assert "bridge" in response.content.lower()
+
+
+def test_image_input_local_file():
+    agent = Agent(model=Cohere(id="c4ai-aya-vision-8b"), telemetry=False, monitoring=False)
+
+    image_path = Path(__file__).parent.parent.joinpath("sample_image.jpg")
+
+    response = agent.run(
+        "Tell me about this image.",
+        images=[Image(filepath=image_path)],
+    )
+
+    assert "golden" in response.content.lower()
+    assert "bridge" in response.content.lower()
+
+
+@pytest.mark.skip(reason="Image with tool call is not supported yet.")
+def test_image_input_with_tool_call():
+    agent = Agent(
+        model=Cohere(id="c4ai-aya-vision-8b"),
+        tools=[DuckDuckGoTools()],
+        markdown=True,
+        telemetry=False,
+        monitoring=False,
+    )
+
+    response = agent.run(
+        "Tell me about this image and give me the latest news about it.",
+        images=[Image(url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg")],
+    )
+
+    assert "golden" in response.content.lower()
diff --git a/libs/agno/tests/integration/models/huggingface/__init__.py b/libs/agno/tests/integration/models/huggingface/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/libs/agno/tests/integration/models/ibm/watsonx/test_multimodal.py b/libs/agno/tests/integration/models/ibm/watsonx/test_multimodal.py
index 8b8de35d31..052dba6896 100644
--- a/libs/agno/tests/integration/models/ibm/watsonx/test_multimodal.py
+++ b/libs/agno/tests/integration/models/ibm/watsonx/test_multimodal.py
@@ -20,11 +20,10 @@ def test_image_input():
 def test_image_input_bytes():
     agent = Agent(model=WatsonX(id="meta-llama/llama-3-2-11b-vision-instruct"), telemetry=False, monitoring=False)
 
-    image_path = Path(__file__).parent.joinpath("../../sample_image.jpg")
+    image_path = Path(__file__).parent.parent.parent.joinpath("sample_image.jpg")
 
     # Read the image file content as bytes
-    with open(image_path, "rb") as img_file:
-        image_bytes = img_file.read()
+    image_bytes = image_path.read_bytes()
 
     response = agent.run(
         "Tell me about this image.",