Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion autogen/agentchat/contrib/multimodal_conversable_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,20 @@ def generate_oai_reply(
messages = self._oai_messages[sender]

messages_with_b64_img = message_formatter_pil_to_b64(self._oai_system_message + messages)

# Fix tool response format for OpenAI API
fixed_messages = []
for msg in messages_with_b64_img:
if isinstance(msg, dict) and msg.get("role") == "tool" and "tool_responses" in msg:
# Unpack tool_responses to individual tool messages with tool_call_id
for tool_response in msg["tool_responses"]:
fixed_messages.append(tool_response)
else:
fixed_messages.append(msg)

# TODO: #1143 handle token limit exceeded error
response = client.create(
context=messages[-1].pop("context", None), messages=messages_with_b64_img, agent=self.name
context=messages[-1].pop("context", None) if messages else None, messages=fixed_messages, agent=self.name
)

# TODO: lines 301 and 271 convert messages to dict; this can be removed after ChatCompletionMessage_to_dict is merged.
Expand Down
9 changes: 9 additions & 0 deletions autogen/events/agent_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,15 @@ def create_received_event_model(
if role == "function":
return FunctionResponseEvent(**event, sender=sender.name, recipient=recipient.name, uuid=uuid)
if role == "tool":
# Handle multimodal content format - extract text if content is a list of dicts
content = event.get('content')
if isinstance(content, list) and len(content) > 0 and isinstance(content[0], dict):
# Extract text from multimodal format [{'type': 'text', 'text': '...'}]
text_parts = []
for item in content:
if isinstance(item, dict) and item.get('type') == 'text':
text_parts.append(item.get('text', ''))
event['content'] = ''.join(text_parts)
return ToolResponseEvent(**event, sender=sender.name, recipient=recipient.name, uuid=uuid)

# Role is neither function nor tool
Expand Down