From 4e6aae725fcbd536c820c02ba60982f770b7bae6 Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Mon, 30 Dec 2024 17:25:30 +0000 Subject: [PATCH 1/6] feat: initial tool --- .../extension/coze_python_async/extension.py | 3 + .../extension/dify_python/extension.py | 3 + .../extension/gemini_v2v_python/extension.py | 3 + .../extension/glue_python_async/extension.py | 3 + .../extension/image_generate_tool/README.md | 29 ++++++ .../extension/image_generate_tool/__init__.py | 6 ++ .../extension/image_generate_tool/addon.py | 20 +++++ .../image_generate_tool/extension.py | 48 ++++++++++ .../image_generate_tool/manifest.json | 88 +++++++++++++++++++ .../image_generate_tool/property.json | 1 + .../image_generate_tool/requirements.txt | 0 .../openai_chatgpt_python/extension.py | 10 ++- .../extension/openai_chatgpt_python/openai.py | 24 ++++- .../extension/openai_v2v_python/extension.py | 3 + .../interface/ten_ai_base/const.py | 1 + .../ten_ai_base/interface/ten_ai_base/llm.py | 8 +- 16 files changed, 246 insertions(+), 4 deletions(-) create mode 100644 agents/ten_packages/extension/image_generate_tool/README.md create mode 100644 agents/ten_packages/extension/image_generate_tool/__init__.py create mode 100644 agents/ten_packages/extension/image_generate_tool/addon.py create mode 100644 agents/ten_packages/extension/image_generate_tool/extension.py create mode 100644 agents/ten_packages/extension/image_generate_tool/manifest.json create mode 100644 agents/ten_packages/extension/image_generate_tool/property.json create mode 100644 agents/ten_packages/extension/image_generate_tool/requirements.txt diff --git a/agents/ten_packages/extension/coze_python_async/extension.py b/agents/ten_packages/extension/coze_python_async/extension.py index eb70133e..b8273be6 100644 --- a/agents/ten_packages/extension/coze_python_async/extension.py +++ b/agents/ten_packages/extension/coze_python_async/extension.py @@ -168,6 +168,9 @@ async def on_call_chat_completion( ) -> any: raise RuntimeError("Not implemented") + async def on_generate_image(self, async_ten_env, prompt)->str: + raise RuntimeError("Not implemented") + async def on_data_chat_completion( self, ten_env: AsyncTenEnv, **kargs: LLMDataCompletionArgs ) -> None: diff --git a/agents/ten_packages/extension/dify_python/extension.py b/agents/ten_packages/extension/dify_python/extension.py index baf70f72..f2957f25 100644 --- a/agents/ten_packages/extension/dify_python/extension.py +++ b/agents/ten_packages/extension/dify_python/extension.py @@ -171,6 +171,9 @@ async def on_video_frame( async def on_call_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError + + async def on_generate_image(self, async_ten_env, prompt)->str: + return NotImplementedError async def on_tools_update(self, async_ten_env, tool): raise NotImplementedError diff --git a/agents/ten_packages/extension/gemini_v2v_python/extension.py b/agents/ten_packages/extension/gemini_v2v_python/extension.py index 77463fde..ab8f11a6 100644 --- a/agents/ten_packages/extension/gemini_v2v_python/extension.py +++ b/agents/ten_packages/extension/gemini_v2v_python/extension.py @@ -739,5 +739,8 @@ async def _update_usage(self, usage: dict) -> None: async def on_call_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError + async def on_generate_image(self, async_ten_env, prompt)->str: + return NotImplementedError + async def on_data_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError diff --git a/agents/ten_packages/extension/glue_python_async/extension.py 
b/agents/ten_packages/extension/glue_python_async/extension.py index cae63e44..bdc69848 100644 --- a/agents/ten_packages/extension/glue_python_async/extension.py +++ b/agents/ten_packages/extension/glue_python_async/extension.py @@ -224,6 +224,9 @@ async def on_call_chat_completion( ) -> any: raise RuntimeError("Not implemented") + async def on_generate_image(self, async_ten_env, prompt) -> str: + return RuntimeError("Not implemented") + async def on_data_chat_completion( self, ten_env: AsyncTenEnv, **kargs: LLMDataCompletionArgs ) -> None: diff --git a/agents/ten_packages/extension/image_generate_tool/README.md b/agents/ten_packages/extension/image_generate_tool/README.md new file mode 100644 index 00000000..73c27cb2 --- /dev/null +++ b/agents/ten_packages/extension/image_generate_tool/README.md @@ -0,0 +1,29 @@ +# image_generate_tool + + + +## Features + + + +- xxx feature + +## API + +Refer to `api` definition in [manifest.json] and default values in [property.json](property.json). + + + +## Development + +### Build + + + +### Unit test + + + +## Misc + + diff --git a/agents/ten_packages/extension/image_generate_tool/__init__.py b/agents/ten_packages/extension/image_generate_tool/__init__.py new file mode 100644 index 00000000..72593ab2 --- /dev/null +++ b/agents/ten_packages/extension/image_generate_tool/__init__.py @@ -0,0 +1,6 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +from . import addon diff --git a/agents/ten_packages/extension/image_generate_tool/addon.py b/agents/ten_packages/extension/image_generate_tool/addon.py new file mode 100644 index 00000000..647204cf --- /dev/null +++ b/agents/ten_packages/extension/image_generate_tool/addon.py @@ -0,0 +1,20 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. +# +from ten import ( + Addon, + register_addon_as_extension, + TenEnv, +) +from .extension import ImageGenerateToolExtension + + +@register_addon_as_extension("image_generate_tool") +class ImageGenerateToolExtensionAddon(Addon): + + def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None: + ten_env.log_info("on_create_instance") + ten_env.on_create_instance_done( + ImageGenerateToolExtension(name), context) diff --git a/agents/ten_packages/extension/image_generate_tool/extension.py b/agents/ten_packages/extension/image_generate_tool/extension.py new file mode 100644 index 00000000..65c72de3 --- /dev/null +++ b/agents/ten_packages/extension/image_generate_tool/extension.py @@ -0,0 +1,48 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. 
+# +from ten import ( + TenEnv, + AsyncTenEnv, +) +from ten_ai_base import ( + AsyncLLMToolBaseExtension, LLMToolMetadata, LLMToolResult, BaseConfig +) +from dataclasses import dataclass + + +@dataclass +class ImageGenerateToolConfig(BaseConfig): + # TODO: add extra config fields here + pass + + +class ImageGenerateToolExtension(AsyncLLMToolBaseExtension): + def __init__(self, name: str): + super().__init__(name) + self.config = None + + async def on_start(self, ten_env: AsyncTenEnv) -> None: + await super().on_start(ten_env) + + # initialize configuration + self.config = await ImageGenerateToolConfig.create_async(ten_env=ten_env) + ten_env.log_info(f"config: {self.config}") + + # Implement this method to construct and start your resources. + ten_env.log_debug("TODO: on_start") + + async def on_stop(self, ten_env: AsyncTenEnv) -> None: + await super().on_stop(ten_env) + + #Implement this method to stop and destruct your resources. + ten_env.log_debug("TODO: on_stop") + + def get_tool_metadata(self, ten_env: TenEnv) -> list[LLMToolMetadata]: + ten_env.log_debug("TODO: get_tool_metadata") + return [] + + async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMToolResult | None: + ten_env.log_debug(f"TODO: run_tool {name} {args}") diff --git a/agents/ten_packages/extension/image_generate_tool/manifest.json b/agents/ten_packages/extension/image_generate_tool/manifest.json new file mode 100644 index 00000000..b9e81545 --- /dev/null +++ b/agents/ten_packages/extension/image_generate_tool/manifest.json @@ -0,0 +1,88 @@ +{ + "type": "extension", + "name": "image_generate_tool", + "version": "0.1.0", + "dependencies": [ + { + "type": "system", + "name": "ten_runtime_python", + "version": "0.6" + } + ], + "package": { + "include": [ + "manifest.json", + "property.json", + "requirements.txt", + "**.tent", + "**.py", + "README.md" + ] + }, + "api": { + "property": {}, + "cmd_in": [ + { + "name": "tool_call", + "property": { + "name": { + "type": "string" + }, + "arguments": { + "type": "string" + } + }, + "required": [ + "name" + ], + "result": { + "property": { + "tool_result": { + "type": "string" + } + }, + "required": [ + "tool_result" + ] + } + } + ], + "cmd_out": [ + { + "name": "tool_register", + "property": { + "tool": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "parameters": { + "type": "array", + "items": { + "type": "object", + "properties": {} + } + } + }, + "required": [ + "name", + "description", + "parameters" + ] + } + }, + "result": { + "property": { + "response": { + "type": "string" + } + } + } + } + ] + } +} \ No newline at end of file diff --git a/agents/ten_packages/extension/image_generate_tool/property.json b/agents/ten_packages/extension/image_generate_tool/property.json new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/agents/ten_packages/extension/image_generate_tool/property.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/agents/ten_packages/extension/image_generate_tool/requirements.txt b/agents/ten_packages/extension/image_generate_tool/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 4fb6fd96..3965d310 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -29,7 +29,7 @@ ) from .helper 
import parse_sentences -from .openai import OpenAIChatGPT, OpenAIChatGPTConfig +from .openai import OpenAIChatGPT, OpenAIChatGPTConfig, OpenAIImageConfig from ten import ( Cmd, StatusCode, @@ -53,6 +53,7 @@ def __init__(self, name: str): self.memory = [] self.memory_cache = [] self.config = None + self.image_config = None self.client = None self.sentence_fragment = "" self.tool_task_future = None @@ -67,6 +68,7 @@ async def on_start(self, async_ten_env: AsyncTenEnv) -> None: await super().on_start(async_ten_env) self.config = await OpenAIChatGPTConfig.create_async(ten_env=async_ten_env) + self.image_config = await OpenAIImageConfig.create_async(ten_env=async_ten_env) # Mandatory properties if not self.config.api_key: @@ -75,7 +77,7 @@ async def on_start(self, async_ten_env: AsyncTenEnv) -> None: # Create instance try: - self.client = OpenAIChatGPT(async_ten_env, self.config) + self.client = OpenAIChatGPT(async_ten_env, self.config, self.image_config) async_ten_env.log_info( f"initialized with max_tokens: {self.config.max_tokens}, model: {self.config.model}, vendor: {self.config.vendor}" ) @@ -147,6 +149,10 @@ async def on_tools_update( ) -> None: return await super().on_tools_update(async_ten_env, tool) + async def on_generate_image(self, async_ten_env, prompt)->str: + url = await self.client.generate_image(prompt) + return url + async def on_call_chat_completion( self, async_ten_env: AsyncTenEnv, **kargs: LLMCallCompletionArgs ) -> any: diff --git a/agents/ten_packages/extension/openai_chatgpt_python/openai.py b/agents/ten_packages/extension/openai_chatgpt_python/openai.py index 8c4845ea..48523d7a 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/openai.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/openai.py @@ -39,12 +39,20 @@ class OpenAIChatGPTConfig(BaseConfig): azure_endpoint: str = "" azure_api_version: str = "" +@dataclass +class OpenAIImageConfig(BaseConfig): + model: str = "dall-e-3" + size: str = "512x512" + quality: str = "standard" + n: int = 1 + class OpenAIChatGPT: client = None - def __init__(self, ten_env: AsyncTenEnv, config: OpenAIChatGPTConfig): + def __init__(self, ten_env: AsyncTenEnv, config: OpenAIChatGPTConfig, image_config: OpenAIImageConfig): self.config = config + self.image_config = image_config ten_env.log_info(f"OpenAIChatGPT initialized with config: {config.api_key}") if self.config.vendor == "azure": self.client = AsyncAzureOpenAI( @@ -173,3 +181,17 @@ async def get_chat_completions_stream(self, messages, tools=None, listener=None) # Emit content finished event after the loop completes if listener: listener.emit("content_finished", full_content) + + + async def generate_image(self, prompt:str): + try: + response = await self.client.images.generate( + prompt=prompt, + model=self.image_config.model, + size=self.image_config.size, + quality=self.image_config.quality, + ) + except Exception as e: + raise RuntimeError(f"GenerateImage failed, err: {e}") from e + + return response.data[0].url \ No newline at end of file diff --git a/agents/ten_packages/extension/openai_v2v_python/extension.py b/agents/ten_packages/extension/openai_v2v_python/extension.py index 680769fa..306f3885 100644 --- a/agents/ten_packages/extension/openai_v2v_python/extension.py +++ b/agents/ten_packages/extension/openai_v2v_python/extension.py @@ -830,5 +830,8 @@ async def _update_usage(self, usage: dict) -> None: async def on_call_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError + async def on_generate_image(self, async_ten_env, 
prompt)->str: + return NotImplementedError + async def on_data_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py index bb19e0e8..c77cfe50 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py @@ -3,6 +3,7 @@ CMD_PROPERTY_TOOL = "tool" CMD_PROPERTY_RESULT = "tool_result" CMD_CHAT_COMPLETION_CALL = "chat_completion_call" +CMD_GENERATE_IMAGE_CALL = "generate_image_call" CMD_IN_FLUSH = "flush" CMD_OUT_FLUSH = "flush" diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py index 5ef942d3..3baecc4e 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py @@ -82,7 +82,7 @@ async def on_cmd(self, async_ten_env: AsyncTenEnv, cmd: Cmd) -> None: self.available_tools.append(tool_metadata) await self.on_tools_update(async_ten_env, tool_metadata) await async_ten_env.return_result(CmdResult.create(StatusCode.OK), cmd) - except Exception as err: + except Exception: async_ten_env.log_warn(f"on_cmd failed: {traceback.format_exc()}") await async_ten_env.return_result( CmdResult.create(StatusCode.ERROR), cmd @@ -147,6 +147,12 @@ async def on_data_chat_completion( Note that this method is stream-based, and it should consider supporting local context caching. """ + @abstractmethod + async def on_generate_image( + self, async_ten_env: AsyncTenEnv, prompt: str + ) -> str: + """Called when an image generation is requested. 
Implement this method to process the image generation.""" + @abstractmethod async def on_tools_update( self, async_ten_env: AsyncTenEnv, tool: LLMToolMetadata From ef24ca21a4125ecda254b53bc5b81fab29f63afa Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Tue, 31 Dec 2024 04:54:53 +0000 Subject: [PATCH 2/6] feat: initial version --- .vscode/settings.json | 4 + agents/examples/default/manifest.json | 5 + agents/examples/default/property.json | 276 ++++++++++++++++++ .../extension/coze_python_async/extension.py | 3 - .../extension/dify_python/extension.py | 3 - .../extension/gemini_v2v_python/extension.py | 6 +- .../extension/glue_python_async/extension.py | 7 +- .../image_generate_tool/extension.py | 48 --- .../image_generate_tool/requirements.txt | 0 .../message_collector_rtm/src/extension.py | 8 +- .../extension/minimax_v2v_python/extension.py | 2 +- .../openai_chatgpt_python/extension.py | 58 ++-- .../openai_chatgpt_python/manifest.json | 8 + .../extension/openai_chatgpt_python/openai.py | 25 +- .../README.md | 0 .../__init__.py | 0 .../addon.py | 8 +- .../openai_image_generate_tool/extension.py | 73 +++++ .../manifest.json | 46 ++- .../openai_image_generate_tool/openai.py | 70 +++++ .../property.json | 0 .../requirements.txt | 2 + .../extension/openai_v2v_python/extension.py | 7 +- .../vision_analyze_tool_python/extension.py | 8 +- .../extension/vision_tool_python/extension.py | 15 +- .../weatherapi_tool_python/extension.py | 18 +- .../interface/ten_ai_base/const.py | 4 + .../ten_ai_base/interface/ten_ai_base/llm.py | 17 ++ .../interface/ten_ai_base/types.py | 15 +- pyrightconfig.json | 1 + 30 files changed, 584 insertions(+), 153 deletions(-) delete mode 100644 agents/ten_packages/extension/image_generate_tool/extension.py delete mode 100644 agents/ten_packages/extension/image_generate_tool/requirements.txt rename agents/ten_packages/extension/{image_generate_tool => openai_image_generate_tool}/README.md (100%) rename agents/ten_packages/extension/{image_generate_tool => openai_image_generate_tool}/__init__.py (100%) rename agents/ten_packages/extension/{image_generate_tool => openai_image_generate_tool}/addon.py (64%) create mode 100644 agents/ten_packages/extension/openai_image_generate_tool/extension.py rename agents/ten_packages/extension/{image_generate_tool => openai_image_generate_tool}/manifest.json (70%) create mode 100644 agents/ten_packages/extension/openai_image_generate_tool/openai.py rename agents/ten_packages/extension/{image_generate_tool => openai_image_generate_tool}/property.json (100%) create mode 100644 agents/ten_packages/extension/openai_image_generate_tool/requirements.txt diff --git a/.vscode/settings.json b/.vscode/settings.json index 068ac5c4..69cb5833 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,4 +10,8 @@ "editor.defaultFormatter": "ms-python.black-formatter" }, "git.ignoreLimitWarning": true, + "pylint.ignorePatterns": [ + "*/ten_runtime_python/**/*", + "/usr/lib/**/*" + ], } \ No newline at end of file diff --git a/agents/examples/default/manifest.json b/agents/examples/default/manifest.json index 7a33b295..843e3421 100644 --- a/agents/examples/default/manifest.json +++ b/agents/examples/default/manifest.json @@ -137,6 +137,11 @@ "type": "extension", "name": "coze_python_async", "version": "=0.1.0" + }, + { + "type": "extension", + "name": "openai_image_generate_tool", + "version": "=0.1.0" } ] } \ No newline at end of file diff --git a/agents/examples/default/property.json b/agents/examples/default/property.json index 
d3b18a80..b7f20960 100644 --- a/agents/examples/default/property.json +++ b/agents/examples/default/property.json @@ -651,6 +651,282 @@ ] } ] + }, + { + "name": "story_teller", + "auto_start": true, + "nodes": [ + { + "type": "extension", + "name": "agora_rtc", + "addon": "agora_rtc", + "extension_group": "default", + "property": { + "app_id": "${env:AGORA_APP_ID}", + "token": "", + "channel": "ten_agent_test", + "stream_id": 1234, + "remote_stream_id": 123, + "subscribe_audio": true, + "publish_audio": true, + "publish_data": true, + "enable_agora_asr": false + } + }, + { + "type": "extension", + "name": "stt", + "addon": "deepgram_asr_python", + "extension_group": "stt", + "property": { + "api_key": "${env:DEEPGRAM_API_KEY}", + "language": "en-US", + "model": "nova-2", + "sample_rate": 16000 + } + }, + { + "type": "extension", + "name": "llm", + "addon": "openai_chatgpt_python", + "extension_group": "chatgpt", + "property": { + "api_key": "${env:OPENAI_API_KEY}", + "base_url": "", + "frequency_penalty": 0.9, + "greeting": "TEN Agent connected. How can I help you today?", + "max_memory_length": 10, + "max_tokens": 512, + "model": "${env:OPENAI_MODEL}", + "prompt": "", + "proxy_url": "${env:OPENAI_PROXY_URL}" + } + }, + { + "type": "extension", + "name": "tts", + "addon": "fish_audio_tts", + "extension_group": "tts", + "property": { + "api_key": "${env:FISH_AUDIO_TTS_KEY}", + "model_id": "d8639b5cc95548f5afbcfe22d3ba5ce5", + "optimize_streaming_latency": true, + "request_timeout_seconds": 30, + "base_url": "https://api.fish.audio" + } + }, + { + "type": "extension", + "name": "interrupt_detector", + "addon": "interrupt_detector_python", + "extension_group": "default", + "property": {} + }, + { + "type": "extension", + "name": "message_collector", + "addon": "message_collector", + "extension_group": "transcriber", + "property": {} + }, + { + "type": "extension", + "name": "weatherapi_tool_python", + "addon": "weatherapi_tool_python", + "extension_group": "default", + "property": { + "api_key": "${env:WEATHERAPI_API_KEY|}" + } + }, + { + "type": "extension", + "name": "openai_image_generate_tool", + "addon": "openai_image_generate_tool", + "extension_group": "default", + "property": { + "api_key": "${env:OPENAI_API_KEY}" + } + } + ], + "connections": [ + { + "extension": "agora_rtc", + "cmd": [ + { + "name": "on_user_joined", + "dest": [ + { + "extension": "llm" + } + ] + }, + { + "name": "on_user_left", + "dest": [ + { + "extension": "llm" + } + ] + }, + { + "name": "on_connection_failure", + "dest": [ + { + "extension": "llm" + } + ] + } + ], + "audio_frame": [ + { + "name": "pcm_frame", + "dest": [ + { + "extension": "stt" + } + ] + } + ] + }, + { + "extension": "stt", + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension": "interrupt_detector" + }, + { + "extension": "message_collector" + } + ] + } + ] + }, + { + "extension": "llm", + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension": "tts" + } + ] + }, + { + "name": "tool_call", + "dest": [ + { + "extension": "weatherapi_tool_python" + }, + { + "extension": "openai_image_generate_tool" + } + ] + } + ], + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension": "tts" + }, + { + "extension": "message_collector" + } + ] + } + ] + }, + { + "extension": "message_collector", + "data": [ + { + "name": "data", + "dest": [ + { + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension": "tts", + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension": "agora_rtc" + } + ] + } + ], + "audio_frame": 
[ + { + "name": "pcm_frame", + "dest": [ + { + "extension": "agora_rtc" + } + ] + } + ] + }, + { + "extension": "interrupt_detector", + "cmd": [ + { + "name": "flush", + "dest": [ + { + "extension": "llm" + } + ] + } + ], + "data": [ + { + "name": "text_data", + "dest": [ + { + "extension": "llm" + } + ] + } + ] + }, + { + "extension": "weatherapi_tool_python", + "cmd": [ + { + "name": "tool_register", + "dest": [ + { + "extension": "llm" + } + ] + } + ] + }, + { + "extension": "openai_image_generate_tool", + "cmd": [ + { + "name": "tool_register", + "dest": [ + { + "extension": "llm" + } + ] + } + ] + } + ] } ], "log_level": 3 diff --git a/agents/ten_packages/extension/coze_python_async/extension.py b/agents/ten_packages/extension/coze_python_async/extension.py index b8273be6..eb70133e 100644 --- a/agents/ten_packages/extension/coze_python_async/extension.py +++ b/agents/ten_packages/extension/coze_python_async/extension.py @@ -168,9 +168,6 @@ async def on_call_chat_completion( ) -> any: raise RuntimeError("Not implemented") - async def on_generate_image(self, async_ten_env, prompt)->str: - raise RuntimeError("Not implemented") - async def on_data_chat_completion( self, ten_env: AsyncTenEnv, **kargs: LLMDataCompletionArgs ) -> None: diff --git a/agents/ten_packages/extension/dify_python/extension.py b/agents/ten_packages/extension/dify_python/extension.py index f2957f25..baf70f72 100644 --- a/agents/ten_packages/extension/dify_python/extension.py +++ b/agents/ten_packages/extension/dify_python/extension.py @@ -171,9 +171,6 @@ async def on_video_frame( async def on_call_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError - - async def on_generate_image(self, async_ten_env, prompt)->str: - return NotImplementedError async def on_tools_update(self, async_ten_env, tool): raise NotImplementedError diff --git a/agents/ten_packages/extension/gemini_v2v_python/extension.py b/agents/ten_packages/extension/gemini_v2v_python/extension.py index ab8f11a6..db976596 100644 --- a/agents/ten_packages/extension/gemini_v2v_python/extension.py +++ b/agents/ten_packages/extension/gemini_v2v_python/extension.py @@ -24,6 +24,7 @@ StatusCode, CmdResult, Data, + TenError, ) from ten.audio_frame import AudioFrameDataFmt from ten_ai_base.const import CMD_PROPERTY_RESULT, CMD_TOOL_CALL @@ -599,7 +600,7 @@ async def _handle_tool_call(self, func_calls: list[FunctionCall]) -> None: cmd: Cmd = Cmd.create(CMD_TOOL_CALL) cmd.set_property_string("name", name) cmd.set_property_from_json("arguments", json.dumps(arguments)) - result: CmdResult = await self.ten_env.send_cmd(cmd) + [result, _] = await self.ten_env.send_cmd(cmd) func_response = FunctionResponse( id=tool_call_id, name=name, response={"error": "Failed to call tool"} @@ -739,8 +740,5 @@ async def _update_usage(self, usage: dict) -> None: async def on_call_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError - async def on_generate_image(self, async_ten_env, prompt)->str: - return NotImplementedError - async def on_data_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError diff --git a/agents/ten_packages/extension/glue_python_async/extension.py b/agents/ten_packages/extension/glue_python_async/extension.py index bdc69848..8739d30b 100644 --- a/agents/ten_packages/extension/glue_python_async/extension.py +++ b/agents/ten_packages/extension/glue_python_async/extension.py @@ -166,7 +166,7 @@ async def on_start(self, ten_env: AsyncTenEnv) -> None: self.memory = ChatMemory(self.config.max_history) if 
self.config.enable_storage: - result = await ten_env.send_cmd(Cmd.create("retrieve")) + [result, _] = await ten_env.send_cmd(Cmd.create("retrieve")) if result.get_status_code() == StatusCode.OK: try: history = json.loads(result.get_property_string("response")) @@ -224,9 +224,6 @@ async def on_call_chat_completion( ) -> any: raise RuntimeError("Not implemented") - async def on_generate_image(self, async_ten_env, prompt) -> str: - return RuntimeError("Not implemented") - async def on_data_chat_completion( self, ten_env: AsyncTenEnv, **kargs: LLMDataCompletionArgs ) -> None: @@ -385,7 +382,7 @@ async def handle_tool_call(self, call: ToolCall) -> ToolCallResponse: cmd.set_property_from_json("arguments", call.function.arguments) # Send the command and handle the result through the future - result: CmdResult = await self.ten_env.send_cmd(cmd) + [result, _] = await self.ten_env.send_cmd(cmd) if result.get_status_code() == StatusCode.OK: tool_result: LLMToolResult = json.loads( result.get_property_to_json(CMD_PROPERTY_RESULT) diff --git a/agents/ten_packages/extension/image_generate_tool/extension.py b/agents/ten_packages/extension/image_generate_tool/extension.py deleted file mode 100644 index 65c72de3..00000000 --- a/agents/ten_packages/extension/image_generate_tool/extension.py +++ /dev/null @@ -1,48 +0,0 @@ -# -# This file is part of TEN Framework, an open source project. -# Licensed under the Apache License, Version 2.0. -# See the LICENSE file for more information. -# -from ten import ( - TenEnv, - AsyncTenEnv, -) -from ten_ai_base import ( - AsyncLLMToolBaseExtension, LLMToolMetadata, LLMToolResult, BaseConfig -) -from dataclasses import dataclass - - -@dataclass -class ImageGenerateToolConfig(BaseConfig): - # TODO: add extra config fields here - pass - - -class ImageGenerateToolExtension(AsyncLLMToolBaseExtension): - def __init__(self, name: str): - super().__init__(name) - self.config = None - - async def on_start(self, ten_env: AsyncTenEnv) -> None: - await super().on_start(ten_env) - - # initialize configuration - self.config = await ImageGenerateToolConfig.create_async(ten_env=ten_env) - ten_env.log_info(f"config: {self.config}") - - # Implement this method to construct and start your resources. - ten_env.log_debug("TODO: on_start") - - async def on_stop(self, ten_env: AsyncTenEnv) -> None: - await super().on_stop(ten_env) - - #Implement this method to stop and destruct your resources. 
- ten_env.log_debug("TODO: on_stop") - - def get_tool_metadata(self, ten_env: TenEnv) -> list[LLMToolMetadata]: - ten_env.log_debug("TODO: get_tool_metadata") - return [] - - async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMToolResult | None: - ten_env.log_debug(f"TODO: run_tool {name} {args}") diff --git a/agents/ten_packages/extension/image_generate_tool/requirements.txt b/agents/ten_packages/extension/image_generate_tool/requirements.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/agents/ten_packages/extension/message_collector_rtm/src/extension.py b/agents/ten_packages/extension/message_collector_rtm/src/extension.py index d24d1ffd..69fda46b 100644 --- a/agents/ten_packages/extension/message_collector_rtm/src/extension.py +++ b/agents/ten_packages/extension/message_collector_rtm/src/extension.py @@ -215,8 +215,8 @@ async def _handle_text_data(self, data: dict): json_bytes = json.dumps(data).encode("utf-8") cmd = Cmd.create("publish") cmd.set_property_buf("message", json_bytes) - cmd_result: CmdResult = await self.ten_env.send_cmd(cmd) - self.ten_env.log_info(f"send_cmd result {cmd_result.to_json()}") + [result, _] = await self.ten_env.send_cmd(cmd) + self.ten_env.log_info(f"send_cmd result {result.to_json()}") except Exception as e: self.ten_env.log_error(f"Failed to handle text data: {e}") @@ -225,7 +225,7 @@ async def _handle_user_state(self, data: dict): json_bytes = json.dumps(data) cmd = Cmd.create("set_presence_state") cmd.set_property_string("states", json_bytes) - cmd_result: CmdResult = await self.ten_env.send_cmd(cmd) - self.ten_env.log_info(f"send_cmd result {cmd_result.to_json()}") + [result, _] = await self.ten_env.send_cmd(cmd) + self.ten_env.log_info(f"send_cmd result {result.to_json()}") except Exception as e: self.ten_env.log_error(f"Failed to handle user state: {e}") diff --git a/agents/ten_packages/extension/minimax_v2v_python/extension.py b/agents/ten_packages/extension/minimax_v2v_python/extension.py index 6612b0af..000801f6 100644 --- a/agents/ten_packages/extension/minimax_v2v_python/extension.py +++ b/agents/ten_packages/extension/minimax_v2v_python/extension.py @@ -124,7 +124,7 @@ async def on_cmd(self, ten_env: AsyncTenEnv, cmd: Cmd) -> None: match cmd_name: case "flush": await self._flush(ten_env=ten_env) - _result = await ten_env.send_cmd(Cmd.create("flush")) + await ten_env.send_cmd(Cmd.create("flush")) ten_env.log_debug("flush done") case _: pass diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 3965d310..2c3d1d6f 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -26,10 +26,13 @@ LLMDataCompletionArgs, LLMToolMetadata, LLMToolResult, + LLMToolResultDirectRawResponse, + LLMToolResultDirectSpeechResponse, + LLMToolResultRequery, ) from .helper import parse_sentences -from .openai import OpenAIChatGPT, OpenAIChatGPTConfig, OpenAIImageConfig +from .openai import OpenAIChatGPT, OpenAIChatGPTConfig from ten import ( Cmd, StatusCode, @@ -53,7 +56,6 @@ def __init__(self, name: str): self.memory = [] self.memory_cache = [] self.config = None - self.image_config = None self.client = None self.sentence_fragment = "" self.tool_task_future = None @@ -68,7 +70,6 @@ async def on_start(self, async_ten_env: AsyncTenEnv) -> None: await super().on_start(async_ten_env) self.config = await 
OpenAIChatGPTConfig.create_async(ten_env=async_ten_env) - self.image_config = await OpenAIImageConfig.create_async(ten_env=async_ten_env) # Mandatory properties if not self.config.api_key: @@ -77,7 +78,7 @@ async def on_start(self, async_ten_env: AsyncTenEnv) -> None: # Create instance try: - self.client = OpenAIChatGPT(async_ten_env, self.config, self.image_config) + self.client = OpenAIChatGPT(async_ten_env, self.config) async_ten_env.log_info( f"initialized with max_tokens: {self.config.max_tokens}, model: {self.config.model}, vendor: {self.config.vendor}" ) @@ -149,10 +150,6 @@ async def on_tools_update( ) -> None: return await super().on_tools_update(async_ten_env, tool) - async def on_generate_image(self, async_ten_env, prompt)->str: - url = await self.client.generate_image(prompt) - return url - async def on_call_chat_completion( self, async_ten_env: AsyncTenEnv, **kargs: LLMCallCompletionArgs ) -> any: @@ -235,29 +232,40 @@ async def handle_tool_call(tool_call): # cmd.set_property_from_json("arguments", json.dumps([])) # Send the command and handle the result through the future - result: CmdResult = await async_ten_env.send_cmd(cmd) + [result, _] = await async_ten_env.send_cmd(cmd) if result.get_status_code() == StatusCode.OK: tool_result: LLMToolResult = json.loads( result.get_property_to_json(CMD_PROPERTY_RESULT) ) async_ten_env.log_info(f"tool_result: {tool_result}") - # self.memory_cache = [] - self.memory_cache.pop() - result_content = tool_result["content"] - nonlocal message - new_message = { - "role": "user", - "content": self._convert_to_content_parts( - message["content"] - ), - } - new_message["content"] = new_message[ - "content" - ] + self._convert_to_content_parts(result_content) - await self.queue_input_item( - True, messages=[new_message], no_tool=True - ) + + + if tool_result["type"] == "direct_raw_response": + self.send_raw_text_output(async_ten_env, json.dumps(tool_result["content"]), True) + elif tool_result["type"] == "direct_speech_response": + pass + elif tool_result["type"] == "requery": + # self.memory_cache = [] + self.memory_cache.pop() + result_content = tool_result["content"] + nonlocal message + new_message = { + "role": "user", + "content": self._convert_to_content_parts( + message["content"] + ), + } + new_message["content"] = new_message[ + "content" + ] + self._convert_to_content_parts(result_content) + await self.queue_input_item( + True, messages=[new_message], no_tool=True + ) + else: + async_ten_env.log_error( + f"Unknown tool result type: {tool_result}" + ) else: async_ten_env.log_error("Tool call failed") self.tool_task_future.set_result(None) diff --git a/agents/ten_packages/extension/openai_chatgpt_python/manifest.json b/agents/ten_packages/extension/openai_chatgpt_python/manifest.json index f71d0d76..b955f5b8 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/manifest.json +++ b/agents/ten_packages/extension/openai_chatgpt_python/manifest.json @@ -85,6 +85,14 @@ "type": "string" } } + }, + { + "name": "raw_text_data", + "property": { + "text": { + "type": "string" + } + } } ], "cmd_in": [ diff --git a/agents/ten_packages/extension/openai_chatgpt_python/openai.py b/agents/ten_packages/extension/openai_chatgpt_python/openai.py index 48523d7a..5c2b346f 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/openai.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/openai.py @@ -39,20 +39,11 @@ class OpenAIChatGPTConfig(BaseConfig): azure_endpoint: str = "" azure_api_version: str = "" -@dataclass -class 
OpenAIImageConfig(BaseConfig): - model: str = "dall-e-3" - size: str = "512x512" - quality: str = "standard" - n: int = 1 - - class OpenAIChatGPT: client = None - def __init__(self, ten_env: AsyncTenEnv, config: OpenAIChatGPTConfig, image_config: OpenAIImageConfig): + def __init__(self, ten_env: AsyncTenEnv, config: OpenAIChatGPTConfig): self.config = config - self.image_config = image_config ten_env.log_info(f"OpenAIChatGPT initialized with config: {config.api_key}") if self.config.vendor == "azure": self.client = AsyncAzureOpenAI( @@ -181,17 +172,3 @@ async def get_chat_completions_stream(self, messages, tools=None, listener=None) # Emit content finished event after the loop completes if listener: listener.emit("content_finished", full_content) - - - async def generate_image(self, prompt:str): - try: - response = await self.client.images.generate( - prompt=prompt, - model=self.image_config.model, - size=self.image_config.size, - quality=self.image_config.quality, - ) - except Exception as e: - raise RuntimeError(f"GenerateImage failed, err: {e}") from e - - return response.data[0].url \ No newline at end of file diff --git a/agents/ten_packages/extension/image_generate_tool/README.md b/agents/ten_packages/extension/openai_image_generate_tool/README.md similarity index 100% rename from agents/ten_packages/extension/image_generate_tool/README.md rename to agents/ten_packages/extension/openai_image_generate_tool/README.md diff --git a/agents/ten_packages/extension/image_generate_tool/__init__.py b/agents/ten_packages/extension/openai_image_generate_tool/__init__.py similarity index 100% rename from agents/ten_packages/extension/image_generate_tool/__init__.py rename to agents/ten_packages/extension/openai_image_generate_tool/__init__.py diff --git a/agents/ten_packages/extension/image_generate_tool/addon.py b/agents/ten_packages/extension/openai_image_generate_tool/addon.py similarity index 64% rename from agents/ten_packages/extension/image_generate_tool/addon.py rename to agents/ten_packages/extension/openai_image_generate_tool/addon.py index 647204cf..b2034783 100644 --- a/agents/ten_packages/extension/image_generate_tool/addon.py +++ b/agents/ten_packages/extension/openai_image_generate_tool/addon.py @@ -8,13 +8,13 @@ register_addon_as_extension, TenEnv, ) -from .extension import ImageGenerateToolExtension +from .extension import OpenAIImageGenerateToolExtension -@register_addon_as_extension("image_generate_tool") -class ImageGenerateToolExtensionAddon(Addon): +@register_addon_as_extension("openai_image_generate_tool") +class OpenAIImageGenerateToolExtensionAddon(Addon): def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None: ten_env.log_info("on_create_instance") ten_env.on_create_instance_done( - ImageGenerateToolExtension(name), context) + OpenAIImageGenerateToolExtension(name), context) diff --git a/agents/ten_packages/extension/openai_image_generate_tool/extension.py b/agents/ten_packages/extension/openai_image_generate_tool/extension.py new file mode 100644 index 00000000..15102d78 --- /dev/null +++ b/agents/ten_packages/extension/openai_image_generate_tool/extension.py @@ -0,0 +1,73 @@ +# +# This file is part of TEN Framework, an open source project. +# Licensed under the Apache License, Version 2.0. +# See the LICENSE file for more information. 
+# +from ten import ( + TenEnv, + AsyncTenEnv, +) +from ten_ai_base import ( + AsyncLLMToolBaseExtension, LLMToolMetadata, LLMToolResult +) +from ten_ai_base.types import LLMChatCompletionContentPartImageParam, LLMToolMetadataParameter, LLMToolResultDirectRawResponse +from .openai import OpenAIImageGenerateClient, OpenAIImageGenerateToolConfig + +class OpenAIImageGenerateToolExtension(AsyncLLMToolBaseExtension): + def __init__(self, name: str): + super().__init__(name) + self.config = None + self.client = None + + async def on_start(self, ten_env: AsyncTenEnv) -> None: + await super().on_start(ten_env) + + # initialize configuration + self.config = await OpenAIImageGenerateToolConfig.create_async(ten_env=ten_env) + ten_env.log_info(f"config: {self.config}") + + if not self.config.api_key: + ten_env.log_error("API key is not set") + return + + # initialize OpenAIImageGenerateClient + self.client = OpenAIImageGenerateClient(ten_env, self.config) + + async def on_stop(self, ten_env: AsyncTenEnv) -> None: + await super().on_stop(ten_env) + + def get_tool_metadata(self, ten_env: TenEnv) -> list[LLMToolMetadata]: + return [ + LLMToolMetadata( + name="image_generate", + description="Generate image by prompt query", + parameters=[ + LLMToolMetadataParameter( + name="prompt", + type="string", + description="Prompt to generate images", + ), + ], + ) + ] + + async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMToolResult | None: + ten_env.log_info(f"run_tool {name} {args}") + if name == "image_generate": + prompt = args.get("prompt") + if prompt: + # Implement this method to run your tool with the given arguments. + ten_env.log_info(f"Generating image with prompt: {prompt}") + # call OpenAIImageGenerateClient to generate images + response_url = await self.client.generate_images(prompt) + ten_env.log_info(f"Generated image: {response_url}") + result = LLMToolResultDirectRawResponse( + type="direct_raw_response", + content=[ + LLMChatCompletionContentPartImageParam( + type="image_url", + image_url=response_url + ) + ] + ) + return result diff --git a/agents/ten_packages/extension/image_generate_tool/manifest.json b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json similarity index 70% rename from agents/ten_packages/extension/image_generate_tool/manifest.json rename to agents/ten_packages/extension/openai_image_generate_tool/manifest.json index b9e81545..3da331d5 100644 --- a/agents/ten_packages/extension/image_generate_tool/manifest.json +++ b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json @@ -1,6 +1,6 @@ { "type": "extension", - "name": "image_generate_tool", + "name": "openai_image_generate_tool", "version": "0.1.0", "dependencies": [ { @@ -20,7 +20,35 @@ ] }, "api": { - "property": {}, + "property": { + "api_key": { + "type": "string" + }, + "model": { + "type": "string" + }, + "base_url": { + "type": "string" + }, + "size": { + "type": "string" + }, + "n": { + "type": "int64" + }, + "proxy_url": { + "type": "string" + }, + "vendor": { + "type": "string" + }, + "azure_endpoint": { + "type": "string" + }, + "azure_api_version": { + "type": "string" + } + }, "cmd_in": [ { "name": "tool_call", @@ -29,22 +57,12 @@ "type": "string" }, "arguments": { - "type": "string" + "type": "object" } }, "required": [ "name" - ], - "result": { - "property": { - "tool_result": { - "type": "string" - } - }, - "required": [ - "tool_result" - ] - } + ] } ], "cmd_out": [ diff --git a/agents/ten_packages/extension/openai_image_generate_tool/openai.py 
b/agents/ten_packages/extension/openai_image_generate_tool/openai.py new file mode 100644 index 00000000..36bb74f7 --- /dev/null +++ b/agents/ten_packages/extension/openai_image_generate_tool/openai.py @@ -0,0 +1,70 @@ +# +# +# Agora Real Time Engagement +# Created by Wei Hu in 2024-08. +# Copyright (c) 2024 Agora IO. All rights reserved. +# +# +from dataclasses import dataclass +import requests +from openai import AsyncOpenAI, AsyncAzureOpenAI + +from ten.async_ten_env import AsyncTenEnv +from ten_ai_base.config import BaseConfig + + +@dataclass +class OpenAIImageGenerateToolConfig(BaseConfig): + api_key: str = "" + base_url: str = "https://api.openai.com/v1" + model: str = "dall-e-3" + size: str = "1024x1024" + quality: str = "standard" + n: int = 1 + proxy_url: str = "" + vendor: str = "openai" + azure_endpoint: str = "" + azure_api_version: str = "" + +class OpenAIImageGenerateClient: + client = None + + def __init__(self, ten_env: AsyncTenEnv, config: OpenAIImageGenerateToolConfig): + self.config = config + ten_env.log_info(f"OpenAIImageGenerateClient initialized with config: {config.api_key}") + if self.config.vendor == "azure": + self.client = AsyncAzureOpenAI( + api_key=config.api_key, + api_version=self.config.azure_api_version, + azure_endpoint=config.azure_endpoint, + ) + ten_env.log_info( + f"Using Azure OpenAI with endpoint: {config.azure_endpoint}, api_version: {config.azure_api_version}" + ) + else: + self.client = AsyncOpenAI(api_key=config.api_key, base_url=config.base_url) + self.session = requests.Session() + if config.proxy_url: + proxies = { + "http": config.proxy_url, + "https": config.proxy_url, + } + ten_env.log_info(f"Setting proxies: {proxies}") + self.session.proxies.update(proxies) + self.client.session = self.session + + + async def generate_images(self, prompt: str) -> str: + req = { + "model": self.config.model, + "prompt": prompt, + "size": self.config.size, + "quality": self.config.quality, + "n": self.config.n, + } + + try: + response = await self.client.images.generate(**req) + except Exception as e: + raise RuntimeError(f"GenerateImages failed, err: {e}") from e + return response.data[0].url \ No newline at end of file diff --git a/agents/ten_packages/extension/image_generate_tool/property.json b/agents/ten_packages/extension/openai_image_generate_tool/property.json similarity index 100% rename from agents/ten_packages/extension/image_generate_tool/property.json rename to agents/ten_packages/extension/openai_image_generate_tool/property.json diff --git a/agents/ten_packages/extension/openai_image_generate_tool/requirements.txt b/agents/ten_packages/extension/openai_image_generate_tool/requirements.txt new file mode 100644 index 00000000..2fec3068 --- /dev/null +++ b/agents/ten_packages/extension/openai_image_generate_tool/requirements.txt @@ -0,0 +1,2 @@ +openai +requests[socks] \ No newline at end of file diff --git a/agents/ten_packages/extension/openai_v2v_python/extension.py b/agents/ten_packages/extension/openai_v2v_python/extension.py index 306f3885..2b4d3864 100644 --- a/agents/ten_packages/extension/openai_v2v_python/extension.py +++ b/agents/ten_packages/extension/openai_v2v_python/extension.py @@ -170,7 +170,7 @@ async def on_start(self, ten_env: AsyncTenEnv) -> None: self.memory = ChatMemory(self.config.max_history) if self.config.enable_storage: - result = await ten_env.send_cmd(Cmd.create("retrieve")) + [result, _] = await ten_env.send_cmd(Cmd.create("retrieve")) if result.get_status_code() == StatusCode.OK: try: history = 
json.loads(result.get_property_string("response")) @@ -687,7 +687,7 @@ async def _handle_tool_call( cmd: Cmd = Cmd.create(CMD_TOOL_CALL) cmd.set_property_string("name", name) cmd.set_property_from_json("arguments", arguments) - result: CmdResult = await self.ten_env.send_cmd(cmd) + [result, _] = await self.ten_env.send_cmd(cmd) tool_response = ItemCreate( item=FunctionCallOutputItemParam( @@ -830,8 +830,5 @@ async def _update_usage(self, usage: dict) -> None: async def on_call_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError - async def on_generate_image(self, async_ten_env, prompt)->str: - return NotImplementedError - async def on_data_chat_completion(self, async_ten_env, **kargs): raise NotImplementedError diff --git a/agents/ten_packages/extension/vision_analyze_tool_python/extension.py b/agents/ten_packages/extension/vision_analyze_tool_python/extension.py index 7b565706..010bd61f 100644 --- a/agents/ten_packages/extension/vision_analyze_tool_python/extension.py +++ b/agents/ten_packages/extension/vision_analyze_tool_python/extension.py @@ -23,6 +23,7 @@ LLMToolMetadata, LLMToolMetadataParameter, LLMToolResult, + LLMToolResultDirectSpeechResponse, ) @@ -174,6 +175,9 @@ async def run_tool( ) cmd.set_property_from_json("arguments", json.dumps({"messages": [message]})) ten_env.log_info("send_cmd {}".format(message)) - cmd_result: CmdResult = await ten_env.send_cmd(cmd) + [cmd_result, _] = await ten_env.send_cmd(cmd) result = cmd_result.get_property_to_json("response") - return {"content": [{"type": "text", "text": result}]} + return LLMToolResultDirectSpeechResponse( + type="direct_speech_response", + content=json.dumps(result), + ) diff --git a/agents/ten_packages/extension/vision_tool_python/extension.py b/agents/ten_packages/extension/vision_tool_python/extension.py index f3cfc957..9f1a97bc 100644 --- a/agents/ten_packages/extension/vision_tool_python/extension.py +++ b/agents/ten_packages/extension/vision_tool_python/extension.py @@ -4,7 +4,7 @@ # See the LICENSE file for more information. 
# from ten_ai_base import AsyncLLMToolBaseExtension -from ten_ai_base.types import LLMToolMetadata, LLMToolResult +from ten_ai_base.types import LLMChatCompletionContentPartImageParam, LLMToolMetadata, LLMToolResult, LLMToolResultRequery from ten import ( AudioFrame, VideoFrame, @@ -140,7 +140,12 @@ async def run_tool( base64_image = rgb2base64jpeg( self.image_data, self.image_width, self.image_height ) - # return LLMToolResult(message=LLMCompletionArgsMessage(role="user", content=[result])) - return { - "content": [{"type": "image_url", "image_url": {"url": base64_image}}] - } + return LLMToolResultRequery( + type="requery", + content=[ + LLMChatCompletionContentPartImageParam( + type="image_url", + image_url=base64_image, + ) + ], + ) diff --git a/agents/ten_packages/extension/weatherapi_tool_python/extension.py b/agents/ten_packages/extension/weatherapi_tool_python/extension.py index d4cf5580..54efca36 100644 --- a/agents/ten_packages/extension/weatherapi_tool_python/extension.py +++ b/agents/ten_packages/extension/weatherapi_tool_python/extension.py @@ -17,7 +17,7 @@ from ten.async_ten_env import AsyncTenEnv from ten_ai_base.config import BaseConfig from ten_ai_base import AsyncLLMToolBaseExtension -from ten_ai_base.types import LLMToolMetadata, LLMToolMetadataParameter, LLMToolResult +from ten_ai_base.types import LLMToolMetadata, LLMToolMetadataParameter, LLMToolResult, LLMToolResultRequery CMD_TOOL_REGISTER = "tool_register" CMD_TOOL_CALL = "tool_call" @@ -172,16 +172,24 @@ async def run_tool( ten_env.log_info(f"run_tool name: {name}, args: {args}") if name == CURRENT_TOOL_NAME: result = await self._get_current_weather(args) - # result = LLMCompletionContentItemText(text="I see something") - return {"content": json.dumps(result)} + return LLMToolResultRequery( + type="requery", + content=json.dumps(result), + ) elif name == HISTORY_TOOL_NAME: result = await self._get_past_weather(args) # result = LLMCompletionContentItemText(text="I see something") - return {"content": json.dumps(result)} + return LLMToolResultRequery( + type="requery", + content=json.dumps(result), + ) elif name == FORECAST_TOOL_NAME: result = await self._get_future_weather(args) # result = LLMCompletionContentItemText(text="I see something") - return {"content": json.dumps(result)} + return LLMToolResultRequery( + type="requery", + content=json.dumps(result), + ) async def _get_current_weather(self, args: dict) -> Any: if "location" not in args: diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py index c77cfe50..6555d3da 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/const.py @@ -8,6 +8,8 @@ CMD_OUT_FLUSH = "flush" DATA_OUT_NAME = "text_data" +RAW_DATA_OUT_NAME = "raw_text_data" +DATA_OUT_PROPERTY_TEXT = "text" DATA_OUT_PROPERTY_TEXT = "text" DATA_OUT_PROPERTY_END_OF_SEGMENT = "end_of_segment" @@ -15,4 +17,6 @@ DATA_IN_PROPERTY_END_OF_SEGMENT = "end_of_segment" DATA_INPUT_NAME = "text_data" +RAW_DATA_INPUT_NAME = "raw_text_data" + AUDIO_FRAME_OUTPUT_NAME = "pcm_frame" \ No newline at end of file diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py index 3baecc4e..a81cfe9b 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py +++ 
b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py @@ -21,6 +21,7 @@ DATA_OUT_PROPERTY_END_OF_SEGMENT, DATA_OUT_PROPERTY_TEXT, CMD_CHAT_COMPLETION_CALL, + RAW_DATA_OUT_NAME, ) from .types import LLMCallCompletionArgs, LLMDataCompletionArgs, LLMToolMetadata from .helper import AsyncQueue @@ -116,6 +117,22 @@ async def flush_input_items(self, async_ten_env: AsyncTenEnv): async_ten_env.log_info("Cancelling the current task during flush.") self.current_task.cancel() + def send_raw_text_output( + self, async_ten_env: AsyncTenEnv, sentence: str, end_of_segment: bool + ): + try: + output_data = Data.create(RAW_DATA_OUT_NAME) + output_data.set_property_string(DATA_OUT_PROPERTY_TEXT, sentence) + output_data.set_property_bool( + DATA_OUT_PROPERTY_END_OF_SEGMENT, end_of_segment + ) + asyncio.create_task(async_ten_env.send_data(output_data)) + async_ten_env.log_info( + f"{'end of segment ' if end_of_segment else ''}sent raw sentence [{sentence}]" + ) + except Exception as err: + async_ten_env.log_warn(f"send sentence [{sentence}] failed, err: {err}") + def send_text_output( self, async_ten_env: AsyncTenEnv, sentence: str, end_of_segment: bool ): diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py index 84bf8cc8..e918caf2 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py @@ -95,10 +95,23 @@ class LLMChatCompletionUserMessageParam(TypedDict, total=False): LLMChatCompletionUserMessageParam, LLMChatCompletionToolMessageParam ] +class LLMToolResultRequery(TypedDict, total=False): + type: Required[Literal["requery"]] + content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]] + +class LLMToolResultDirectSpeechResponse(TypedDict, total=False): + type: Required[Literal["direct_speech_response"]] + content: Required[str] -class LLMToolResult(TypedDict, total=False): +class LLMToolResultDirectRawResponse(TypedDict, total=False): + type: Required[Literal["direct_raw_response"]] content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]] +LLMToolResult: TypeAlias = Union[ + LLMToolResultRequery, + LLMToolResultDirectSpeechResponse, + LLMToolResultDirectRawResponse, +] class LLMCallCompletionArgs(TypedDict, total=False): messages: Iterable[LLMChatCompletionMessageParam] diff --git a/pyrightconfig.json b/pyrightconfig.json index f8458b4f..944d0958 100644 --- a/pyrightconfig.json +++ b/pyrightconfig.json @@ -20,6 +20,7 @@ "venv", "__pycache__", "./agents/ten_packages/system/ten_runtime_python/**/*", + "/usr/lib/**/*" ], "include": [ "agents" From 3a026f6401b90e9123f6267d21e996f2c2736322 Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Tue, 31 Dec 2024 09:47:49 +0000 Subject: [PATCH 3/6] feat: support tool --- agents/examples/default/property.json | 28 +++++++++----- .../extension/message_collector/manifest.json | 11 ++++++ .../message_collector/src/extension.py | 4 ++ .../openai_chatgpt_python/extension.py | 2 +- .../openai_image_generate_tool/extension.py | 8 +--- .../openai_image_generate_tool/manifest.json | 3 +- .../interface/ten_ai_base/types.py | 2 +- playground/src/manager/rtc/rtc.ts | 38 ++++++++++++------- playground/src/types/index.ts | 2 +- 9 files changed, 65 insertions(+), 33 deletions(-) diff --git a/agents/examples/default/property.json b/agents/examples/default/property.json index b7f20960..d5814951 100644 --- 
a/agents/examples/default/property.json +++ b/agents/examples/default/property.json @@ -546,19 +546,17 @@ "connections": [ { "extension": "agora_rtc", - "audio_frame": [ + "cmd": [ { - "name": "pcm_frame", + "name": "on_user_joined", "dest": [ { "extension": "v2v" } ] - } - ], - "cmd": [ + }, { - "name": "on_user_joined", + "name": "on_user_left", "dest": [ { "extension": "v2v" @@ -566,15 +564,17 @@ ] }, { - "name": "on_user_left", + "name": "on_connection_failure", "dest": [ { "extension": "v2v" } ] - }, + } + ], + "audio_frame": [ { - "name": "on_connection_failure", + "name": "pcm_frame", "dest": [ { "extension": "v2v" @@ -698,7 +698,7 @@ "max_memory_length": 10, "max_tokens": 512, "model": "${env:OPENAI_MODEL}", - "prompt": "", + "prompt": "You are an ai agent bot producing child picture books. Each response should be short and no more than 50 words as it's for child. \nFor each response, you will use the 'image_generate' tool to create an image based on the description or key moment in that part of the story. The story should be set in a fantasy world. Try asking questions relevant to the story to decide how the story should proceed. Each response should include rich, vivid descriptions that will guide the 'image_generate' tool to produce an image that aligns with the scene or mood.\n Whether it’s the setting, a character’s expression, or a dramatic moment, the paragraph should give enough detail for a meaningful visual representation.", "proxy_url": "${env:OPENAI_PROXY_URL}" } }, @@ -838,6 +838,14 @@ "extension": "message_collector" } ] + }, + { + "name": "raw_text_data", + "dest": [ + { + "extension": "message_collector" + } + ] } ] }, diff --git a/agents/ten_packages/extension/message_collector/manifest.json b/agents/ten_packages/extension/message_collector/manifest.json index 835c1721..00f5a640 100644 --- a/agents/ten_packages/extension/message_collector/manifest.json +++ b/agents/ten_packages/extension/message_collector/manifest.json @@ -40,6 +40,17 @@ "type": "bool" } } + }, + { + "name": "raw_text_data", + "property": { + "text": { + "type": "string" + }, + "end_of_segment": { + "type": "bool" + } + } } ], "data_out": [ diff --git a/agents/ten_packages/extension/message_collector/src/extension.py b/agents/ten_packages/extension/message_collector/src/extension.py index f12152ab..450b0856 100644 --- a/agents/ten_packages/extension/message_collector/src/extension.py +++ b/agents/ten_packages/extension/message_collector/src/extension.py @@ -214,6 +214,10 @@ def on_data(self, ten_env: TenEnv, data: Data) -> None: "text": text, } + # Add the raw data type if the data is raw text data + if data.get_name() == "raw_text_data": + base_msg_data["data_type"] = "raw" + try: chunks = _text_to_base64_chunks(ten_env, json.dumps(base_msg_data), message_id) for chunk in chunks: diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 2c3d1d6f..2c825924 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -58,7 +58,7 @@ def __init__(self, name: str): self.config = None self.client = None self.sentence_fragment = "" - self.tool_task_future = None + self.tool_task_future: asyncio.Future | None = None self.users_count = 0 async def on_init(self, async_ten_env: AsyncTenEnv) -> None: diff --git a/agents/ten_packages/extension/openai_image_generate_tool/extension.py 
b/agents/ten_packages/extension/openai_image_generate_tool/extension.py index 15102d78..e198f8f3 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/extension.py +++ b/agents/ten_packages/extension/openai_image_generate_tool/extension.py @@ -3,6 +3,7 @@ # Licensed under the Apache License, Version 2.0. # See the LICENSE file for more information. # +import json from ten import ( TenEnv, AsyncTenEnv, @@ -63,11 +64,6 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool ten_env.log_info(f"Generated image: {response_url}") result = LLMToolResultDirectRawResponse( type="direct_raw_response", - content=[ - LLMChatCompletionContentPartImageParam( - type="image_url", - image_url=response_url - ) - ] + content=json.dumps({"data":{"image_url": response_url}, "type": "image_url"}), ) return result diff --git a/agents/ten_packages/extension/openai_image_generate_tool/manifest.json b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json index 3da331d5..6f5c5342 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/manifest.json +++ b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json @@ -57,7 +57,8 @@ "type": "string" }, "arguments": { - "type": "object" + "type": "object", + "properties": {} } }, "required": [ diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py index e918caf2..2119fffb 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py @@ -105,7 +105,7 @@ class LLMToolResultDirectSpeechResponse(TypedDict, total=False): class LLMToolResultDirectRawResponse(TypedDict, total=False): type: Required[Literal["direct_raw_response"]] - content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]] + content: Required[str] LLMToolResult: TypeAlias = Union[ LLMToolResultRequery, diff --git a/playground/src/manager/rtc/rtc.ts b/playground/src/manager/rtc/rtc.ts index 447323b5..c536641e 100644 --- a/playground/src/manager/rtc/rtc.ts +++ b/playground/src/manager/rtc/rtc.ts @@ -86,10 +86,10 @@ export class RtcManager extends AGEventEmitter { this.emit("localTracksChanged", this.localTracks); } - async switchVideoSource(type:VideoSourceType) { + async switchVideoSource(type: VideoSourceType) { if (type === VideoSourceType.SCREEN) { await this.createScreenShareTrack(); - if(this.localTracks.screenTrack) { + if (this.localTracks.screenTrack) { this.client.unpublish(this.localTracks.videoTrack); this.localTracks.videoTrack?.close(); this.localTracks.videoTrack = undefined; @@ -98,7 +98,7 @@ export class RtcManager extends AGEventEmitter { } } else if (type === VideoSourceType.CAMERA) { await this.createCameraTracks(); - if(this.localTracks.videoTrack) { + if (this.localTracks.videoTrack) { this.client.unpublish(this.localTracks.screenTrack); this.localTracks.screenTrack?.close(); this.localTracks.screenTrack = undefined; @@ -228,18 +228,30 @@ export class RtcManager extends AGEventEmitter { const completeMessage = this.reconstructMessage( this.messageCache[message_id] ); - const { stream_id, is_final, text, text_ts } = JSON.parse( + const { stream_id, is_final, text, text_ts, data_type } = JSON.parse( atob(completeMessage) ); - const textItem: ITextItem = { - uid: `${stream_id}`, - time: text_ts, - dataType: "transcribe", - text: text, - isFinal: is_final, - }; - - if (text.trim().length > 0) { + let 
textItem: ITextItem; + + if (data_type === "raw") { + textItem = { + uid: `${stream_id}`, + time: text_ts, + dataType: "image_url", + text: text, + isFinal: is_final, + } + } else { + textItem = { + uid: `${stream_id}`, + time: text_ts, + dataType: "transcribe", + text: text, + isFinal: is_final, + }; + } + + if (text.trim().length > 0 && textItem) { this.emit("textChanged", textItem); } diff --git a/playground/src/types/index.ts b/playground/src/types/index.ts index 812d4543..2fdf4da6 100644 --- a/playground/src/types/index.ts +++ b/playground/src/types/index.ts @@ -35,7 +35,7 @@ export interface IChatItem { /** @deprecated */ export interface ITextItem { - dataType: "transcribe" | "translate"; + dataType: "transcribe" | "translate" | "image_url"; uid: string; time: number; text: string; From e70473ed9120f773f95d035bf40a8606855e1b1c Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Tue, 31 Dec 2024 11:54:12 +0000 Subject: [PATCH 4/6] feat: init success --- agents/examples/default/property.json | 22 ++++++++++- .../openai_image_generate_tool/extension.py | 2 +- .../src/components/Chat/MessageList.tsx | 8 +++- playground/src/components/Dynamic/RTCCard.tsx | 21 +++-------- playground/src/manager/rtc/rtc.ts | 37 ++++++++++--------- playground/src/manager/rtc/types.ts | 4 +- playground/src/types/index.ts | 6 +++ 7 files changed, 62 insertions(+), 38 deletions(-) diff --git a/agents/examples/default/property.json b/agents/examples/default/property.json index d5814951..75196981 100644 --- a/agents/examples/default/property.json +++ b/agents/examples/default/property.json @@ -729,6 +729,13 @@ "extension_group": "transcriber", "property": {} }, + { + "type": "extension", + "name": "message_collector2", + "addon": "message_collector", + "extension_group": "transcriber", + "property": {} + }, { "type": "extension", "name": "weatherapi_tool_python", @@ -843,7 +850,7 @@ "name": "raw_text_data", "dest": [ { - "extension": "message_collector" + "extension": "message_collector2" } ] } @@ -862,6 +869,19 @@ } ] }, + { + "extension": "message_collector2", + "data": [ + { + "name": "data", + "dest": [ + { + "extension": "agora_rtc" + } + ] + } + ] + }, { "extension": "tts", "cmd": [ diff --git a/agents/ten_packages/extension/openai_image_generate_tool/extension.py b/agents/ten_packages/extension/openai_image_generate_tool/extension.py index e198f8f3..e9d976e6 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/extension.py +++ b/agents/ten_packages/extension/openai_image_generate_tool/extension.py @@ -64,6 +64,6 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool ten_env.log_info(f"Generated image: {response_url}") result = LLMToolResultDirectRawResponse( type="direct_raw_response", - content=json.dumps({"data":{"image_url": response_url}, "type": "image_url"}), + content={"data":{"image_url": response_url}, "type": "image_url"}, ) return result diff --git a/playground/src/components/Chat/MessageList.tsx b/playground/src/components/Chat/MessageList.tsx index ce1e0b82..c2612e1f 100644 --- a/playground/src/components/Chat/MessageList.tsx +++ b/playground/src/components/Chat/MessageList.tsx @@ -8,7 +8,7 @@ import { isRagGraph, } from "@/common" import { Bot } from "lucide-react" -import { EMessageType, type IChatItem } from "@/types" +import { EMessageDataType, EMessageType, type IChatItem } from "@/types" import { Avatar, AvatarFallback } from "@/components/ui/avatar" import { cn } from "@/lib/utils" @@ -50,7 +50,11 @@ export function MessageItem(props: { data: 
IChatItem }) {
-          <p>{data.text}</p>
+          {data.data_type === EMessageDataType.IMAGE ? (
+            <img src={data.text} alt="chat" />
+          ) : (
+            <p>{data.text}</p>
+          )}
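The raw_text_data path in this patch ends with the playground decoding a JSON payload and switching the chat item from text to an image. A rough TypeScript sketch of that consumer side follows; it is not part of the patch, `decodeChatItem`, `ChatItem`, and `RawImagePayload` are illustrative names, and the shapes only loosely follow playground/src/types/index.ts.

// Illustrative sketch only: turning a reassembled message_collector payload
// into a renderable chat item, mirroring the data_type === "raw" handling in rtc.ts.

type ChatItem = {
  userId: string;
  text: string;
  dataType: "text" | "image";
  isFinal: boolean;
  time: number;
};

type RawImagePayload = {
  type: string;
  data: { image_url: string };
};

function decodeChatItem(completeMessage: string): ChatItem {
  // The reassembled base64 chunks decode to the JSON built in
  // message_collector/src/extension.py (stream_id, is_final, text, text_ts, data_type).
  const { stream_id, is_final, text, text_ts, data_type } = JSON.parse(
    atob(completeMessage)
  );

  const item: ChatItem = {
    userId: `${stream_id}`,
    text,
    dataType: "text",
    isFinal: is_final,
    time: text_ts,
  };

  if (data_type === "raw") {
    // raw_text_data carries a JSON string such as
    // {"data": {"image_url": "https://..."}, "type": "image_url"}
    const payload: RawImagePayload = JSON.parse(text);
    if (payload.type === "image_url") {
      item.dataType = "image";
      item.text = payload.data.image_url; // the UI can use this as an image source
    }
  }

  return item;
}

The actual wiring lives in the rtc.ts changes below; the sketch only captures the payload contract: a JSON string of the form {"data": {"image_url": ...}, "type": "image_url"} arriving with data_type === "raw".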
diff --git a/playground/src/components/Dynamic/RTCCard.tsx b/playground/src/components/Dynamic/RTCCard.tsx index 8bba345d..62f984cc 100644 --- a/playground/src/components/Dynamic/RTCCard.tsx +++ b/playground/src/components/Dynamic/RTCCard.tsx @@ -4,7 +4,7 @@ import * as React from "react" import { cn } from "@/lib/utils" import { ICameraVideoTrack, ILocalVideoTrack, IMicrophoneAudioTrack } from "agora-rtc-sdk-ng" import { useAppSelector, useAppDispatch, VOICE_OPTIONS, VideoSourceType } from "@/common" -import { ITextItem, EMessageType } from "@/types" +import { ITextItem, EMessageType, IChatItem } from "@/types" import { rtcManager, IUserTracks, IRtcUser } from "@/manager" import { setRoomConnected, @@ -98,20 +98,11 @@ export default function RTCCard(props: { className?: string }) { } } - const onTextChanged = (text: ITextItem) => { + const onTextChanged = (text: IChatItem) => { console.log("[rtc] onTextChanged", text) - if (text.dataType == "transcribe") { - const isAgent = Number(text.uid) != Number(userId) - dispatch( - addChatItem({ - userId: text.uid, - text: text.text, - type: isAgent ? EMessageType.AGENT : EMessageType.USER, - isFinal: text.isFinal, - time: text.time, - }), - ) - } + dispatch( + addChatItem(text), + ) } const onVoiceChange = (value: any) => { @@ -138,7 +129,7 @@ export default function RTCCard(props: { className?: string }) { {/* -- You */}
- { localTracks: IUserTracks; appId: string | null = null; token: string | null = null; + userId: number | null = null; constructor() { super(); @@ -45,6 +46,7 @@ export class RtcManager extends AGEventEmitter { const { appId, token } = data; this.appId = appId; this.token = token; + this.userId = userId; await this.client?.join(appId, channel, token, userId); this._joined = true; } @@ -231,27 +233,28 @@ export class RtcManager extends AGEventEmitter { const { stream_id, is_final, text, text_ts, data_type } = JSON.parse( atob(completeMessage) ); - let textItem: ITextItem; + const isAgent = Number(stream_id) != Number(this.userId) + let textItem: IChatItem = { + type: isAgent ? EMessageType.AGENT : EMessageType.USER, + time: text_ts, + text: text, + data_type: EMessageDataType.TEXT, + userId: stream_id, + isFinal: is_final, + };; if (data_type === "raw") { - textItem = { - uid: `${stream_id}`, - time: text_ts, - dataType: "image_url", - text: text, - isFinal: is_final, + let { data, type } = JSON.parse(text); + if (type === "image_url") { + textItem = { + ...textItem, + data_type: EMessageDataType.IMAGE, + text: data.image_url, + }; } - } else { - textItem = { - uid: `${stream_id}`, - time: text_ts, - dataType: "transcribe", - text: text, - isFinal: is_final, - }; } - if (text.trim().length > 0 && textItem) { + if (text.trim().length > 0) { this.emit("textChanged", textItem); } diff --git a/playground/src/manager/rtc/types.ts b/playground/src/manager/rtc/types.ts index 042d17cd..1f27f3a2 100644 --- a/playground/src/manager/rtc/types.ts +++ b/playground/src/manager/rtc/types.ts @@ -7,7 +7,7 @@ import { NetworkQuality, ILocalVideoTrack, } from "agora-rtc-sdk-ng" -import { ITextItem } from "@/types" +import { IChatItem, ITextItem } from "@/types" export interface IRtcUser extends IUserTracks { userId: UID @@ -17,7 +17,7 @@ export interface RtcEvents { remoteUserChanged: (user: IRtcUser) => void localTracksChanged: (tracks: IUserTracks) => void networkQuality: (quality: NetworkQuality) => void - textChanged: (text: ITextItem) => void + textChanged: (text: IChatItem) => void } export interface IUserTracks { diff --git a/playground/src/types/index.ts b/playground/src/types/index.ts index 2fdf4da6..a7887ade 100644 --- a/playground/src/types/index.ts +++ b/playground/src/types/index.ts @@ -24,10 +24,16 @@ export enum EMessageType { USER = "user", } +export enum EMessageDataType { + TEXT = "text", + IMAGE = "image", +} + export interface IChatItem { userId: number | string; userName?: string; text: string; + data_type: EMessageDataType; type: EMessageType; isFinal?: boolean; time: number; From 10e33388d5e772b17c8812a33ed31557460ecabb Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Tue, 7 Jan 2025 17:33:10 +0000 Subject: [PATCH 5/6] feat: revert changes; support normal tool result --- .../openai_chatgpt_python/extension.py | 95 +++++++++++-------- .../openai_image_generate_tool/extension.py | 6 +- .../extension/vision_tool_python/extension.py | 4 +- .../weatherapi_tool_python/extension.py | 14 +-- .../ten_ai_base/interface/ten_ai_base/llm.py | 4 +- .../interface/ten_ai_base/types.py | 13 +-- 6 files changed, 74 insertions(+), 62 deletions(-) diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 2c825924..520c0b22 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -26,8 +26,6 @@ LLMDataCompletionArgs, 
LLMToolMetadata, LLMToolResult, - LLMToolResultDirectRawResponse, - LLMToolResultDirectSpeechResponse, LLMToolResultRequery, ) @@ -166,43 +164,44 @@ async def on_data_chat_completion( kmessages: Iterable[LLMChatCompletionUserMessageParam] = kargs.get( "messages", [] ) - kmessage = next(iter(kmessages), None) - if not kmessage: + if len(kmessages) == 0: async_ten_env.log_error("No message in data") return - message = self.message_to_dict(kmessage) + messages = [] + for message in kmessages: + messages = messages + [self.message_to_dict(message)] self.memory_cache = [] memory = self.memory try: - async_ten_env.log_info(f"for input text: [{message}] memory: {memory}") + async_ten_env.log_info(f"for input text: [{messages}] memory: {memory}") tools = None no_tool = kargs.get("no_tool", False) - if ( - not isinstance(message.get("content"), str) - and message.get("role") == "user" - ): - non_artifact_content = [ - item - for item in message.get("content", []) - if item.get("type") == "text" - ] - non_artifact_message = { - "role": message.get("role"), - "content": non_artifact_content, - } - self.memory_cache = self.memory_cache + [ - non_artifact_message, - {"role": "assistant", "content": ""}, - ] - else: - self.memory_cache = self.memory_cache + [ - message, - {"role": "assistant", "content": ""}, - ] + for message in messages: + if ( + not isinstance(message.get("content"), str) + and message.get("role") == "user" + ): + non_artifact_content = [ + item + for item in message.get("content", []) + if item.get("type") == "text" + ] + non_artifact_message = { + "role": message.get("role"), + "content": non_artifact_content, + } + self.memory_cache = self.memory_cache + [ + non_artifact_message, + ] + else: + self.memory_cache = self.memory_cache + [ + message, + ] + self.memory_cache = self.memory_cache + [{"role": "assistant", "content": ""}] tools = None if not no_tool and len(self.available_tools) > 0: @@ -241,10 +240,25 @@ async def handle_tool_call(tool_call): async_ten_env.log_info(f"tool_result: {tool_result}") - if tool_result["type"] == "direct_raw_response": - self.send_raw_text_output(async_ten_env, json.dumps(tool_result["content"]), True) - elif tool_result["type"] == "direct_speech_response": - pass + if tool_result["type"] == "normal": + result_content = tool_result["content"] + if isinstance(result_content, str): + tool_message = { + "role": "assistant", + "tool_calls": [tool_call], + } + new_message = { + "role": "tool", + "content": result_content, + "tool_call_id": tool_call["id"], + } + await self.queue_input_item( + True, messages=[tool_message, new_message], no_tool=True + ) + else: + async_ten_env.log_error( + f"Unknown tool result content: {result_content}" + ) elif tool_result["type"] == "requery": # self.memory_cache = [] self.memory_cache.pop() @@ -295,20 +309,20 @@ async def handle_content_finished(_: str): # Make an async API call to get chat completions await self.client.get_chat_completions_stream( - memory + [message], tools, listener + memory + messages, tools, listener ) # Wait for the content to be finished await content_finished_event.wait() async_ten_env.log_info( - f"Chat completion finished for input text: {message}" + f"Chat completion finished for input text: {messages}" ) except asyncio.CancelledError: - async_ten_env.log_info(f"Task cancelled: {message}") + async_ten_env.log_info(f"Task cancelled: {messages}") except Exception: async_ten_env.log_error( - f"Error in chat_completion: {traceback.format_exc()} for input text: {message}" + f"Error in 
chat_completion: {traceback.format_exc()} for input text: {messages}" ) finally: self.send_text_output(async_ten_env, "", True) @@ -355,10 +369,11 @@ def _convert_tools_to_dict(self, tool: LLMToolMetadata): return json_dict def message_to_dict(self, message: LLMChatCompletionMessageParam): - if isinstance(message["content"], str): - message["content"] = str(message["content"]) - else: - message["content"] = list(message["content"]) + if message.get("content") is not None: + if isinstance(message["content"], str): + message["content"] = str(message["content"]) + else: + message["content"] = list(message["content"]) return message def _append_memory(self, message: str): diff --git a/agents/ten_packages/extension/openai_image_generate_tool/extension.py b/agents/ten_packages/extension/openai_image_generate_tool/extension.py index e9d976e6..a39629be 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/extension.py +++ b/agents/ten_packages/extension/openai_image_generate_tool/extension.py @@ -11,7 +11,7 @@ from ten_ai_base import ( AsyncLLMToolBaseExtension, LLMToolMetadata, LLMToolResult ) -from ten_ai_base.types import LLMChatCompletionContentPartImageParam, LLMToolMetadataParameter, LLMToolResultDirectRawResponse +from ten_ai_base.types import LLMChatCompletionContentPartImageParam, LLMToolMetadataParameter, LLMToolResultNormal from .openai import OpenAIImageGenerateClient, OpenAIImageGenerateToolConfig class OpenAIImageGenerateToolExtension(AsyncLLMToolBaseExtension): @@ -62,8 +62,8 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool # call OpenAIImageGenerateClient to generate images response_url = await self.client.generate_images(prompt) ten_env.log_info(f"Generated image: {response_url}") - result = LLMToolResultDirectRawResponse( - type="direct_raw_response", + result = LLMToolResultNormal( + type="normal", content={"data":{"image_url": response_url}, "type": "image_url"}, ) return result diff --git a/agents/ten_packages/extension/vision_tool_python/extension.py b/agents/ten_packages/extension/vision_tool_python/extension.py index 9f1a97bc..43522af4 100644 --- a/agents/ten_packages/extension/vision_tool_python/extension.py +++ b/agents/ten_packages/extension/vision_tool_python/extension.py @@ -145,7 +145,9 @@ async def run_tool( content=[ LLMChatCompletionContentPartImageParam( type="image_url", - image_url=base64_image, + image_url={ + "url": base64_image, + }, ) ], ) diff --git a/agents/ten_packages/extension/weatherapi_tool_python/extension.py b/agents/ten_packages/extension/weatherapi_tool_python/extension.py index 54efca36..2bf1095c 100644 --- a/agents/ten_packages/extension/weatherapi_tool_python/extension.py +++ b/agents/ten_packages/extension/weatherapi_tool_python/extension.py @@ -17,7 +17,7 @@ from ten.async_ten_env import AsyncTenEnv from ten_ai_base.config import BaseConfig from ten_ai_base import AsyncLLMToolBaseExtension -from ten_ai_base.types import LLMToolMetadata, LLMToolMetadataParameter, LLMToolResult, LLMToolResultRequery +from ten_ai_base.types import LLMToolMetadata, LLMToolMetadataParameter, LLMToolResult, LLMToolResultNormal CMD_TOOL_REGISTER = "tool_register" CMD_TOOL_CALL = "tool_call" @@ -172,22 +172,22 @@ async def run_tool( ten_env.log_info(f"run_tool name: {name}, args: {args}") if name == CURRENT_TOOL_NAME: result = await self._get_current_weather(args) - return LLMToolResultRequery( - type="requery", + return LLMToolResultNormal( + type="normal", content=json.dumps(result), ) elif name == 
HISTORY_TOOL_NAME: result = await self._get_past_weather(args) # result = LLMCompletionContentItemText(text="I see something") - return LLMToolResultRequery( - type="requery", + return LLMToolResultNormal( + type="normal", content=json.dumps(result), ) elif name == FORECAST_TOOL_NAME: result = await self._get_future_weather(args) # result = LLMCompletionContentItemText(text="I see something") - return LLMToolResultRequery( - type="requery", + return LLMToolResultNormal( + type="normal", content=json.dumps(result), ) diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py index a81cfe9b..44af592a 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py @@ -189,5 +189,5 @@ async def _process_queue(self, async_ten_env: AsyncTenEnv): await self.current_task # Wait for the current task to finish or be cancelled except asyncio.CancelledError: async_ten_env.log_info(f"Task cancelled: {args}") - except Exception as err: - async_ten_env.log_error(f"Task failed: {args}, err: {err}") + except Exception: + async_ten_env.log_error(f"Task failed: {args}, err: {traceback.format_exc()}") diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py index 2119fffb..5d6493e2 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/types.py @@ -99,18 +99,13 @@ class LLMToolResultRequery(TypedDict, total=False): type: Required[Literal["requery"]] content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]] -class LLMToolResultDirectSpeechResponse(TypedDict, total=False): - type: Required[Literal["direct_speech_response"]] - content: Required[str] - -class LLMToolResultDirectRawResponse(TypedDict, total=False): - type: Required[Literal["direct_raw_response"]] - content: Required[str] +class LLMToolResultNormal(TypedDict, total=False): + type: Required[Literal["normal"]] + content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]] LLMToolResult: TypeAlias = Union[ LLMToolResultRequery, - LLMToolResultDirectSpeechResponse, - LLMToolResultDirectRawResponse, + LLMToolResultNormal, ] class LLMCallCompletionArgs(TypedDict, total=False): From aea809d465f4b07ed4d04e3eb6231db880c5ac27 Mon Sep 17 00:00:00 2001 From: Ethan Zhang Date: Tue, 7 Jan 2025 18:58:02 +0000 Subject: [PATCH 6/6] fix: done adapting --- agents/examples/default/property.json | 20 +++++++------- .../message_collector/src/extension.py | 15 +++++------ .../openai_chatgpt_python/extension.py | 5 +++- .../openai_chatgpt_python/manifest.json | 8 ------ .../openai_image_generate_tool/extension.py | 27 ++++++++++++++++++- .../openai_image_generate_tool/manifest.json | 10 +++++++ .../ten_ai_base/interface/ten_ai_base/llm.py | 16 ----------- playground/src/manager/rtc/rtc.ts | 1 + 8 files changed, 59 insertions(+), 43 deletions(-) diff --git a/agents/examples/default/property.json b/agents/examples/default/property.json index 75196981..1b0ab506 100644 --- a/agents/examples/default/property.json +++ b/agents/examples/default/property.json @@ -698,7 +698,7 @@ "max_memory_length": 10, "max_tokens": 512, "model": "${env:OPENAI_MODEL}", - "prompt": "You are an ai agent bot producing child picture books. 
Each response should be short and no more than 50 words as it's for child. \nFor each response, you will use the 'image_generate' tool to create an image based on the description or key moment in that part of the story. The story should be set in a fantasy world. Try asking questions relevant to the story to decide how the story should proceed. Each response should include rich, vivid descriptions that will guide the 'image_generate' tool to produce an image that aligns with the scene or mood.\n Whether it’s the setting, a character’s expression, or a dramatic moment, the paragraph should give enough detail for a meaningful visual representation.", + "prompt": "You are an ai agent bot producing child picture books. Each response should be short and no more than 50 words as it's for child. \nFor every response relevant to the story-telling, you will use the 'image_generate' tool to create an image based on the description or key moment in that part of the story. \n The story should be set in a fantasy world. Try asking questions relevant to the story to decide how the story should proceed. Every response should include rich, vivid descriptions that will guide the 'image_generate' tool to produce an image that aligns with the scene or mood.\n Whether it’s the setting, a character’s expression, or a dramatic moment, the paragraph should give enough detail for a meaningful visual representation.", "proxy_url": "${env:OPENAI_PROXY_URL}" } }, @@ -845,14 +845,6 @@ "extension": "message_collector" } ] - }, - { - "name": "raw_text_data", - "dest": [ - { - "extension": "message_collector2" - } - ] } ] }, @@ -952,6 +944,16 @@ } ] } + ], + "data": [ + { + "name": "raw_text_data", + "dest": [ + { + "extension": "message_collector2" + } + ] + } ] } ] diff --git a/agents/ten_packages/extension/message_collector/src/extension.py b/agents/ten_packages/extension/message_collector/src/extension.py index 450b0856..f90638e1 100644 --- a/agents/ten_packages/extension/message_collector/src/extension.py +++ b/agents/ten_packages/extension/message_collector/src/extension.py @@ -32,8 +32,6 @@ TEXT_DATA_STREAM_ID_FIELD = "stream_id" TEXT_DATA_END_OF_SEGMENT_FIELD = "end_of_segment" -# record the cached text data for each stream id -cached_text_map = {} MAX_CHUNK_SIZE_BYTES = 1024 @@ -104,6 +102,7 @@ def __init__(self, name: str): super().__init__(name) self.queue = asyncio.Queue() self.loop = None + self.cached_text_map = {} def on_init(self, ten_env: TenEnv) -> None: ten_env.log_info("on_init") @@ -191,15 +190,15 @@ def on_data(self, ten_env: TenEnv, data: Data) -> None: # We cache all final text data and append the non-final text data to the cached data # until the end of the segment. 
if end_of_segment: - if stream_id in cached_text_map: - text = cached_text_map[stream_id] + text - del cached_text_map[stream_id] + if stream_id in self.cached_text_map: + text = self.cached_text_map[stream_id] + text + del self.cached_text_map[stream_id] else: if final: - if stream_id in cached_text_map: - text = cached_text_map[stream_id] + text + if stream_id in self.cached_text_map: + text = self.cached_text_map[stream_id] + text - cached_text_map[stream_id] = text + self.cached_text_map[stream_id] = text # Generate a unique message ID for this batch of parts message_id = str(uuid.uuid4())[:8] diff --git a/agents/ten_packages/extension/openai_chatgpt_python/extension.py b/agents/ten_packages/extension/openai_chatgpt_python/extension.py index 520c0b22..79c4e424 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/extension.py +++ b/agents/ten_packages/extension/openai_chatgpt_python/extension.py @@ -378,5 +378,8 @@ def message_to_dict(self, message: LLMChatCompletionMessageParam): def _append_memory(self, message: str): if len(self.memory) > self.config.max_memory_length: - self.memory.pop(0) + removed_item = self.memory.pop(0) + # Remove tool calls from memory + if removed_item.get("tool_calls") and self.memory[0].get("role") == "tool": + self.memory.pop(0) self.memory.append(message) diff --git a/agents/ten_packages/extension/openai_chatgpt_python/manifest.json b/agents/ten_packages/extension/openai_chatgpt_python/manifest.json index b955f5b8..f71d0d76 100644 --- a/agents/ten_packages/extension/openai_chatgpt_python/manifest.json +++ b/agents/ten_packages/extension/openai_chatgpt_python/manifest.json @@ -85,14 +85,6 @@ "type": "string" } } - }, - { - "name": "raw_text_data", - "property": { - "text": { - "type": "string" - } - } } ], "cmd_in": [ diff --git a/agents/ten_packages/extension/openai_image_generate_tool/extension.py b/agents/ten_packages/extension/openai_image_generate_tool/extension.py index a39629be..e5635c36 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/extension.py +++ b/agents/ten_packages/extension/openai_image_generate_tool/extension.py @@ -3,14 +3,17 @@ # Licensed under the Apache License, Version 2.0. # See the LICENSE file for more information. # +import asyncio import json from ten import ( + Data, TenEnv, AsyncTenEnv, ) from ten_ai_base import ( AsyncLLMToolBaseExtension, LLMToolMetadata, LLMToolResult ) +from ten_ai_base.const import DATA_OUT_PROPERTY_END_OF_SEGMENT, DATA_OUT_PROPERTY_TEXT, RAW_DATA_OUT_NAME from ten_ai_base.types import LLMChatCompletionContentPartImageParam, LLMToolMetadataParameter, LLMToolResultNormal from .openai import OpenAIImageGenerateClient, OpenAIImageGenerateToolConfig @@ -52,6 +55,27 @@ def get_tool_metadata(self, ten_env: TenEnv) -> list[LLMToolMetadata]: ) ] + async def send_image(self, async_ten_env: AsyncTenEnv, image_url: str) -> None: + # Implement this method to send the image to the chat. 
+ async_ten_env.log_info(f"Sending image: {image_url}") + try: + sentence = json.dumps({"data":{"image_url": image_url}, "type": "image_url"}) + output_data = Data.create(RAW_DATA_OUT_NAME) + output_data.set_property_string( + DATA_OUT_PROPERTY_TEXT, + sentence + ) + output_data.set_property_bool( + DATA_OUT_PROPERTY_END_OF_SEGMENT, True + ) + asyncio.create_task(async_ten_env.send_data(output_data)) + async_ten_env.log_info( + f"sent sentence [{sentence}]" + ) + except Exception as err: + async_ten_env.log_warn(f"send sentence [{sentence}] failed, err: {err}") + + async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMToolResult | None: ten_env.log_info(f"run_tool {name} {args}") if name == "image_generate": @@ -62,8 +86,9 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMTool # call OpenAIImageGenerateClient to generate images response_url = await self.client.generate_images(prompt) ten_env.log_info(f"Generated image: {response_url}") + await self.send_image(ten_env, response_url) result = LLMToolResultNormal( type="normal", - content={"data":{"image_url": response_url}, "type": "image_url"}, + content=json.dumps({"success": True}), ) return result diff --git a/agents/ten_packages/extension/openai_image_generate_tool/manifest.json b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json index 6f5c5342..b04f49be 100644 --- a/agents/ten_packages/extension/openai_image_generate_tool/manifest.json +++ b/agents/ten_packages/extension/openai_image_generate_tool/manifest.json @@ -102,6 +102,16 @@ } } } + ], + "data_out": [ + { + "name": "raw_text_data", + "property": { + "text": { + "type": "string" + } + } + } ] } } \ No newline at end of file diff --git a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py index 44af592a..b90d0914 100644 --- a/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py +++ b/agents/ten_packages/system/ten_ai_base/interface/ten_ai_base/llm.py @@ -117,22 +117,6 @@ async def flush_input_items(self, async_ten_env: AsyncTenEnv): async_ten_env.log_info("Cancelling the current task during flush.") self.current_task.cancel() - def send_raw_text_output( - self, async_ten_env: AsyncTenEnv, sentence: str, end_of_segment: bool - ): - try: - output_data = Data.create(RAW_DATA_OUT_NAME) - output_data.set_property_string(DATA_OUT_PROPERTY_TEXT, sentence) - output_data.set_property_bool( - DATA_OUT_PROPERTY_END_OF_SEGMENT, end_of_segment - ) - asyncio.create_task(async_ten_env.send_data(output_data)) - async_ten_env.log_info( - f"{'end of segment ' if end_of_segment else ''}sent raw sentence [{sentence}]" - ) - except Exception as err: - async_ten_env.log_warn(f"send sentence [{sentence}] failed, err: {err}") - def send_text_output( self, async_ten_env: AsyncTenEnv, sentence: str, end_of_segment: bool ): diff --git a/playground/src/manager/rtc/rtc.ts b/playground/src/manager/rtc/rtc.ts index 65242322..ed6dcd9f 100644 --- a/playground/src/manager/rtc/rtc.ts +++ b/playground/src/manager/rtc/rtc.ts @@ -233,6 +233,7 @@ export class RtcManager extends AGEventEmitter { const { stream_id, is_final, text, text_ts, data_type } = JSON.parse( atob(completeMessage) ); + console.log(`[test] message_id: ${message_id} stream_id: ${stream_id}, text: ${text}, data_type: ${data_type}`); const isAgent = Number(stream_id) != Number(this.userId) let textItem: IChatItem = { type: isAgent ? EMessageType.AGENT : EMessageType.USER,