Commit 3a026f6

feat: support tool
plutoless committed Dec 31, 2024
1 parent ef24ca2 commit 3a026f6
Showing 9 changed files with 65 additions and 33 deletions.
28 changes: 18 additions & 10 deletions agents/examples/default/property.json
@@ -546,35 +546,35 @@
       "connections": [
         {
           "extension": "agora_rtc",
-          "audio_frame": [
+          "cmd": [
             {
-              "name": "pcm_frame",
+              "name": "on_user_joined",
               "dest": [
                 {
                   "extension": "v2v"
                 }
               ]
-            }
-          ],
-          "cmd": [
+            },
             {
-              "name": "on_user_joined",
+              "name": "on_user_left",
               "dest": [
                 {
                   "extension": "v2v"
                 }
               ]
             },
             {
-              "name": "on_user_left",
+              "name": "on_connection_failure",
               "dest": [
                 {
                   "extension": "v2v"
                 }
               ]
-            },
+            }
+          ],
+          "audio_frame": [
             {
-              "name": "on_connection_failure",
+              "name": "pcm_frame",
               "dest": [
                 {
                   "extension": "v2v"
@@ -698,7 +698,7 @@
           "max_memory_length": 10,
           "max_tokens": 512,
           "model": "${env:OPENAI_MODEL}",
-          "prompt": "",
+          "prompt": "You are an ai agent bot producing child picture books. Each response should be short and no more than 50 words as it's for child. \nFor each response, you will use the 'image_generate' tool to create an image based on the description or key moment in that part of the story. The story should be set in a fantasy world. Try asking questions relevant to the story to decide how the story should proceed. Each response should include rich, vivid descriptions that will guide the 'image_generate' tool to produce an image that aligns with the scene or mood.\n Whether it’s the setting, a character’s expression, or a dramatic moment, the paragraph should give enough detail for a meaningful visual representation.",
           "proxy_url": "${env:OPENAI_PROXY_URL}"
         }
       },
@@ -838,6 +838,14 @@
               "extension": "message_collector"
             }
           ]
         },
+        {
+          "name": "raw_text_data",
+          "dest": [
+            {
+              "extension": "message_collector"
+            }
+          ]
+        }
       ]
     },
11 changes: 11 additions & 0 deletions agents/ten_packages/extension/message_collector/manifest.json
@@ -40,6 +40,17 @@
           "type": "bool"
         }
       }
+    },
+    {
+      "name": "raw_text_data",
+      "property": {
+        "text": {
+          "type": "string"
+        },
+        "end_of_segment": {
+          "type": "bool"
+        }
+      }
     }
   ],
   "data_out": [
@@ -214,6 +214,10 @@ def on_data(self, ten_env: TenEnv, data: Data) -> None:
             "text": text,
         }

+        # Add the raw data type if the data is raw text data
+        if data.get_name() == "raw_text_data":
+            base_msg_data["data_type"] = "raw"
+
         try:
             chunks = _text_to_base64_chunks(ten_env, json.dumps(base_msg_data), message_id)
             for chunk in chunks:
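To make the effect of the message_collector change concrete, here is a minimal, self-contained sketch of the payload path: the collector JSON-encodes a message dict, tags it with data_type "raw" when the incoming data is named raw_text_data, and the playground later base64-decodes and parses it. The exact field set and the single-shot base64 encoding below are simplifications and assumptions; the real _text_to_base64_chunks helper splits the encoded text into chunks, which the frontend reassembles first.

import base64
import json
import time

# Hypothetical stand-in for the collector's encoding path; the real
# extension chunks the encoded text via _text_to_base64_chunks.
def encode_message(data_name: str, text: str, stream_id: int, is_final: bool) -> str:
    base_msg_data = {
        "stream_id": stream_id,
        "is_final": is_final,
        "text": text,
        "text_ts": int(time.time() * 1000),  # assumed millisecond timestamp
    }
    # The change in this commit: tag raw text data so the frontend can
    # render it differently from ordinary transcripts.
    if data_name == "raw_text_data":
        base_msg_data["data_type"] = "raw"
    return base64.b64encode(json.dumps(base_msg_data).encode("utf-8")).decode("ascii")

# The playground does the equivalent of atob(...) + JSON.parse(...) and
# branches on data_type (see the rtc.ts hunk below).
decoded = json.loads(base64.b64decode(encode_message(
    "raw_text_data",
    json.dumps({"type": "image_url", "data": {"image_url": "https://example.com/a.png"}}),
    stream_id=123,
    is_final=True,
)))
assert decoded["data_type"] == "raw"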
@@ -58,7 +58,7 @@ def __init__(self, name: str):
         self.config = None
         self.client = None
         self.sentence_fragment = ""
-        self.tool_task_future = None
+        self.tool_task_future: asyncio.Future | None = None
         self.users_count = 0

     async def on_init(self, async_ten_env: AsyncTenEnv) -> None:
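The tool_task_future annotation above only narrows the attribute's type; the diff does not show how the future is used. As a rough illustration only (not the extension's actual control flow), such a future is typically created before dispatching a tool call and resolved by a callback when the result arrives:

import asyncio

class ToolCallerSketch:
    """Illustrative only; names and flow are assumptions, not this commit's code."""

    def __init__(self) -> None:
        self.tool_task_future: asyncio.Future | None = None

    async def call_tool(self) -> str:
        loop = asyncio.get_running_loop()
        self.tool_task_future = loop.create_future()
        # In the real extension a result callback would resolve the future;
        # here a timer stands in for that callback.
        loop.call_later(0.05, self.tool_task_future.set_result, "tool result")
        return await self.tool_task_future

async def main() -> None:
    print(await ToolCallerSketch().call_tool())

asyncio.run(main())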
@@ -3,6 +3,7 @@
 # Licensed under the Apache License, Version 2.0.
 # See the LICENSE file for more information.
 #
+import json
 from ten import (
     TenEnv,
     AsyncTenEnv,
@@ -63,11 +64,6 @@ async def run_tool(self, ten_env: AsyncTenEnv, name: str, args: dict) -> LLMToolResult:
         ten_env.log_info(f"Generated image: {response_url}")
         result = LLMToolResultDirectRawResponse(
             type="direct_raw_response",
-            content=[
-                LLMChatCompletionContentPartImageParam(
-                    type="image_url",
-                    image_url=response_url
-                )
-            ]
+            content=json.dumps({"data":{"image_url": response_url}, "type": "image_url"}),
         )
         return result
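With this change the tool result's content is a plain JSON string rather than a list of content-part objects. A minimal sketch of the new payload shape and of how a downstream consumer might read it back (the consumer side is an assumption, not code from this commit):

import json

response_url = "https://example.com/generated.png"  # placeholder URL

# Shape produced by run_tool after this commit, mirroring
# LLMToolResultDirectRawResponse with content as a string.
result = {
    "type": "direct_raw_response",
    "content": json.dumps({"data": {"image_url": response_url}, "type": "image_url"}),
}

# Assumed consumer: a single json.loads recovers the structured payload.
payload = json.loads(result["content"])
if payload["type"] == "image_url":
    print(payload["data"]["image_url"])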
@@ -57,7 +57,8 @@
           "type": "string"
         },
         "arguments": {
-          "type": "object"
+          "type": "object",
+          "properties": {}
         }
       },
       "required": [
@@ -105,7 +105,7 @@ class LLMToolResultDirectSpeechResponse(TypedDict, total=False):

 class LLMToolResultDirectRawResponse(TypedDict, total=False):
     type: Required[Literal["direct_raw_response"]]
-    content: Required[Union[str, Iterable[LLMChatCompletionContentPartParam]]]
+    content: Required[str]

 LLMToolResult: TypeAlias = Union[
     LLMToolResultRequery,
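For reference, the narrowed result type can be exercised like this; a minimal sketch assuming only standard typing machinery (the surrounding module defines the other LLMToolResult variants):

import json
from typing import Literal, Required, TypedDict  # Required needs Python 3.11+ (else typing_extensions)

class LLMToolResultDirectRawResponse(TypedDict, total=False):
    type: Required[Literal["direct_raw_response"]]
    content: Required[str]  # previously Union[str, Iterable[LLMChatCompletionContentPartParam]]

# A string-only content field keeps the result trivially serializable.
result: LLMToolResultDirectRawResponse = {
    "type": "direct_raw_response",
    "content": json.dumps({"type": "image_url", "data": {"image_url": "https://example.com/a.png"}}),
}
print(json.dumps(result))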
38 changes: 25 additions & 13 deletions playground/src/manager/rtc/rtc.ts
@@ -86,10 +86,10 @@ export class RtcManager extends AGEventEmitter<RtcEvents> {
     this.emit("localTracksChanged", this.localTracks);
   }

-  async switchVideoSource(type:VideoSourceType) {
+  async switchVideoSource(type: VideoSourceType) {
     if (type === VideoSourceType.SCREEN) {
       await this.createScreenShareTrack();
-      if(this.localTracks.screenTrack) {
+      if (this.localTracks.screenTrack) {
         this.client.unpublish(this.localTracks.videoTrack);
         this.localTracks.videoTrack?.close();
         this.localTracks.videoTrack = undefined;
@@ -98,7 +98,7 @@ export class RtcManager extends AGEventEmitter<RtcEvents> {
       }
     } else if (type === VideoSourceType.CAMERA) {
       await this.createCameraTracks();
-      if(this.localTracks.videoTrack) {
+      if (this.localTracks.videoTrack) {
         this.client.unpublish(this.localTracks.screenTrack);
         this.localTracks.screenTrack?.close();
         this.localTracks.screenTrack = undefined;
@@ -228,18 +228,30 @@ export class RtcManager extends AGEventEmitter<RtcEvents> {
       const completeMessage = this.reconstructMessage(
         this.messageCache[message_id]
       );
-      const { stream_id, is_final, text, text_ts } = JSON.parse(
+      const { stream_id, is_final, text, text_ts, data_type } = JSON.parse(
         atob(completeMessage)
       );
-      const textItem: ITextItem = {
-        uid: `${stream_id}`,
-        time: text_ts,
-        dataType: "transcribe",
-        text: text,
-        isFinal: is_final,
-      };
-
-      if (text.trim().length > 0) {
+
+      let textItem: ITextItem;
+
+      if (data_type === "raw") {
+        textItem = {
+          uid: `${stream_id}`,
+          time: text_ts,
+          dataType: "image_url",
+          text: text,
+          isFinal: is_final,
+        }
+      } else {
+        textItem = {
+          uid: `${stream_id}`,
+          time: text_ts,
+          dataType: "transcribe",
+          text: text,
+          isFinal: is_final,
+        };
+      }
+
+      if (text.trim().length > 0 && textItem) {
         this.emit("textChanged", textItem);
       }

2 changes: 1 addition & 1 deletion playground/src/types/index.ts
@@ -35,7 +35,7 @@ export interface IChatItem {

 /** @deprecated */
 export interface ITextItem {
-  dataType: "transcribe" | "translate";
+  dataType: "transcribe" | "translate" | "image_url";
   uid: string;
   time: number;
   text: string;
