Merge pull request #566 from zhuermu/main

feat: Update the plugin bedrockPython to support video input, enablin…
TEN-framework · Jan 20, 2025 · 984265d · 984265d
2 parents 5f6657d + c355566
commit 984265d
Show file tree

Hide file tree

Showing 18 changed files with 1,139 additions and 497 deletions.
diff --git a/agents/examples/demo/property.json b/agents/examples/demo/property.json
@@ -1491,6 +1491,252 @@
             ]
           }
         ]
+      },
+      {
+        "name": "va_nova_multimodal_aws",
+        "auto_start": true,
+        "nodes": [
+          {
+            "type": "extension",
+            "name": "agora_rtc",
+            "addon": "agora_rtc",
+            "extension_group": "default",
+            "property": {
+              "app_id": "${env:AGORA_APP_ID}",
+              "token": "<agora_token>",
+              "channel": "ten_agent_test",
+              "stream_id": 1234,
+              "remote_stream_id": 123,
+              "subscribe_audio": true,
+              "publish_audio": true,
+              "publish_data": true,
+              "enable_agora_asr": false,
+              "agora_asr_vendor_name": "microsoft",
+              "agora_asr_language": "en-US",
+              "agora_asr_vendor_key": "${env:AZURE_STT_KEY|}",
+              "agora_asr_vendor_region": "${env:AZURE_STT_REGION|}",
+              "agora_asr_session_control_file_path": "session_control.conf",
+              "subscribe_video_pix_fmt": 4,
+              "subscribe_video": true,
+              "max_memory_length":10
+            }
+          },
+          {
+            "type": "extension",
+            "name": "stt",
+            "addon": "transcribe_asr_python",
+            "extension_group": "stt",
+            "property": {
+              "access_key": "${env:AWS_ACCESS_KEY_ID}",
+              "lang_code": "en-US",
+              "region": "us-east-1",
+              "sample_rate": "16000",
+              "secret_key": "${env:AWS_SECRET_ACCESS_KEY}"
+            }
+          },
+          {
+            "type": "extension",
+            "name": "llm",
+            "addon": "bedrock_llm_python",
+            "extension_group": "chatgpt",
+            "property": {
+              "access_key_id": "${env:AWS_ACCESS_KEY_ID}",
+              "greeting": "TEN Agent connected. I am nova, How can I help you today?",
+              "max_memory_length": 10,
+              "max_tokens": 256,
+              "model": "us.amazon.nova-lite-v1:0",
+              "prompt": "Now you are an intelligent assistant with real-time interaction capabilities. I will provide you with a series of real-time video image information. Please understand these images as video frames. Based on the images and the user's input, engage in a conversation with the user, remembering the dialogue content in a concise and clear manner.",
+              "region": "us-east-1",
+              "secret_access_key": "${env:AWS_SECRET_ACCESS_KEY}",
+              "temperature": 0.7,
+              "topK": 10,
+              "topP": 0.5,
+              "is_memory_enabled": false,
+              "is_enable_video": true
+            }
+          },
+          {
+            "type": "extension",
+            "name": "tts",
+            "addon": "polly_tts",
+            "extension_group": "tts",
+            "property": {
+              "region": "us-east-1",
+              "access_key": "${env:AWS_ACCESS_KEY_ID}",
+              "secret_key": "${env:AWS_SECRET_ACCESS_KEY}",
+              "engine": "generative",
+              "voice": "Ruth",
+              "sample_rate": 16000,
+              "lang_code": "en-US"
+            }
+          },
+          {
+            "type": "extension",
+            "name": "interrupt_detector",
+            "addon": "interrupt_detector_python",
+            "extension_group": "default",
+            "property": {}
+          },
+          {
+            "type": "extension",
+            "name": "message_collector",
+            "addon": "message_collector",
+            "extension_group": "transcriber",
+            "property": {}
+          }
+        ],
+        "connections": [
+          {
+            "extension": "agora_rtc",
+            "cmd": [
+              {
+                "name": "on_user_joined",
+                "dest": [
+                  {
+                    "extension": "llm"
+                  }
+                ]
+              },
+              {
+                "name": "on_user_left",
+                "dest": [
+                  {
+                    "extension": "llm"
+                  }
+                ]
+              },
+              {
+                "name": "on_connection_failure",
+                "dest": [
+                  {
+                    "extension": "llm"
+                  }
+                ]
+              }
+            ],
+            "audio_frame": [
+              {
+                "name": "pcm_frame",
+                "dest": [
+                  {
+                    "extension": "stt"
+                  }
+                ]
+              }
+            ],
+            "video_frame": [
+              {
+                "name": "video_frame",
+                "dest": [
+                  {
+                    "extension": "llm"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "extension": "stt",
+            "data": [
+              {
+                "name": "text_data",
+                "dest": [
+                  {
+                    "extension": "interrupt_detector"
+                  },
+                  {
+                    "extension": "message_collector"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "extension": "llm",
+            "cmd": [
+              {
+                "name": "flush",
+                "dest": [
+                  {
+                    "extension": "tts"
+                  }
+                ]
+              }
+            ],
+            "data": [
+              {
+                "name": "text_data",
+                "dest": [
+                  {
+                    "extension": "tts"
+                  },
+                  {
+                    "extension": "message_collector"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "extension": "message_collector",
+            "data": [
+              {
+                "name": "data",
+                "dest": [
+                  {
+                    "extension": "agora_rtc"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "extension": "tts",
+            "cmd": [
+              {
+                "name": "flush",
+                "dest": [
+                  {
+                    "extension": "agora_rtc"
+                  }
+                ]
+              }
+            ],
+            "audio_frame": [
+              {
+                "name": "pcm_frame",
+                "dest": [
+                  {
+                    "extension": "agora_rtc"
+                  }
+                ]
+              }
+            ]
+          },
+          {
+            "extension": "interrupt_detector",
+            "cmd": [
+              {
+                "name": "flush",
+                "dest": [
+                  {
+                    "extension": "llm"
+                  }
+                ]
+              }
+            ],
+            "data": [
+              {
+                "name": "text_data",
+                "dest": [
+                  {
+                    "extension": "llm"
+                  }
+                ]
+              }
+            ]
+          }
+        ]
       }
     ],
     "log_level": 3

diff --git a/agents/ten_packages/extension/bedrock_llm_python/README.md b/agents/ten_packages/extension/bedrock_llm_python/README.md
@@ -9,4 +9,97 @@ You can config this extension by providing following environments:
 | AWS_REGION | No | us-east-1 | The Region of Amazon Bedrock service you want to use. |
 | AWS_ACCESS_KEY_ID | No | - | Access Key of your IAM User, make sure you've set proper permissions to [invoke Bedrock models](https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html) and gain [models access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) in Bedrock. Will use default credentials provider if not provided. Check [document](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html).  |
 | AWS_SECRET_ACCESS_KEY | No | - | Secret Key of your IAM User, make sure you've set proper permissions to [invoke Bedrock models](https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html) and gain [models access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) in Bedrock. Will use default credentials provider if not provided. Check [document](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html). |
-| AWS_BEDROCK_MODEL | No | Claude 3.5(anthropic.claude-3-5-sonnet-20240620-v1:0) | Bedrock model id, check [docuement](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html#model-ids-arns).  |
+| AWS_BEDROCK_MODEL | No | [Nova](https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html) | Bedrock model ID; check the [documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html#model-ids-arns).  |
+
+## Features
+
+- Real-time video and audio interaction similar to Gemini 2.0
+- Audio recognition using TEN framework's STT plugin
+- Text-to-speech conversion using TEN framework's TTS plugin
+- Integration with AWS Bedrock's Nova model
+- Smart input truncation logic
+- Multi-language support
+
+## Requirements
+- Python 3.9+
+- AWS account with Bedrock access
+- TEN framework with STT and TTS plugins
+- Dependencies listed in requirements.txt
+
+## Installation
+
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+2. Configure AWS credentials:
+- Set up AWS credentials with Bedrock access
+- Provide the AWS access key ID and secret access key in the extension configuration
+
+## Configuration
+
+The extension can be configured through manifest.json properties:
+- `base_uri`: Bedrock API endpoint
+- `region`: AWS region for Bedrock
+- `access_key_id`: AWS access key ID
+- `secret_access_key`: AWS secret access key
+- `model`: Bedrock Nova model ID (for example `us.amazon.nova-lite-v1:0`)
+- `language`: Language code for STT/TTS
+- See manifest.json for full configuration options
+
+## Input Truncation Logic
+
+The extension implements smart input truncation:
+
+1. Duration-based truncation:
+   - Automatically truncates input exceeding 30 seconds
+
+2. Silence-based truncation:
+   - Triggers when silence exceeds 2 seconds
+
+3. Manual truncation:
+   - Supports user-initiated truncation
+
+## Architecture
+
+1. Audio Processing:
+   - Uses TEN framework's STT plugin for audio recognition
+   - Buffers and processes audio in real-time
+   - Provides intermediate and final transcripts
+
+2. Nova Model Integration:
+   - Combines transcribed text with video input
+   - Sends to Bedrock's Nova model for processing
+   - Handles responses and error conditions
+
+3. Speech Synthesis:
+   - Converts Nova model responses to speech
+   - Uses TEN framework's TTS plugin
+   - Synchronizes with video output
+
+## API Usage
+
+### Commands
+
+1. Flush Command:
+```python
+cmd = Cmd.create("flush")
+await ten_env.send_cmd(cmd)
+```
+
+2. User Events:
+```python
+# User joined
+cmd = Cmd.create("on_user_joined")
+await ten_env.send_cmd(cmd)
+
+# User left
+cmd = Cmd.create("on_user_left")
+await ten_env.send_cmd(cmd)
+```
+
+## Contributing
+1. Fork the repository
+2. Create a feature branch
+3. Submit a pull request
diff --git a/agents/ten_packages/extension/bedrock_llm_python/__init__.py b/agents/ten_packages/extension/bedrock_llm_python/__init__.py
@@ -1 +1,6 @@
-from . import bedrock_llm_extension
+#
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for more information.
+#
+from . import addon
diff --git a/agents/ten_packages/extension/bedrock_llm_python/addon.py b/agents/ten_packages/extension/bedrock_llm_python/addon.py
@@ -0,0 +1,18 @@
+#
+# This file is part of TEN Framework, an open source project.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for more information.
+#
+from ten import (
+    Addon,
+    register_addon_as_extension,
+    TenEnv,
+)
+from .extension import BedrockLLMExtension
+
+
+@register_addon_as_extension("bedrock_llm_python")
+class LLMExtensionExtensionAddon(Addon):
+    def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None:
+        ten_env.log_info("on_create_instance")
+        ten_env.on_create_instance_done(BedrockLLMExtension(name), context)