Skip to content

Commit

Permalink
Merge pull request #566 from zhuermu/main
Browse files Browse the repository at this point in the history
feat: Update the plugin bedrockPython to support video input, enablin…
  • Loading branch information
plutoless authored Jan 20, 2025
2 parents 5f6657d + c355566 commit 984265d
Show file tree
Hide file tree
Showing 18 changed files with 1,139 additions and 497 deletions.
246 changes: 246 additions & 0 deletions agents/examples/demo/property.json
Original file line number Diff line number Diff line change
Expand Up @@ -1491,6 +1491,252 @@
]
}
]
},
{
"name": "va_nova_multimodal_aws",
"auto_start": true,
"nodes": [
{
"type": "extension",
"name": "agora_rtc",
"addon": "agora_rtc",
"extension_group": "default",
"property": {
"app_id": "${env:AGORA_APP_ID}",
"token": "<agora_token>",
"channel": "ten_agent_test",
"stream_id": 1234,
"remote_stream_id": 123,
"subscribe_audio": true,
"publish_audio": true,
"publish_data": true,
"enable_agora_asr": false,
"agora_asr_vendor_name": "microsoft",
"agora_asr_language": "en-US",
"agora_asr_vendor_key": "${env:AZURE_STT_KEY|}",
"agora_asr_vendor_region": "${env:AZURE_STT_REGION|}",
"agora_asr_session_control_file_path": "session_control.conf",
"subscribe_video_pix_fmt": 4,
"subscribe_video": true,
"max_memory_length":10
}
},
{
"type": "extension",
"name": "stt",
"addon": "transcribe_asr_python",
"extension_group": "stt",
"property": {
"access_key": "${env:AWS_ACCESS_KEY_ID}",
"lang_code": "en-US",
"region": "us-east-1",
"sample_rate": "16000",
"secret_key": "${env:AWS_SECRET_ACCESS_KEY}"
}
},
{
"type": "extension",
"name": "llm",
"addon": "bedrock_llm_python",
"extension_group": "chatgpt",
"property": {
"access_key_id": "${env:AWS_ACCESS_KEY_ID}",
"greeting": "TEN Agent connected. I am nova, How can I help you today?",
"max_memory_length": 10,
"max_tokens": 256,
"model": "us.amazon.nova-lite-v1:0",
"prompt": "Now you are an intelligent assistant with real-time interaction capabilities. I will provide you with a series of real-time video image information. Please understand these images as video frames. Based on the images and the user's input, engage in a conversation with the user, remembering the dialogue content in a concise and clear manner.",
"region": "us-east-1",
"secret_access_key": "${env:AWS_SECRET_ACCESS_KEY}",
"temperature": 0.7,
"topK": 10,
"topP": 0.5,
"is_memory_enabled": false,
"is_enable_video": true
}
},
{
"type": "extension",
"name": "tts",
"addon": "polly_tts",
"extension_group": "tts",
"property": {
"region": "us-east-1",
"access_key": "${env:AWS_ACCESS_KEY_ID}",
"secret_key": "${env:AWS_SECRET_ACCESS_KEY}",
"engine": "generative",
"voice": "Ruth",
"sample_rate": 16000,
"lang_code": "en-US"
}
},
{
"type": "extension",
"name": "interrupt_detector",
"addon": "interrupt_detector_python",
"extension_group": "default",
"property": {}
},
{
"type": "extension",
"name": "message_collector",
"addon": "message_collector",
"extension_group": "transcriber",
"property": {}
}
],
"connections": [
{
"extension": "agora_rtc",
"cmd": [
{
"name": "on_user_joined",
"dest": [
{
"extension": "llm"
}
]
},
{
"name": "on_user_left",
"dest": [
{
"extension": "llm"
}
]
},
{
"name": "on_connection_failure",
"dest": [
{
"extension": "llm"
}
]
}
],
"audio_frame": [
{
"name": "pcm_frame",
"dest": [
{
"extension": "stt"
}
]
}
],
"video_frame": [
{
"name": "video_frame",
"dest": [
{
"extension": "llm"
}
]
}
]
},
{
"extension": "stt",
"data": [
{
"name": "text_data",
"dest": [
{
"extension": "interrupt_detector"
},
{
"extension": "message_collector"
}
]
}
]
},
{
"extension": "llm",
"cmd": [
{
"name": "flush",
"dest": [
{
"extension": "tts"
}
]
}
],
"data": [
{
"name": "text_data",
"dest": [
{
"extension": "tts"
},
{
"extension": "message_collector"
}
]
}
]
},
{
"extension": "message_collector",
"data": [
{
"name": "data",
"dest": [
{
"extension": "agora_rtc"
}
]
}
]
},
{
"extension": "tts",
"cmd": [
{
"name": "flush",
"dest": [
{
"extension": "agora_rtc"
}
]
}
],
"audio_frame": [
{
"name": "pcm_frame",
"dest": [
{
"extension": "agora_rtc"
}
]
}
]
},
{
"extension": "interrupt_detector",
"cmd": [
{
"name": "flush",
"dest": [
{
"extension": "llm"
}
]
}
],
"data": [
{
"name": "text_data",
"dest": [
{
"extension": "llm"
}
]
}
]
}
]
}
],
"log_level": 3
Expand Down
95 changes: 94 additions & 1 deletion agents/ten_packages/extension/bedrock_llm_python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,97 @@ You can config this extension by providing following environments:
| AWS_REGION | No | us-east-1 | The Region of Amazon Bedrock service you want to use. |
| AWS_ACCESS_KEY_ID | No | - | Access Key of your IAM User, make sure you've set proper permissions to [invoke Bedrock models](https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html) and gain [models access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) in Bedrock. Will use default credentials provider if not provided. Check [document](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html). |
| AWS_SECRET_ACCESS_KEY | No | - | Secret Key of your IAM User, make sure you've set proper permissions to [invoke Bedrock models](https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html) and gain [models access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access.html) in Bedrock. Will use default credentials provider if not provided. Check [document](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html). |
| AWS_BEDROCK_MODEL | No | Claude 3.5(anthropic.claude-3-5-sonnet-20240620-v1:0) | Bedrock model id, check [docuement](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html#model-ids-arns). |
| AWS_BEDROCK_MODEL | No | Nova (https://docs.aws.amazon.com/nova/latest/userguide/what-is-nova.html) | Bedrock model ID; see the [documentation](https://docs.aws.amazon.com/bedrock/latest/userguide/model-ids.html#model-ids-arns). |

## Features

- Real-time video and audio interaction similar to Gemini 2.0
- Audio recognition using TEN framework's STT plugin
- Text-to-speech conversion using TEN framework's TTS plugin
- Integration with AWS Bedrock's Nova model
- Smart input truncation logic
- Multi-language support

## Requirements
- Python 3.9+
- AWS account with Bedrock access
- TEN framework with STT and TTS plugins
- Dependencies listed in requirements.txt

## Installation

1. Install dependencies:
```bash
pip install -r requirements.txt
```

2. Configure AWS credentials:
- Set up AWS credentials with Bedrock access
- Provide the access key ID and secret access key in the configuration (if omitted, the default credentials provider is used)

## Configuration

The extension can be configured through manifest.json properties:
- `base_uri`: Bedrock API endpoint
- `region`: AWS region for Bedrock
- `access_key_id`: AWS access key ID
- `secret_access_key`: AWS secret access key
- `model`: Bedrock Nova model ID (for example `us.amazon.nova-lite-v1:0`)
- `language`: Language code for STT/TTS
- See manifest.json for full configuration options

## Input Truncation Logic

The extension implements smart input truncation:

1. Duration-based truncation:
- Automatically truncates input exceeding 30 seconds

2. Silence-based truncation:
- Triggers when silence exceeds 2 seconds

3. Manual truncation:
- Supports user-initiated truncation

## Architecture

1. Audio Processing:
- Uses TEN framework's STT plugin for audio recognition
- Buffers and processes audio in real-time
- Provides intermediate and final transcripts

2. Nova Model Integration:
- Combines transcribed text with video input
- Sends to Bedrock's Nova model for processing
- Handles responses and error conditions

3. Speech Synthesis:
- Converts Nova model responses to speech
- Uses TEN framework's TTS plugin
- Synchronizes with video output

## API Usage

### Commands

1. Flush Command:
```python
cmd = Cmd.create("flush")
await ten_env.send_cmd(cmd)
```

2. User Events:
```python
# User joined
cmd = Cmd.create("on_user_joined")
await ten_env.send_cmd(cmd)

# User left
cmd = Cmd.create("on_user_left")
await ten_env.send_cmd(cmd)
```

## Contributing
1. Fork the repository
2. Create a feature branch
3. Submit a pull request
7 changes: 6 additions & 1 deletion agents/ten_packages/extension/bedrock_llm_python/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
from . import bedrock_llm_extension
#
# This file is part of TEN Framework, an open source project.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for more information.
#
from . import addon
18 changes: 18 additions & 0 deletions agents/ten_packages/extension/bedrock_llm_python/addon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#
# This file is part of TEN Framework, an open source project.
# Licensed under the Apache License, Version 2.0.
# See the LICENSE file for more information.
#
from ten import (
Addon,
register_addon_as_extension,
TenEnv,
)
from .extension import BedrockLLMExtension


@register_addon_as_extension("bedrock_llm_python")
class LLMExtensionExtensionAddon(Addon):
    """Addon, registered under the name ``bedrock_llm_python``, that creates
    ``BedrockLLMExtension`` instances."""

    def on_create_instance(self, ten_env: TenEnv, name: str, context) -> None:
        """Build a ``BedrockLLMExtension`` and report it as created.

        Args:
            ten_env: Environment handle used for logging and for signalling
                that instance creation is done.
            name: Name passed to the ``BedrockLLMExtension`` constructor.
            context: Opaque value forwarded unchanged to
                ``on_create_instance_done``.
        """
        ten_env.log_info("on_create_instance")
        extension = BedrockLLMExtension(name)
        ten_env.on_create_instance_done(extension, context)
Loading

0 comments on commit 984265d

Please sign in to comment.