feat(minimax): add streaming word-level subtitles and simplify API #3159
base: main
```diff
@@ -89,7 +89,7 @@ class MiniMaxHttpTTSService(TTSService):
     """Text-to-speech service using MiniMax's T2A (Text-to-Audio) API.

     Provides streaming text-to-speech synthesis using MiniMax's HTTP API
-    with support for various voice settings, emotions, and audio configurations.
+    with support for various voice settings and audio configurations.
     Supports real-time audio streaming with configurable voice parameters.

     Platform documentation:
```
```diff
@@ -105,8 +105,6 @@ class InputParams(BaseModel):
             speed: Speech speed (range: 0.5 to 2.0).
             volume: Speech volume (range: 0 to 10).
             pitch: Pitch adjustment (range: -12 to 12).
-            emotion: Emotional tone (options: "happy", "sad", "angry", "fearful",
-                "disgusted", "surprised", "calm", "fluent").
             english_normalization: Deprecated; use `text_normalization` instead

                 .. deprecated:: 0.0.96
```
```diff
@@ -116,17 +114,18 @@
             text_normalization: Enable text normalization (Chinese/English).
             latex_read: Enable LaTeX formula reading.
             exclude_aggregated_audio: Whether to exclude aggregated audio in final chunk.
+            subtitle_enable: Enable subtitle generation with word-level timestamps.
         """

         language: Optional[Language] = Language.EN
         speed: Optional[float] = 1.0
         volume: Optional[float] = 1.0
         pitch: Optional[int] = 0
-        emotion: Optional[str] = None
         english_normalization: Optional[bool] = None  # Deprecated
         text_normalization: Optional[bool] = None
         latex_read: Optional[bool] = None
         exclude_aggregated_audio: Optional[bool] = None
+        subtitle_enable: Optional[bool] = None

     def __init__(
         self,
```
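For orientation, here is a minimal usage sketch of the new `subtitle_enable` flag. Only the `InputParams` fields come from this diff; the import paths and the constructor arguments (`api_key`, `group_id`, `voice_id`, `aiohttp_session`) are assumed to match the existing service signature and are untouched by this PR.

```python
import aiohttp

# Import paths assumed from the existing pipecat layout; not part of this diff.
from pipecat.services.minimax.tts import MiniMaxHttpTTSService
from pipecat.transcriptions.language import Language


async def make_tts(session: aiohttp.ClientSession) -> MiniMaxHttpTTSService:
    # Request word-level subtitles alongside the usual voice parameters.
    params = MiniMaxHttpTTSService.InputParams(
        language=Language.EN,
        speed=1.0,
        subtitle_enable=True,  # new flag introduced in this PR
    )
    # Constructor arguments below are assumptions about the existing signature.
    return MiniMaxHttpTTSService(
        api_key="...",
        group_id="...",
        voice_id="...",
        aiohttp_session=session,
        params=params,
    )
```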
```diff
@@ -196,26 +195,6 @@ def __init__(
         if service_lang:
             self._settings["language_boost"] = service_lang

-        # Add optional emotion if provided
-        if params.emotion:
-            # Validate emotion is in the supported list
-            supported_emotions = [
-                "happy",
-                "sad",
-                "angry",
-                "fearful",
-                "disgusted",
-                "surprised",
-                "neutral",
-                "fluent",
-            ]
-            if params.emotion in supported_emotions:
-                self._settings["voice_setting"]["emotion"] = params.emotion
-            else:
-                logger.warning(
-                    f"Unsupported emotion: {params.emotion}. Supported emotions: {supported_emotions}"
-                )
-
         # If `english_normalization`, add `text_normalization` and print warning
         if params.english_normalization is not None:
             import warnings
```

Contributor (markbackman), on the removed `emotion` block: Why are you removing `emotion`? This feature was just added in the last release. Removing it is a breaking change. You should instead deprecate and warn about it if it's no longer relevant.

Author: Hey markbackman, if we add `emotion` to the request body, the latency becomes very high.
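One possible middle ground for the discussion above, sketched purely as an illustration: keep accepting `emotion` but emit a deprecation warning, mirroring how this same `__init__` already handles `english_normalization`. The warning text and removal timeline are placeholders, not code from this PR.

```python
# Sketch of a deprecation path instead of outright removal (not PR code).
if params.emotion is not None:
    import warnings

    warnings.warn(
        "`emotion` is deprecated because it adds significant request latency; "
        "it will be removed in a future release.",
        DeprecationWarning,
        stacklevel=2,
    )
    # Keep honoring the value so existing pipelines don't break.
    self._settings["voice_setting"]["emotion"] = params.emotion
```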
```diff
@@ -236,6 +215,18 @@ def __init__(
         if params.latex_read is not None:
             self._settings["voice_setting"]["latex_read"] = params.latex_read

+        # Add subtitle settings if provided
+        if params.subtitle_enable is not None:
+            self._settings["subtitle_enable"] = params.subtitle_enable
+
+            # Always use word-level timestamps for streaming subtitles
+            if params.subtitle_enable:
+                self._settings["subtitle_type"] = "word"
+            else:
+                logger.info(
+                    "Subtitle generation is disabled. No word-level timestamps will be provided."
+                )
+
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.
```
```diff
@@ -357,6 +348,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
                     if not chunk_data:
                         continue

+                    # Check for subtitle file (if subtitle generation is enabled)
+                    subtitle_file = chunk_data.get("subtitle_file")
+                    if subtitle_file:
+                        logger.info(
+                            f"Subtitle file available: {subtitle_file}",
+                            extra={"trace_id": self._current_trace_id, "subtitle_url": subtitle_file},
+                        )
+
                     audio_data = chunk_data.get("audio")
                     if not audio_data:
                         continue
```
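Beyond logging, a consumer of `subtitle_file` would presumably fetch and decode it. The sketch below is assumption-heavy: `self._session` as the service's aiohttp session, a JSON body, and the `_handle_subtitles()` helper (see the sketch after the reviewer comment below) are all hypothetical; only the `subtitle_file` field itself appears in this diff.

```python
# Hypothetical follow-up to the logging above (not PR code): fetch the subtitle
# file and forward its entries to a word-timestamp handler.
if subtitle_file:
    async with self._session.get(subtitle_file) as resp:
        if resp.status == 200:
            # content_type=None: the file may not be served as application/json.
            subtitle_entries = await resp.json(content_type=None)
            await self._handle_subtitles(subtitle_entries)  # hypothetical helper
        else:
            logger.warning(
                f"Could not fetch subtitle file ({resp.status}): {subtitle_file}"
            )
```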
Reviewer: Word timestamps require a broader change in this class. The subclass needs to change to reflect the change. This would become a `WordTTSService` and would need to implement `add_word_timestamps`, etc. to work with the subclass.
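A rough sketch of the direction that comment describes, under two explicit assumptions: that pipecat's `WordTTSService` exposes an async `add_word_timestamps()` taking `(word, seconds)` pairs as its other word-timestamp TTS services do, and that each MiniMax subtitle entry carries a `text` and a `time_begin` in milliseconds. Neither is confirmed by this PR.

```python
from typing import Any, Dict, List, Tuple

# Assumed import; WordTTSService is the base class the reviewer refers to.
from pipecat.services.tts_service import WordTTSService


class MiniMaxHttpTTSService(WordTTSService):  # instead of TTSService
    async def _handle_subtitles(self, subtitle_entries: List[Dict[str, Any]]) -> None:
        # Convert MiniMax word timings (assumed to be in milliseconds) into
        # pipecat's (word, seconds) pairs and feed them to the base class.
        word_times: List[Tuple[str, float]] = [
            (entry["text"], entry["time_begin"] / 1000.0) for entry in subtitle_entries
        ]
        await self.add_word_timestamps(word_times)
```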