Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 23 additions & 24 deletions src/pipecat/services/minimax/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class MiniMaxHttpTTSService(TTSService):
"""Text-to-speech service using MiniMax's T2A (Text-to-Audio) API.

Provides streaming text-to-speech synthesis using MiniMax's HTTP API
with support for various voice settings, emotions, and audio configurations.
with support for various voice settings and audio configurations.
Supports real-time audio streaming with configurable voice parameters.

Platform documentation:
Expand All @@ -105,8 +105,6 @@ class InputParams(BaseModel):
speed: Speech speed (range: 0.5 to 2.0).
volume: Speech volume (range: 0 to 10).
pitch: Pitch adjustment (range: -12 to 12).
emotion: Emotional tone (options: "happy", "sad", "angry", "fearful",
"disgusted", "surprised", "calm", "fluent").
english_normalization: Deprecated; use `text_normalization` instead

.. deprecated:: 0.0.96
Expand All @@ -116,17 +114,18 @@ class InputParams(BaseModel):
text_normalization: Enable text normalization (Chinese/English).
latex_read: Enable LaTeX formula reading.
exclude_aggregated_audio: Whether to exclude aggregated audio in final chunk.
subtitle_enable: Enable subtitle generation with word-level timestamps.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Word timestamps require a broader change in this class. The subclass needs to change accordingly: this would become a WordTTSService and would need to implement add_word_timestamps, etc., to work with the subclass.

"""

language: Optional[Language] = Language.EN
speed: Optional[float] = 1.0
volume: Optional[float] = 1.0
pitch: Optional[int] = 0
emotion: Optional[str] = None
english_normalization: Optional[bool] = None # Deprecated
text_normalization: Optional[bool] = None
latex_read: Optional[bool] = None
exclude_aggregated_audio: Optional[bool] = None
subtitle_enable: Optional[bool] = None

def __init__(
self,
Expand Down Expand Up @@ -196,26 +195,6 @@ def __init__(
if service_lang:
self._settings["language_boost"] = service_lang

# Add optional emotion if provided
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are you removing emotion? This feature was just added in the last release. Removing it is a breaking change. You should instead deprecate it and warn about it if it's no longer relevant.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey markbackman, if we add emotion to the request body, the latency will be very high.

if params.emotion:
# Validate emotion is in the supported list
supported_emotions = [
"happy",
"sad",
"angry",
"fearful",
"disgusted",
"surprised",
"neutral",
"fluent",
]
if params.emotion in supported_emotions:
self._settings["voice_setting"]["emotion"] = params.emotion
else:
logger.warning(
f"Unsupported emotion: {params.emotion}. Supported emotions: {supported_emotions}"
)

# If `english_normalization`, add `text_normalization` and print warning
if params.english_normalization is not None:
import warnings
Expand All @@ -236,6 +215,18 @@ def __init__(
if params.latex_read is not None:
self._settings["voice_setting"]["latex_read"] = params.latex_read

# Add subtitle settings if provided
if params.subtitle_enable is not None:
self._settings["subtitle_enable"] = params.subtitle_enable

# Always use word-level timestamps for streaming subtitles
if params.subtitle_enable:
self._settings["subtitle_type"] = "word"
else:
logger.info(
"Subtitle generation is disabled. No word-level timestamps will be provided."
)

def can_generate_metrics(self) -> bool:
"""Check if this service can generate processing metrics.

Expand Down Expand Up @@ -357,6 +348,14 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
if not chunk_data:
continue

# Check for subtitle file (if subtitle generation is enabled)
subtitle_file = chunk_data.get("subtitle_file")
if subtitle_file:
logger.info(
f"Subtitle file available: {subtitle_file}",
extra={"trace_id": self._current_trace_id, "subtitle_url": subtitle_file},
)

audio_data = chunk_data.get("audio")
if not audio_data:
continue
Expand Down