Skip to content

Commit ab0df80

Browse files
Plugin azure add voice styles (#1379)
Co-authored-by: David Zhao <[email protected]>
1 parent c62738f commit ab0df80

File tree

1 file changed

+67
-16
lines changed
  • livekit-plugins/livekit-plugins-azure/livekit/plugins/azure

1 file changed

+67
-16
lines changed

livekit-plugins/livekit-plugins-azure/livekit/plugins/azure/tts.py

+67-16
Original file line number | Diff line number | Diff line change
@@ -103,6 +103,27 @@ def __post_init__(self):
103103
self.validate()
104104

105105

106+
@dataclass
107+
class StyleConfig:
108+
"""
109+
Style configuration for Azure TTS neural voices.
110+
111+
Args:
112+
style: Speaking style for neural voices. Examples: "cheerful", "sad", "angry", etc.
113+
degree: Intensity of the style, from 0.1 to 2.0.
114+
"""
115+
116+
style: str
117+
degree: float | None = None
118+
119+
def validate(self) -> None:
120+
if self.degree is not None and not 0.1 <= self.degree <= 2.0:
121+
raise ValueError("Style degree must be between 0.1 and 2.0")
122+
123+
def __post_init__(self):
124+
self.validate()
125+
126+
106127
@dataclass
107128
class _TTSOptions:
108129
sample_rate: int
@@ -121,6 +142,7 @@ class _TTSOptions:
121142
# See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#adjust-prosody
122143
prosody: ProsodyConfig | None = None
123144
speech_endpoint: str | None = None
145+
style: StyleConfig | None = None
124146

125147

126148
class TTS(tts.TTS):
@@ -136,6 +158,7 @@ def __init__(
136158
speech_host: str | None = None,
137159
speech_auth_token: str | None = None,
138160
endpoint_id: str | None = None,
161+
style: StyleConfig | None = None,
139162
) -> None:
140163
"""
141164
Create a new instance of Azure TTS.
@@ -176,6 +199,9 @@ def __init__(
176199
if prosody:
177200
prosody.validate()
178201

202+
if style:
203+
style.validate()
204+
179205
self._opts = _TTSOptions(
180206
sample_rate=sample_rate,
181207
speech_key=speech_key,
@@ -186,6 +212,7 @@ def __init__(
186212
endpoint_id=endpoint_id,
187213
language=language,
188214
prosody=prosody,
215+
style=style,
189216
)
190217

191218
def update_options(
@@ -194,10 +221,12 @@ def update_options(
194221
voice: str | None = None,
195222
language: str | None = None,
196223
prosody: ProsodyConfig | None = None,
224+
style: StyleConfig | None = None,
197225
) -> None:
198226
self._opts.voice = voice or self._opts.voice
199227
self._opts.language = language or self._opts.language
200228
self._opts.prosody = prosody or self._opts.prosody
229+
self._opts.style = style or self._opts.style
201230

202231
def synthesize(
203232
self,
@@ -234,22 +263,44 @@ async def _run(self):
234263
)
235264

236265
def _synthesize() -> speechsdk.SpeechSynthesisResult:
237-
if self._opts.prosody:
238-
ssml = f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{self._opts.language or "en-US"}">'
239-
prosody_ssml = f'<voice name="{self._opts.voice}"><prosody'
240-
if self._opts.prosody.rate:
241-
prosody_ssml += f' rate="{self._opts.prosody.rate}"'
242-
243-
if self._opts.prosody.volume:
244-
prosody_ssml += f' volume="{self._opts.prosody.volume}"'
245-
246-
if self._opts.prosody.pitch:
247-
prosody_ssml += f' pitch="{self._opts.prosody.pitch}"'
248-
249-
prosody_ssml += ">"
250-
ssml += prosody_ssml
251-
ssml += self._input_text
252-
ssml += "</prosody></voice></speak>"
266+
if self._opts.prosody or self._opts.style:
267+
ssml = (
268+
'<speak version="1.0" '
269+
'xmlns="http://www.w3.org/2001/10/synthesis" '
270+
'xmlns:mstts="http://www.w3.org/2001/mstts" '
271+
f'xml:lang="{self._opts.language or "en-US"}">'
272+
)
273+
ssml += f'<voice name="{self._opts.voice}">'
274+
275+
# Add style if specified
276+
if self._opts.style:
277+
style_degree = (
278+
f' styledegree="{self._opts.style.degree}"'
279+
if self._opts.style.degree
280+
else ""
281+
)
282+
ssml += f'<mstts:express-as style="{self._opts.style.style}"{style_degree}>'
283+
284+
# Add prosody if specified
285+
if self._opts.prosody:
286+
ssml += "<prosody"
287+
if self._opts.prosody.rate:
288+
ssml += f' rate="{self._opts.prosody.rate}"'
289+
if self._opts.prosody.volume:
290+
ssml += f' volume="{self._opts.prosody.volume}"'
291+
if self._opts.prosody.pitch:
292+
ssml += f' pitch="{self._opts.prosody.pitch}"'
293+
ssml += ">"
294+
ssml += self._input_text
295+
ssml += "</prosody>"
296+
else:
297+
ssml += self._input_text
298+
299+
# Close style tag if it was opened
300+
if self._opts.style:
301+
ssml += "</mstts:express-as>"
302+
303+
ssml += "</voice></speak>"
253304
return synthesizer.speak_ssml_async(ssml).get() # type: ignore
254305

255306
return synthesizer.speak_text_async(self.input_text).get() # type: ignore

0 commit comments

Comments
 (0)