@@ -103,6 +103,27 @@ def __post_init__(self):
103
103
self .validate ()
104
104
105
105
106
+ @dataclass
107
+ class StyleConfig :
108
+ """
109
+ Style configuration for Azure TTS neural voices.
110
+
111
+ Args:
112
+ style: Speaking style for neural voices. Examples: "cheerful", "sad", "angry", etc.
113
+ degree: Intensity of the style, from 0.1 to 2.0.
114
+ """
115
+
116
+ style : str
117
+ degree : float | None = None
118
+
119
+ def validate (self ) -> None :
120
+ if self .degree is not None and not 0.1 <= self .degree <= 2.0 :
121
+ raise ValueError ("Style degree must be between 0.1 and 2.0" )
122
+
123
+ def __post_init__ (self ):
124
+ self .validate ()
125
+
126
+
106
127
@dataclass
107
128
class _TTSOptions :
108
129
sample_rate : int
@@ -121,6 +142,7 @@ class _TTSOptions:
121
142
# See https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#adjust-prosody
122
143
prosody : ProsodyConfig | None = None
123
144
speech_endpoint : str | None = None
145
+ style : StyleConfig | None = None
124
146
125
147
126
148
class TTS (tts .TTS ):
@@ -136,6 +158,7 @@ def __init__(
136
158
speech_host : str | None = None ,
137
159
speech_auth_token : str | None = None ,
138
160
endpoint_id : str | None = None ,
161
+ style : StyleConfig | None = None ,
139
162
) -> None :
140
163
"""
141
164
Create a new instance of Azure TTS.
@@ -176,6 +199,9 @@ def __init__(
176
199
if prosody :
177
200
prosody .validate ()
178
201
202
+ if style :
203
+ style .validate ()
204
+
179
205
self ._opts = _TTSOptions (
180
206
sample_rate = sample_rate ,
181
207
speech_key = speech_key ,
@@ -186,6 +212,7 @@ def __init__(
186
212
endpoint_id = endpoint_id ,
187
213
language = language ,
188
214
prosody = prosody ,
215
+ style = style ,
189
216
)
190
217
191
218
def update_options (
@@ -194,10 +221,12 @@ def update_options(
194
221
voice : str | None = None ,
195
222
language : str | None = None ,
196
223
prosody : ProsodyConfig | None = None ,
224
+ style : StyleConfig | None = None ,
197
225
) -> None :
198
226
self ._opts .voice = voice or self ._opts .voice
199
227
self ._opts .language = language or self ._opts .language
200
228
self ._opts .prosody = prosody or self ._opts .prosody
229
+ self ._opts .style = style or self ._opts .style
201
230
202
231
def synthesize (
203
232
self ,
@@ -234,22 +263,44 @@ async def _run(self):
234
263
)
235
264
236
265
def _synthesize() -> speechsdk.SpeechSynthesisResult:
    """
    Issue one synthesis request and wait for its result.

    When neither prosody nor style is configured, the input is sent as plain
    text. Otherwise an SSML document is built: ``<speak>`` > ``<voice>`` >
    optional ``<mstts:express-as>`` > optional ``<prosody>`` > text.

    The text and all attribute values are XML-escaped so that characters such
    as ``&``, ``<`` or ``"`` in the input cannot produce malformed SSML; this
    keeps the SSML path consistent with the plain-text path, which treats the
    input as literal text.

    Returns:
        The result obtained by calling ``.get()`` on the synthesizer's
        asynchronous speak call.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape, quoteattr

    opts = self._opts
    prosody = opts.prosody
    style = opts.style

    if not (prosody or style):
        return synthesizer.speak_text_async(self.input_text).get()  # type: ignore

    # quoteattr() returns the value already wrapped in quotes and escaped.
    parts = [
        '<speak version="1.0" '
        'xmlns="http://www.w3.org/2001/10/synthesis" '
        'xmlns:mstts="http://www.w3.org/2001/mstts" '
        f"xml:lang={quoteattr(str(opts.language or 'en-US'))}>",
        f"<voice name={quoteattr(str(opts.voice))}>",
    ]

    # Add style if specified
    if style:
        express_as = f"<mstts:express-as style={quoteattr(str(style.style))}"
        if style.degree is not None:
            express_as += f" styledegree={quoteattr(str(style.degree))}"
        parts.append(express_as + ">")

    # Add prosody if specified; only attributes that are set are emitted.
    if prosody:
        prosody_tag = "<prosody"
        if prosody.rate:
            prosody_tag += f" rate={quoteattr(str(prosody.rate))}"
        if prosody.volume:
            prosody_tag += f" volume={quoteattr(str(prosody.volume))}"
        if prosody.pitch:
            prosody_tag += f" pitch={quoteattr(str(prosody.pitch))}"
        parts.append(prosody_tag + ">")

    # Escape the text so it is spoken literally rather than parsed as markup.
    parts.append(escape(self._input_text))

    # Close the elements in reverse order of opening.
    if prosody:
        parts.append("</prosody>")
    if style:
        parts.append("</mstts:express-as>")
    parts.append("</voice></speak>")

    ssml = "".join(parts)
    return synthesizer.speak_ssml_async(ssml).get()  # type: ignore
0 commit comments