Skip to content

Commit df9cfbb

Browse files
zaptrem and ajar98 authored
azure: stop specifying english for multilingual (#302)
* azure: stop specifying english for multilingual
* adds a `language_code` parameter to the synthesizer config
* splits on the voice_name to figure out the top-level language, then uses the `language_code` to house the message
* move reading data into an ephemeral thread

Co-authored-by: Ajay Raj <[email protected]>
1 parent 5642e02 commit df9cfbb

File tree

2 files changed: +13 additions, −5 deletions

vocode/streaming/models/synthesizer.py

+1
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE.value
     voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
     pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH
     rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE
+    language_code: str = "en-US"


 DEFAULT_GOOGLE_LANGUAGE_CODE = "en-US"

vocode/streaming/synthesizer/azure_synthesizer.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -169,12 +169,18 @@ def word_boundary_cb(self, evt, pool):
     def create_ssml(
         self, message: str, bot_sentiment: Optional[BotSentiment] = None
     ) -> str:
+        voice_language_code = self.synthesizer_config.voice_name[:5]
         ssml_root = ElementTree.fromstring(
-            '<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="en-US"></speak>'
+            f'<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="{voice_language_code}"></speak>'
         )
         voice = ElementTree.SubElement(ssml_root, "voice")
         voice.set("name", self.voice_name)
-        voice_root = voice
+        if self.synthesizer_config.language_code != "en-US":
+            lang = ElementTree.SubElement(voice, "{%s}lang" % NAMESPACES.get(""))
+            lang.set("xml:lang", self.synthesizer_config.language_code)
+            voice_root = lang
+        else:
+            voice_root = voice
         if bot_sentiment and bot_sentiment.emotion:
             styled = ElementTree.SubElement(
                 voice, "{%s}express-as" % NAMESPACES.get("mstts")
@@ -247,9 +253,10 @@ async def chunk_generator(
             audio_data_stream: speechsdk.AudioDataStream, chunk_transform=lambda x: x
         ):
             audio_buffer = bytes(chunk_size)
-            while not audio_data_stream.can_read_data(chunk_size):
-                await asyncio.sleep(0)
-            filled_size = audio_data_stream.read_data(audio_buffer)
+            filled_size = await asyncio.get_event_loop().run_in_executor(
+                self.thread_pool_executor,
+                lambda: audio_data_stream.read_data(audio_buffer),
+            )
             if filled_size != chunk_size:
                 yield SynthesisResult.ChunkResult(
                     chunk_transform(audio_buffer[offset:]), True

0 commit comments

Comments (0)