Skip to content

Commit df9cfbb

Browse files
zaptrem and ajar98 authored
azure: stop specifying english for multilingual (#302)
* azure: stop specifying english for multilingual
* adds a `language_code` parameter to the synthesizer config
* splits on the voice_name to figure out the top-level language, then uses the `language_code` to house the message
* move reading data into an ephemeral thread

Co-authored-by: Ajay Raj <[email protected]>
1 parent 5642e02 commit df9cfbb

File tree

2 files changed: +13 additions, −5 deletions

vocode/streaming/models/synthesizer.py

+1
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ class AzureSynthesizerConfig(SynthesizerConfig, type=SynthesizerType.AZURE.value
     voice_name: str = AZURE_SYNTHESIZER_DEFAULT_VOICE_NAME
     pitch: int = AZURE_SYNTHESIZER_DEFAULT_PITCH
     rate: int = AZURE_SYNTHESIZER_DEFAULT_RATE
+    language_code: str = "en-US"


 DEFAULT_GOOGLE_LANGUAGE_CODE = "en-US"

vocode/streaming/synthesizer/azure_synthesizer.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -169,12 +169,18 @@ def word_boundary_cb(self, evt, pool):
     def create_ssml(
         self, message: str, bot_sentiment: Optional[BotSentiment] = None
     ) -> str:
+        voice_language_code = self.synthesizer_config.voice_name[:5]
         ssml_root = ElementTree.fromstring(
-            '<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="en-US"></speak>'
+            f'<speak version="1.0" xmlns="https://www.w3.org/2001/10/synthesis" xml:lang="{voice_language_code}"></speak>'
         )
         voice = ElementTree.SubElement(ssml_root, "voice")
         voice.set("name", self.voice_name)
-        voice_root = voice
+        if self.synthesizer_config.language_code != "en-US":
+            lang = ElementTree.SubElement(voice, "{%s}lang" % NAMESPACES.get(""))
+            lang.set("xml:lang", self.synthesizer_config.language_code)
+            voice_root = lang
+        else:
+            voice_root = voice
         if bot_sentiment and bot_sentiment.emotion:
             styled = ElementTree.SubElement(
                 voice, "{%s}express-as" % NAMESPACES.get("mstts")
@@ -247,9 +253,10 @@ async def chunk_generator(
             audio_data_stream: speechsdk.AudioDataStream, chunk_transform=lambda x: x
         ):
             audio_buffer = bytes(chunk_size)
-            while not audio_data_stream.can_read_data(chunk_size):
-                await asyncio.sleep(0)
-            filled_size = audio_data_stream.read_data(audio_buffer)
+            filled_size = await asyncio.get_event_loop().run_in_executor(
+                self.thread_pool_executor,
+                lambda: audio_data_stream.read_data(audio_buffer),
+            )
             if filled_size != chunk_size:
                 yield SynthesisResult.ChunkResult(
                     chunk_transform(audio_buffer[offset:]), True

0 commit comments

Comments (0)