From 3d232fdf21619562792b408b8dd50e15bd2a08e0 Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Feb 2023 13:44:28 +0000 Subject: [PATCH 1/6] add speechmatics --- README.rst | 8 +++++++ examples/audio_transcribe.py | 10 +++++++++ speech_recognition/__init__.py | 41 ++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/README.rst b/README.rst index f9bde14e..3e1470d1 100644 --- a/README.rst +++ b/README.rst @@ -39,6 +39,7 @@ Speech recognition engine/API support: * `Tensorflow `__ * `Vosk API `__ (works offline) * `OpenAI whisper `__ (works offline) +* `Speechmatics ASR API `__ **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details. @@ -95,6 +96,7 @@ To use all of the functionality of the library, you should have: * **FLAC encoder** (required only if the system is not x86-based Windows/Linux/OS X) * **Vosk** (required only if you need to use Vosk API speech recognition ``recognizer_instance.recognize_vosk``) * **Whisper** (required only if you need to use Whisper ``recognizer_instance.recognize_whisper``) +* **Speechmatics** (required only if you need to use Speechmatics ``recognizer_instance.recognize_speechmatics``) The following requirements are optional, but can improve or extend functionality in some situations: @@ -169,6 +171,12 @@ Whisper is **required if and only if you want to use whisper** (``recognizer_ins You can install it with ``python3 -m pip install git+https://github.com/openai/whisper.git soundfile``. +Speechmatics (for Speechmatics users) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Speechmatics is **required if and only if you want to use speechmatics** (``recognizer_instance.recognize_speechmatics``). + +You can install it with ``python3 -m pip install speechmatics-python``. 
+ Troubleshooting --------------- diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 7806023f..5d91b25b 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -13,6 +13,16 @@ with sr.AudioFile(AUDIO_FILE) as source: audio = r.record(source) # read the entire audio file +# recognize speech using Speechmatics Speech to Text +API_KEY = "INSERT API KEY HERE" +try: + print("Speechmatics thinks you said " + r.recognize_ibm(audio, key=API_KEY)) +except sr.UnknownValueError: + print("Speechmatics could not understand audio") +except sr.RequestError as e: + print("Could not request results from the Speechmatics service; {0}".format(e)) + + # recognize speech using Sphinx try: print("Sphinx thinks you said " + r.recognize_sphinx(audio)) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 66ebc04c..ecc0bebb 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1702,6 +1702,47 @@ def recognize_vosk(self, audio_data, language='en'): return finalRecognition + def recognize_speechmatics(self, audio_data, key=None, language="en", transcript_format="txt"): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Speechmatics ASR + + The key value is your speechmatics API key. You can get an API key by creating an account and signing into the portal at https://portal.speechmatics.com/manage-access/. + + The recognition language is determined by ``language``, an RFC5646 language tag like "en" or "es". The full list of supported languages can be found at https://docs.speechmatics.com/introduction/supported-languages. + + Returns a text representation of the transcript by default. You can alson get a json representation of the transcript by setting transcript_format='json-v2', which comes with a range of meta-data about each word in the transcript. The full transcript schema is documented here: https://docs.speechmatics.com/features. 
You can also request an SRT format by setting `format='srt'` + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(key, str), "``key`` must be a string" + + try: + from speechmatics.models import ConnectionSettings, BatchTranscriptionConfig + from speechmatics.batch_client import BatchClient + from speechmatics.constants import BATCH_SELF_SERVICE_URL + except: + raise RequestError("missing speechmatics python module: install using `pip install speechmatics-python`") + + wav_data = audio_data.get_wav_data( + convert_rate=None if audio_data.sample_rate >= 16000 else 16000 # audio samples must be at least 16 kHz + ) + audio_input = ("audio_file.wav", wav_data) + settings = ConnectionSettings( + url=BATCH_SELF_SERVICE_URL, + auth_token=key, + ) + conf = BatchTranscriptionConfig( + language=language, + ) + with BatchClient(settings) as client: + job_id = client.submit_job( + audio=audio_input, + transcription_config=conf, + ) + transcript = client.wait_for_completion(job_id, transcription_format=transcript_format) + return transcript + def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" flac_converter = shutil_which("flac") # check for installed version first From cce0869c83642c5dde42c00bdf926c51a10aca07 Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Feb 2023 13:58:49 +0000 Subject: [PATCH 2/6] add examples and tests --- examples/audio_transcribe.py | 6 +++--- examples/extended_results.py | 10 ++++++++++ examples/microphone_recognition.py | 9 +++++++++ tests/test_recognition.py | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git 
a/examples/audio_transcribe.py b/examples/audio_transcribe.py index 5d91b25b..aab2be8b 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -13,10 +13,10 @@ with sr.AudioFile(AUDIO_FILE) as source: audio = r.record(source) # read the entire audio file -# recognize speech using Speechmatics Speech to Text -API_KEY = "INSERT API KEY HERE" +# recognize speech using Speechmatics +SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" try: - print("Speechmatics thinks you said " + r.recognize_ibm(audio, key=API_KEY)) + print("Speechmatics thinks you said " + r.recognize_ibm(audio, key=SPEECHMATICS_KEY)) except sr.UnknownValueError: print("Speechmatics could not understand audio") except sr.RequestError as e: diff --git a/examples/extended_results.py b/examples/extended_results.py index 599c67f2..117f519b 100644 --- a/examples/extended_results.py +++ b/examples/extended_results.py @@ -16,6 +16,16 @@ with sr.AudioFile(AUDIO_FILE) as source: audio = r.record(source) # read the entire audio file +# recognize speech using Speechmatics +SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" +try: + print("Speechmatics results:") + pprint(r.recognize_sphinx(audio, key=SPEECHMATICS_KEY, transcript_format="json-v2")) +except sr.UnknownValueError: + print("Speechmatics could not understand audio") +except sr.RequestError as e: + print("Speechmatics error; {0}".format(e)) + # recognize speech using Sphinx try: print("Sphinx thinks you said " + r.recognize_sphinx(audio)) diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py index 56168b29..863abe87 100644 --- a/examples/microphone_recognition.py +++ b/examples/microphone_recognition.py @@ -10,6 +10,15 @@ print("Say something!") audio = r.listen(source) +# recognize speech using Speechmatics +SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" +try: + print("Speechmatics thinks you said " + r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY)) +except sr.UnknownValueError: + 
print("Speechmatics could not understand audio") +except sr.RequestError as e: + print("Could not request results from Speechmatics service; {0}".format(e)) + # recognize speech using Sphinx try: print("Sphinx thinks you said " + r.recognize_sphinx(audio)) diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 5759d657..7e86ea94 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -34,6 +34,24 @@ def test_google_chinese(self): with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) self.assertEqual(r.recognize_google(audio, language="zh-CN"), u"砸自己的脚") + @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable") + def test_speechmatics_english(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source) + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"]), "One, two, three.") + + @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable") + def test_speechmatics_french(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"], language="fr"), u"Essaye la dictée numéro un.") + + @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable") + def test_speechmatics_mandarin(self): + r = sr.Recognizer() + with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source) + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"], language="cmn"), u"砸自己的脚。") + @unittest.skipUnless("WIT_AI_KEY" in os.environ, "requires Wit.ai key to be specified in WIT_AI_KEY environment variable") def test_wit_english(self): r = sr.Recognizer() From 
e371938c2d46d036b7bc570fd101dc6df7e1c51d Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Feb 2023 14:32:38 +0000 Subject: [PATCH 3/6] fix copy-paste errors --- README.rst | 2 +- examples/audio_transcribe.py | 2 +- examples/extended_results.py | 2 +- tests/test_recognition.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 3e1470d1..604bc1a4 100644 --- a/README.rst +++ b/README.rst @@ -175,7 +175,7 @@ Speechmatics (for Speechmatics users) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ Speechmatics is **required if and only if you want to use speechmatics** (``recognizer_instance.recognize_speechmatics``). -You can install it with ``python3 -m pip install speechmatics-python``. +You can install it with ``python3 -m pip install speechmatics-python``. You will also need an API key from `the Speechmatics portal <https://portal.speechmatics.com/manage-access/>`__. Troubleshooting --------------- diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py index aab2be8b..0736b02a 100644 --- a/examples/audio_transcribe.py +++ b/examples/audio_transcribe.py @@ -16,7 +16,7 @@ # recognize speech using Speechmatics SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" try: - print("Speechmatics thinks you said " + r.recognize_ibm(audio, key=SPEECHMATICS_KEY)) + print("Speechmatics thinks you said " + r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY)) except sr.UnknownValueError: print("Speechmatics could not understand audio") except sr.RequestError as e: diff --git a/examples/extended_results.py b/examples/extended_results.py index 117f519b..c848212b 100644 --- a/examples/extended_results.py +++ b/examples/extended_results.py @@ -20,7 +20,7 @@ SPEECHMATICS_KEY = "INSERT SPEECHMATICS API KEY HERE" try: print("Speechmatics results:") - pprint(r.recognize_sphinx(audio, key=SPEECHMATICS_KEY, transcript_format="json-v2")) + pprint(r.recognize_speechmatics(audio, key=SPEECHMATICS_KEY, transcript_format="json-v2")) except sr.UnknownValueError: print("Speechmatics could not understand audio") 
except sr.RequestError as e: diff --git a/tests/test_recognition.py b/tests/test_recognition.py index 7e86ea94..96fade84 100644 --- a/tests/test_recognition.py +++ b/tests/test_recognition.py @@ -44,7 +44,7 @@ def test_speechmatics_english(self): def test_speechmatics_french(self): r = sr.Recognizer() with sr.AudioFile(self.AUDIO_FILE_FR) as source: audio = r.record(source) - self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"], language="fr"), u"Essaye la dictée numéro un.") + self.assertEqual(r.recognize_speechmatics(audio, key=os.environ["SPEECHMATICS_KEY"], language="fr"), u"C'est la dictée numéro un.") @unittest.skipUnless("SPEECHMATICS_KEY" in os.environ, "requires Speechmatics key to be specified in SPEECHMATICS_KEY environment variable") def test_speechmatics_mandarin(self): From 650adc3bafa5737bcc88a31a4dbfc6120b3d1551 Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Feb 2023 15:30:02 +0000 Subject: [PATCH 4/6] remove sample rate change --- speech_recognition/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index ecc0bebb..c836f257 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1724,9 +1724,7 @@ def recognize_speechmatics(self, audio_data, key=None, language="en", transcript except: raise RequestError("missing speechmatics python module: install using `pip install speechmatics-python`") - wav_data = audio_data.get_wav_data( - convert_rate=None if audio_data.sample_rate >= 16000 else 16000 # audio samples must be at least 16 kHz - ) + wav_data = audio_data.get_wav_data() audio_input = ("audio_file.wav", wav_data) settings = ConnectionSettings( url=BATCH_SELF_SERVICE_URL, From 800567ae7905544b459a251d37dc3fc3615f0d8b Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Feb 2023 15:35:03 +0000 Subject: [PATCH 5/6] add speechmatics to library-reference.rst --- 
reference/library-reference.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 7323bd9b..6ca27092 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -314,6 +314,19 @@ You can translate the result to english with Whisper by passing translate=True Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options +``recognize_speechmatics(self, audio_data, key=None, language="en", transcript_format="txt")`` +---------------------------------------------------------------------------------------------- + +Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Speechmatics ASR + +The key value is your speechmatics API key. You can get an API key by creating an account and signing into the portal at https://portal.speechmatics.com/manage-access/. + +The recognition language is determined by ``language``, an RFC5646 language tag like "en" or "es". The full list of supported languages can be found at https://docs.speechmatics.com/introduction/supported-languages. + +Returns a text representation of the transcript by default. You can alson get a json representation of the transcript by setting transcript_format='json-v2', which comes with a range of meta-data about each word in the transcript. The full transcript schema is documented here: https://docs.speechmatics.com/features. You can also request an SRT format by setting `format='srt'` + +Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. 
+ ``AudioSource`` --------------- From 8037f6aed15d66695459dc50ee7aa7b984dc2f8d Mon Sep 17 00:00:00 2001 From: Tudor Evans Date: Tue, 21 Feb 2023 15:37:43 +0000 Subject: [PATCH 6/6] add speechmatics to library-reference.rst --- reference/library-reference.rst | 2 +- speech_recognition/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reference/library-reference.rst b/reference/library-reference.rst index 6ca27092..c6bb2b43 100644 --- a/reference/library-reference.rst +++ b/reference/library-reference.rst @@ -325,7 +325,7 @@ The recognition language is determined by ``language``, an RFC5646 language tag Returns a text representation of the transcript by default. You can alson get a json representation of the transcript by setting transcript_format='json-v2', which comes with a range of meta-data about each word in the transcript. The full transcript schema is documented here: https://docs.speechmatics.com/features. You can also request an SRT format by setting `format='srt'` -Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. +Raises errors directly from the speechmatics-python package. Read more at https://speechmatics.github.io/speechmatics-python/exceptions.html. ``AudioSource`` --------------- diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index c836f257..2f3e62bb 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1712,7 +1712,7 @@ def recognize_speechmatics(self, audio_data, key=None, language="en", transcript Returns a text representation of the transcript by default. You can alson get a json representation of the transcript by setting transcript_format='json-v2', which comes with a range of meta-data about each word in the transcript. 
The full transcript schema is documented here: https://docs.speechmatics.com/features. You can also request an SRT format by setting `format='srt'` - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + Raises errors directly from the speechmatics-python package. Read more at https://speechmatics.github.io/speechmatics-python/exceptions.html. """ assert isinstance(audio_data, AudioData), "Data must be audio data" assert isinstance(key, str), "``key`` must be a string"