Uberi · ajsyp · Nov 8, 2022
diff --git a/README.rst b/README.rst
@@ -39,6 +39,7 @@ Speech recognition engine/API support:
 * `Tensorflow <https://www.tensorflow.org/>`__
 * `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
 * `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)
+* `Deepgram <https://www.deepgram.com>`__
 
 **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
 
@@ -377,6 +378,7 @@ Authors
     tb0hdan (Bohdan Turkynewych)
     Thynix <[email protected]> (Steve Dougherty)
     beeedy <[email protected]> (Broderick Carlin)
+    ajsyp <[email protected]> (Adam Sypniewski)
 
 Please report bugs and suggestions at the `issue tracker <https://github.com/Uberi/speech_recognition/issues>`__!
 

diff --git a/examples/audio_transcribe.py b/examples/audio_transcribe.py
@@ -87,3 +87,12 @@
     print("IBM Speech to Text could not understand audio")
 except sr.RequestError as e:
     print("Could not request results from IBM Speech to Text service; {0}".format(e))
+
+# recognize speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API SECRET HERE"  # Deepgram API secrets are 40-character lowercase hexadecimal strings.
+try:
+    print("Deepgram thinks you said " + r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET))
+except sr.UnknownValueError:
+    print("Deepgram could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Deepgram; {0}".format(e))
diff --git a/examples/extended_results.py b/examples/extended_results.py
@@ -87,3 +87,13 @@
     print("IBM Speech to Text could not understand audio")
 except sr.RequestError as e:
     print("Could not request results from IBM Speech to Text service; {0}".format(e))
+
+# recognize speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE"  # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+    print("Deepgram results:")
+    pprint(r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET, show_all=True))
+except sr.UnknownValueError:
+    print("Deepgram could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Deepgram; {0}".format(e))
diff --git a/examples/microphone_recognition.py b/examples/microphone_recognition.py
@@ -92,3 +92,12 @@
     print("Whisper could not understand audio")
 except sr.RequestError as e:
     print("Could not request results from Whisper")
+
+# recognize speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE"  # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+    print("Deepgram thinks you said " + r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET))
+except sr.UnknownValueError:
+    print("Deepgram could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Deepgram; {0}".format(e))
diff --git a/examples/special_recognizer_features.py b/examples/special_recognizer_features.py
@@ -44,3 +44,12 @@
     print("Google Cloud Speech could not understand audio")
 except sr.RequestError as e:
     print("Could not request results from Google Cloud Speech service; {0}".format(e))
+
+# boost keyword detection in speech using Deepgram Speech to Text
+DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE"  # Deepgram API key secrets are 40-character lowercase hexadecimal strings.
+try:
+    print("Deepgram thinks you said " + r.recognize_deepgram(audio_en, key=DEEPGRAM_API_SECRET, keywords=['elephant:10']))
+except sr.UnknownValueError:
+    print("Deepgram could not understand audio")
+except sr.RequestError as e:
+    print("Could not request results from Deepgram; {0}".format(e))
diff --git a/reference/library-reference.rst b/reference/library-reference.rst
@@ -314,6 +314,17 @@ You can translate the result to english with Whisper by passing translate=True
 
 Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options
 
+``recognizer_instance.recognize_deepgram(audio_data: AudioData, key: str, tier: Optional[str] = "enhanced", model: Optional[str] = "general", version: Optional[str] = "latest", language: Optional[str] = "en-US", detect_language: bool = False, punctuate: bool = True, profanity_filter: bool = False, redact: Optional[str] = None, diarize: bool = False, diarize_version: Optional[str] = None, ner: bool = True, multichannel: bool = False, alternatives: int = 1, numerals: bool = True, search: Optional[List[str]] = None, replace: Optional[Dict[str, str]] = None, keywords: Optional[List[str]] = None, paragraphs: bool = False, summarize: bool = False, detect_topics: bool = False, utterances: bool = False, utt_split: Optional[float] = None, show_all: bool = False) -> Union[str, Dict[str, Any]]``
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.
+
+Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram <https://console.deepgram.com>`__. The API secret is a 40-character hexadecimal string that can only be retrieved at creation time. Each secret is also identified by a UUID; that UUID is not the API secret and must not be passed as ``key``.
+
+If ``show_all`` is false (the default), returns the most likely transcript string; otherwise, returns the raw API JSON response.
+
+Details of the various features can be found in the `Deepgram Documentation <https://developers.deepgram.com/>`__.
+
 ``AudioSource``
 ---------------
 

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -1726,6 +1726,132 @@ def recognize_vosk(self, audio_data, language='en'):
 
         return finalRecognition
 
+    def recognize_deepgram(
+        self,
+        audio_data,
+        key,
+        tier='enhanced',
+        model='general',
+        version='latest',
+        language='en-US',
+        detect_language=False,
+        punctuate=True,
+        profanity_filter=False,
+        redact=None,
+        diarize=False,
+        diarize_version=None,
+        ner=True,
+        multichannel=False,
+        alternatives=1,
+        numerals=True,
+        search=None,
+        replace=None,
+        keywords=None,
+        paragraphs=False,
+        summarize=False,
+        detect_topics=False,
+        utterances=False,
+        utt_split=None,
+        show_all=False
+    ):
+        """
+        Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.
+
+        Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram <https://console.deepgram.com>`__. The API secret is a 40-character hexadecimal string that can only be retrieved at creation time. It is identified using a UUID, which is not the API secret and shouldn't be used here.
+
+        If ``show_all`` is false (the default), returns the transcript of the first alternative of the first channel as a string; otherwise, returns the raw API JSON response.
+
+        Raises a ``speech_recognition.UnknownValueError`` exception if the response contains no transcript. Raises a ``speech_recognition.RequestError`` exception if the request fails with an HTTP error or the connection cannot be established.
+
+        Details of the various features can be found in the `Deepgram Documentation <https://developers.deepgram.com/>`__.
+        """
+        assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
+        assert isinstance(key, str), "``key`` must be a string"
+        assert tier is None or (isinstance(tier, str) and tier in {'base', 'enhanced'}), "invalid ``tier``"
+        assert model is None or isinstance(model, str), "``model`` must be None or a string"
+        assert version is None or isinstance(version, str), "``version`` must be None or a string"
+        assert language is None or isinstance(language, str), "``language`` must be None or a string"
+        assert isinstance(detect_language, bool), "``detect_language`` must be a bool"
+        assert isinstance(punctuate, bool), "``punctuate`` must be a bool"
+        assert isinstance(profanity_filter, bool), "``profanity_filter`` must be a bool"
+        assert redact is None or isinstance(redact, str), "``redact`` must be None or a string"
+        assert isinstance(diarize, bool), "``diarize`` must be a bool"
+        assert diarize_version is None or isinstance(diarize_version, str), "``diarize_version`` must be None or a string"
+        assert isinstance(ner, bool), "``ner`` must be a bool"
+        assert isinstance(multichannel, bool), "``multichannel`` must be a bool"
+        assert isinstance(alternatives, int) and alternatives > 0, "``alternatives`` must be a positive integer"
+        assert isinstance(numerals, bool), "``numerals`` must be a bool"
+        assert search is None or (isinstance(search, list) and all(isinstance(s, str) for s in search)), "``search`` must be None or a list of strings"
+        assert replace is None or (isinstance(replace, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in replace.items())), "``replace`` must be None or a dictionary with string keys and values"
+        assert keywords is None or (isinstance(keywords, list) and all(isinstance(s, str) for s in keywords)), "``keywords`` must be None or a list of strings"
+        assert isinstance(paragraphs, bool), "``paragraphs`` must be a bool"
+        assert isinstance(summarize, bool), "``summarize`` must be a bool"
+        assert isinstance(detect_topics, bool), "``detect_topics`` must be a bool"
+        assert isinstance(utterances, bool), "``utterances`` must be a bool"
+        assert utt_split is None or (isinstance(utt_split, (int, float)) and utt_split > 0), "``utt_split`` must be None or positive real number"
+
+        # single-valued options: those left as None are omitted, and booleans are sent as lowercase "true"/"false"
+        options = (
+            ('tier', tier),
+            ('model', model),
+            ('version', version),
+            ('language', language),
+            ('detect_language', detect_language),
+            ('punctuate', punctuate),
+            ('profanity_filter', profanity_filter),
+            ('redact', redact),
+            ('diarize', diarize),
+            ('diarize_version', diarize_version),
+            ('ner', ner),
+            ('multichannel', multichannel),
+            ('alternatives', alternatives),
+            ('numerals', numerals),
+            ('paragraphs', paragraphs),
+            ('summarize', summarize),
+            ('detect_topics', detect_topics),
+            ('utterances', utterances),
+            ('utt_split', utt_split),
+        )
+        params = [
+            (name, str(value).lower() if isinstance(value, bool) else value)
+            for name, value in options if value is not None
+        ]
+        # multi-valued options are sent as one query parameter per entry
+        if search is not None:
+            params.extend(('search', term) for term in search)
+        if keywords is not None:
+            params.extend(('keywords', keyword) for keyword in keywords)
+        if replace is not None:
+            for find, substitute in replace.items():
+                # ':' separates the two halves, so inner colons are escaped; NOTE(review): urlencode() re-encodes the '%' - confirm the API expects that
+                params.append(('replace', '{}:{}'.format(find.replace(':', '%3a'), substitute.replace(':', '%3a'))))
+
+        headers = {'authorization': f'token {key}'}
+        url = 'https://api.deepgram.com/v1/listen?{}'.format(urlencode(params))
+        data = audio_data.get_wav_data()
+
+        request = Request(url, data, headers)
+        try:
+            response = urlopen(request, timeout=self.operation_timeout)
+        except HTTPError as e:
+            raise RequestError("recognition request failed: {}".format(e.reason))
+        except URLError as e:
+            raise RequestError("recognition connection failed: {}".format(e.reason))
+
+        try:
+            result = json.load(response)
+        finally:
+            response.close()  # always release the HTTP connection, even if the body is not valid JSON
+
+        if show_all:
+            return result
+
+        try:
+            return result['results']['channels'][0]['alternatives'][0]['transcript']
+        except (KeyError, IndexError, TypeError):  # the response carries no channel/alternative to read a transcript from
+            raise UnknownValueError()
+
+
 def get_flac_converter():
     """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
     flac_converter = shutil_which("flac")  # check for installed version first

diff --git a/tests/test_recognition.py b/tests/test_recognition.py
@@ -97,5 +97,12 @@ def test_whisper_chinese(self):
         with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
         self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")
 
+    @unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variables")  # skipped unless a Deepgram API secret is present in the environment
+    def test_deepgram(self):
+        r = sr.Recognizer()
+        with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)  # record audio from the English sample file
+        self.assertEqual(r.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"]), "123")  # default options (numerals=True) are expected to yield digits; presumably the clip says "one two three" - TODO confirm
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_special_features.py b/tests/test_special_features.py
@@ -25,6 +25,12 @@ def assertSameWords(self, tested, reference, msg=None):
         if set_tested != set_reference:
             raise self.failureException(msg if msg is not None else "%r doesn't consist of the same words as %r" % (tested, reference))
 
+    @unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variables")  # skipped unless a Deepgram API secret is present in the environment
+    def test_deepgram_keywords(self):
+        r = sr.Recognizer()
+        with sr.AudioFile(self.AUDIO_FILE_EN) as source: audio = r.record(source)  # record audio from the English sample file
+        self.assertEqual(r.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"], tier='base', keywords=['elephant:1000000']), "elephant elephant elephant")  # presumably the extreme boost forces every word to the keyword - TODO confirm against the API
+
 
 if __name__ == "__main__":
     unittest.main()