Skip to content

Added Deepgram as a speech recognition provider. #631

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Speech recognition engine/API support:
* `Tensorflow <https://www.tensorflow.org/>`__
* `Vosk API <https://github.com/alphacep/vosk-api/>`__ (works offline)
* `OpenAI whisper <https://github.com/openai/whisper>`__ (works offline)
* `Deepgram <https://www.deepgram.com>`__

**Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.

Expand Down Expand Up @@ -377,6 +378,7 @@ Authors
tb0hdan (Bohdan Turkynewych)
Thynix <[email protected]> (Steve Dougherty)
beeedy <[email protected]> (Broderick Carlin)
ajsyp <[email protected]> (Adam Sypniewski)

Please report bugs and suggestions at the `issue tracker <https://github.com/Uberi/speech_recognition/issues>`__!

Expand Down
9 changes: 9 additions & 0 deletions examples/audio_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,12 @@
print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
print("Could not request results from IBM Speech to Text service; {0}".format(e))

# recognize speech using Deepgram Speech to Text
# Deepgram API secrets are 40-character lowercase hexadecimal strings.
DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API SECRET HERE"
try:
    deepgram_transcript = r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET)
    print("Deepgram thinks you said " + deepgram_transcript)
except sr.UnknownValueError:
    # the service answered, but no usable transcript was produced
    print("Deepgram could not understand audio")
except sr.RequestError as e:
    # the request itself failed (HTTP error or connection problem)
    print("Could not request results from Deepgram; {0}".format(e))
10 changes: 10 additions & 0 deletions examples/extended_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,13 @@
print("IBM Speech to Text could not understand audio")
except sr.RequestError as e:
print("Could not request results from IBM Speech to Text service; {0}".format(e))

# recognize speech using Deepgram Speech to Text
# Deepgram API key secrets are 40-character lowercase hexadecimal strings.
DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE"
try:
    print("Deepgram results:")
    # show_all=True asks for the raw API response instead of just the transcript
    deepgram_response = r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET, show_all=True)
    pprint(deepgram_response)
except sr.UnknownValueError:
    print("Deepgram could not understand audio")
except sr.RequestError as e:
    # the request itself failed (HTTP error or connection problem)
    print("Could not request results from Deepgram; {0}".format(e))
9 changes: 9 additions & 0 deletions examples/microphone_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,12 @@
print("Whisper could not understand audio")
except sr.RequestError as e:
print("Could not request results from Whisper")

# recognize speech using Deepgram Speech to Text
# Deepgram API key secrets are 40-character lowercase hexadecimal strings.
DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE"
try:
    deepgram_transcript = r.recognize_deepgram(audio, key=DEEPGRAM_API_SECRET)
    print("Deepgram thinks you said " + deepgram_transcript)
except sr.UnknownValueError:
    # the service answered, but no usable transcript was produced
    print("Deepgram could not understand audio")
except sr.RequestError as e:
    # the request itself failed (HTTP error or connection problem)
    print("Could not request results from Deepgram; {0}".format(e))
9 changes: 9 additions & 0 deletions examples/special_recognizer_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,12 @@
print("Google Cloud Speech could not understand audio")
except sr.RequestError as e:
print("Could not request results from Google Cloud Speech service; {0}".format(e))

# boost keyword detection in speech using Deepgram Speech to Text
# Deepgram API key secrets are 40-character lowercase hexadecimal strings.
DEEPGRAM_API_SECRET = "INSERT DEEPGRAM API KEY SECRET HERE"
try:
    # each keyword is passed as "<word>:<boost>"
    boosted_keywords = ['elephant:10']
    deepgram_transcript = r.recognize_deepgram(audio_en, key=DEEPGRAM_API_SECRET, keywords=boosted_keywords)
    print("Deepgram thinks you said " + deepgram_transcript)
except sr.UnknownValueError:
    print("Deepgram could not understand audio")
except sr.RequestError as e:
    # the request itself failed (HTTP error or connection problem)
    print("Could not request results from Deepgram; {0}".format(e))
11 changes: 11 additions & 0 deletions reference/library-reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,17 @@ You can translate the result to english with Whisper by passing translate=True

Other values are passed directly to whisper. See https://github.com/openai/whisper/blob/main/whisper/transcribe.py for all options

``recognizer_instance.recognize_deepgram(audio_data: AudioData, key: str, tier: Optional[str] = "enhanced", model: Optional[str] = "general", version: Optional[str] = "latest", language: Optional[str] = "en-US", detect_language: bool = False, punctuate: bool = True, profanity_filter: bool = False, redact: Optional[str] = None, diarize: bool = False, diarize_version: Optional[str] = None, ner: bool = True, multichannel: bool = False, alternatives: int = 1, numerals: bool = True, search: Optional[List[str]] = None, replace: Optional[Dict[str, str]] = None, keywords: Optional[List[str]] = None, paragraphs: bool = False, summarize: bool = False, detect_topics: bool = False, utterances: bool = False, utt_split: Optional[float] = None, show_all: bool = False) -> Union[str, Dict[str, Any]]``
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.

Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram <https://console.deepgram.com>`__. The API secret is a 40-character hexadecimal string that can only be retrieved at creation time. It is identified using a UUID, which is not the API secret and shouldn't be used here.

If ``show_all`` is false (the default), returns the most likely transcript string; otherwise, returns the raw API JSON response.

Details of the various features can be found in the `Deepgram Documentation <https://developers.deepgram.com/>`__.

``AudioSource``
---------------

Expand Down
126 changes: 126 additions & 0 deletions speech_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1726,6 +1726,132 @@ def recognize_vosk(self, audio_data, language='en'):

return finalRecognition

def recognize_deepgram(
    self,
    audio_data,
    key,
    tier='enhanced',
    model='general',
    version='latest',
    language='en-US',
    detect_language=False,
    punctuate=True,
    profanity_filter=False,
    redact=None,
    diarize=False,
    diarize_version=None,
    ner=True,
    multichannel=False,
    alternatives=1,
    numerals=True,
    search=None,
    replace=None,
    keywords=None,
    paragraphs=False,
    summarize=False,
    detect_topics=False,
    utterances=False,
    utt_split=None,
    show_all=False
):
    """
    Performs speech recognition of ``audio_data`` (an ``AudioData`` instance) using the Deepgram speech recognition API.

    Deepgram uses an API secret to authenticate users and authorize requests. To obtain an API secret, create an account with `Deepgram <https://console.deepgram.com>`__. The API secret is a 40-character hexadecimal string that can only be retrieved at creation time. It is identified using a UUID, which is not the API secret and shouldn't be used here.

    Every option other than ``audio_data``, ``key`` and ``show_all`` is forwarded to the API as a query parameter of the same name; options set to ``None`` are omitted from the request. ``search`` and ``keywords`` must be lists of strings and are sent as repeated query parameters. ``replace`` must be a dictionary mapping strings to strings and is sent as repeated ``replace=<key>:<value>`` parameters.

    If ``show_all`` is false (the default), returns the most likely transcript string; otherwise, returns the raw API JSON response.

    Raises a ``speech_recognition.UnknownValueError`` exception if the response does not contain a non-empty transcript. Raises a ``speech_recognition.RequestError`` exception if the HTTP request fails or the connection cannot be established.

    Details of the various features can be found in the `Deepgram Documentation <https://developers.deepgram.com/>`__.
    """
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert isinstance(key, str), "``key`` must be a string"
    assert tier is None or (isinstance(tier, str) and tier in {'base', 'enhanced'}), "invalid ``tier``"
    assert model is None or isinstance(model, str), "``model`` must be None or a string"
    assert version is None or isinstance(version, str), "``version`` must be None or a string"
    assert language is None or isinstance(language, str), "``language`` must be None or a string"
    assert isinstance(detect_language, bool), "``detect_language`` must be a bool"
    assert isinstance(punctuate, bool), "``punctuate`` must be a bool"
    assert isinstance(profanity_filter, bool), "``profanity_filter`` must be a bool"
    assert redact is None or isinstance(redact, str), "``redact`` must be None or a string"
    assert isinstance(diarize, bool), "``diarize`` must be a bool"
    assert diarize_version is None or isinstance(diarize_version, str), "``diarize_version`` must be None or a string"
    assert isinstance(ner, bool), "``ner`` must be a bool"
    assert isinstance(multichannel, bool), "``multichannel`` must be a bool"
    # bool is a subclass of int, so it has to be rejected explicitly; otherwise
    # ``alternatives=True`` would be serialised as the string "true" below
    assert isinstance(alternatives, int) and not isinstance(alternatives, bool) and alternatives > 0, "``alternatives`` must be a positive integer"
    assert isinstance(numerals, bool), "``numerals`` must be a bool"
    assert search is None or (isinstance(search, list) and all(isinstance(s, str) for s in search)), "``search`` must be None or a list of strings"
    assert replace is None or (isinstance(replace, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in replace.items())), "``replace`` must be None or a dictionary with string keys and values"
    assert keywords is None or (isinstance(keywords, list) and all(isinstance(s, str) for s in keywords)), "``keywords`` must be None or a list of strings"
    assert isinstance(paragraphs, bool), "``paragraphs`` must be a bool"
    assert isinstance(summarize, bool), "``summarize`` must be a bool"
    assert isinstance(detect_topics, bool), "``detect_topics`` must be a bool"
    assert isinstance(utterances, bool), "``utterances`` must be a bool"
    assert utt_split is None or (isinstance(utt_split, (int, float)) and not isinstance(utt_split, bool) and utt_split > 0), "``utt_split`` must be None or positive real number"

    def convert_bool(x):
        # booleans are sent as lowercase "true"/"false"; everything else is passed through
        if isinstance(x, bool):
            return str(x).lower()
        return x

    # single-valued options, in a fixed order; ``None`` means "do not send this option"
    single_valued = (
        ('tier', tier),
        ('model', model),
        ('version', version),
        ('language', language),
        ('detect_language', detect_language),
        ('punctuate', punctuate),
        ('profanity_filter', profanity_filter),
        ('redact', redact),
        ('diarize', diarize),
        ('diarize_version', diarize_version),
        ('ner', ner),
        ('multichannel', multichannel),
        ('alternatives', alternatives),
        ('numerals', numerals),
        ('paragraphs', paragraphs),
        ('summarize', summarize),
        ('detect_topics', detect_topics),
        ('utterances', utterances),
        ('utt_split', utt_split),
    )
    params = [(name, convert_bool(value)) for name, value in single_valued if value is not None]

    # multi-valued options are sent as repeated query parameters
    if search is not None:
        params.extend(('search', s) for s in search)
    if keywords is not None:
        params.extend(('keywords', k) for k in keywords)
    if replace is not None:
        for k, v in replace.items():
            # ':' separates the term from its replacement, so literal colons are escaped first.
            # NOTE(review): urlencode() below percent-encodes the '%' of '%3a' again - confirm
            # that this double encoding is what the API expects.
            k = k.replace(':', '%3a')
            v = v.replace(':', '%3a')
            params.append(('replace', f'{k}:{v}'))

    headers = {
        'authorization': f'token {key}',
        # the body is raw WAV data; without an explicit header urllib labels it as
        # ``application/x-www-form-urlencoded``
        'content-type': 'audio/wav',
    }
    url = 'https://api.deepgram.com/v1/listen?{}'.format(urlencode(params))
    data = audio_data.get_wav_data()

    request = Request(url, data, headers)
    try:
        response = urlopen(request, timeout=self.operation_timeout)
    except HTTPError as e:
        raise RequestError("recognition request failed: {}".format(e.reason))
    except URLError as e:
        raise RequestError("recognition connection failed: {}".format(e.reason))

    # make sure the connection is released even if the body is not valid JSON
    with response:
        result = json.load(response)

    if show_all:
        return result

    # a response without a usable transcript is reported the way the usage examples
    # expect ("could not understand audio") rather than leaking a KeyError/IndexError
    try:
        transcript = result['results']['channels'][0]['alternatives'][0]['transcript']
    except (KeyError, IndexError, TypeError):
        raise UnknownValueError()
    if not transcript:
        raise UnknownValueError()
    return transcript


def get_flac_converter():
"""Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found."""
flac_converter = shutil_which("flac") # check for installed version first
Expand Down
7 changes: 7 additions & 0 deletions tests/test_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,5 +97,12 @@ def test_whisper_chinese(self):
with sr.AudioFile(self.AUDIO_FILE_ZH) as source: audio = r.record(source)
self.assertEqual(r.recognize_whisper(audio, model="small", language="chinese", **self.WHISPER_CONFIG), u"砸自己的腳")

@unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variables")
def test_deepgram(self):
    # transcribe the English sample with the default Deepgram options
    recognizer = sr.Recognizer()
    with sr.AudioFile(self.AUDIO_FILE_EN) as source:
        audio = recognizer.record(source)
    transcript = recognizer.recognize_deepgram(audio, key=os.environ["DEEPGRAM_API_SECRET"])
    self.assertEqual(transcript, "123")


if __name__ == "__main__":
unittest.main()
6 changes: 6 additions & 0 deletions tests/test_special_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ def assertSameWords(self, tested, reference, msg=None):
if set_tested != set_reference:
raise self.failureException(msg if msg is not None else "%r doesn't consist of the same words as %r" % (tested, reference))

@unittest.skipUnless("DEEPGRAM_API_SECRET" in os.environ, "requires Deepgram API secret to be specified in DEEPGRAM_API_SECRET environment variables")
def test_deepgram_keywords(self):
    # a very large boost on "elephant" is expected to dominate the transcript
    recognizer = sr.Recognizer()
    with sr.AudioFile(self.AUDIO_FILE_EN) as source:
        audio = recognizer.record(source)
    transcript = recognizer.recognize_deepgram(
        audio,
        key=os.environ["DEEPGRAM_API_SECRET"],
        tier='base',
        keywords=['elephant:1000000'],
    )
    self.assertEqual(transcript, "elephant elephant elephant")


if __name__ == "__main__":
unittest.main()