-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path gujarati_speech_recognition.py
40 lines (34 loc) · 1.27 KB
/
gujarati_speech_recognition.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import numpy as np
import wave
from deepspeech import Model
from time import time
# Time the initialization phase so we can report how long loading the
# acoustic model and the external language model takes.
load_started = time()

# --- Beam-search decoder hyperparameters ---
# Beam width used in the CTC decoder when building candidate transcriptions.
BEAM_WIDTH = 500
# Language-model weight (the alpha hyperparameter of the CTC decoder).
LM_ALPHA = 0.75
# Word-insertion bonus (the beta hyperparameter of the CTC decoder).
LM_BETA = 1.85

# --- Input-geometry constants ---
# These are tied to the shape of the trained graph (changing them changes the
# geometry of the first layer), so they must match the values used in training.
# Number of MFCC features per timestep.
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector.
N_CONTEXT = 9

# Paths to the Gujarati model artifacts, all rooted at model_dir.
model_dir = 'gu_model'
graph_path = '{}/output_graph.pb'.format(model_dir)
alphabet_path = '{}/alphabet.txt'.format(model_dir)
lm_path = '{}/lm.binary'.format(model_dir)
trie_path = '{}/trie'.format(model_dir)

# Load the acoustic model, then attach the external language model and trie
# so the beam-search decoder can apply LM scoring.
ds = Model(graph_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_ALPHA, LM_BETA)

load_finished = time()
print("Saved Time: ", load_finished - load_started)
# Interactive transcription loop: repeatedly prompt for a WAV file path and
# print the DeepSpeech transcript. There is no quit command; terminate with
# Ctrl-C / EOF, matching the original script's behavior.
while True:
    # Keep the path and the decoded samples in separate variables
    # (the original rebound one name for both).
    wav_path = input("Input file path: ")
    try:
        with wave.open(wav_path, 'rb') as fin:
            # Read the file's actual sample rate instead of hard-coding 16000;
            # DeepSpeech's stt() takes the rate of the buffer it is given.
            sample_rate = fin.getframerate()
            # WAV PCM frames are little-endian signed 16-bit samples.
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    except (FileNotFoundError, wave.Error) as exc:
        # Bad path or not a valid WAV file: report and re-prompt
        # instead of crashing the whole loop.
        print("Could not read audio:", exc)
        continue
    if sample_rate != 16000:
        # The model was trained on 16 kHz audio; other rates will degrade
        # accuracy, so flag them but still attempt transcription.
        print("Warning: expected 16000 Hz audio, got", sample_rate, "Hz")
    print('Transcript: ', ds.stt(audio, sample_rate))