Skip to content

Commit cd2c897

Browse files
authored
Merge pull request #428 from jhj0517/fix/faster-whisper
Install `faster-whisper` directly from repository
2 parents edc67ab + bdc4855 commit cd2c897

File tree

6 files changed

+36
-38
lines changed

6 files changed

+36
-38
lines changed

backend/requirements-backend.txt

+1-13
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,5 @@
11
# Whisper-WebUI dependencies
2-
--extra-index-url https://download.pytorch.org/whl/cu124
3-
torch
4-
torchaudio
5-
git+https://github.com/jhj0517/jhj0517-whisper.git
6-
faster-whisper==1.0.3
7-
transformers
8-
gradio
9-
gradio-i18n
10-
pytubefix
11-
ruamel.yaml==0.18.6
12-
pyannote.audio==3.3.2
13-
git+https://github.com/jhj0517/ultimatevocalremover_api.git
14-
git+https://github.com/jhj0517/pyrubberband.git
2+
-r ./../requirements.txt
153

164
# Backend dependencies
175
python-dotenv

configs/default_parameters.yaml

+1-1
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@ whisper:
2828
max_new_tokens: null
2929
hallucination_silence_threshold: null
3030
hotwords: null
31-
language_detection_threshold: null
31+
language_detection_threshold: 0.5
3232
language_detection_segments: 1
3333
add_timestamp: true
3434

modules/vad/silero_vad.py

+31-21
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,13 @@
44
import numpy as np
55
from typing import BinaryIO, Union, List, Optional, Tuple
66
import warnings
7+
import bisect
78
import faster_whisper
8-
from modules.whisper.data_classes import *
99
from faster_whisper.transcribe import SpeechTimestampsMap
1010
import gradio as gr
1111

12+
from modules.whisper.data_classes import *
13+
1214

1315
class SileroVAD:
1416
def __init__(self):
@@ -58,6 +60,7 @@ def run(self,
5860
vad_options=vad_parameters,
5961
progress=progress
6062
)
63+
6164
audio = self.collect_chunks(audio, speech_chunks)
6265
duration_after_vad = audio.shape[0] / sampling_rate
6366

@@ -94,35 +97,27 @@ def get_speech_timestamps(
9497
min_silence_duration_ms = vad_options.min_silence_duration_ms
9598
window_size_samples = self.window_size_samples
9699
speech_pad_ms = vad_options.speech_pad_ms
97-
sampling_rate = 16000
98-
min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
99-
speech_pad_samples = sampling_rate * speech_pad_ms / 1000
100+
min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
101+
speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
100102
max_speech_samples = (
101-
sampling_rate * max_speech_duration_s
103+
self.sampling_rate * max_speech_duration_s
102104
- window_size_samples
103105
- 2 * speech_pad_samples
104106
)
105-
min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
106-
min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
107+
min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
108+
min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
107109

108110
audio_length_samples = len(audio)
109111

110-
state, context = self.model.get_initial_states(batch_size=1)
111-
112-
speech_probs = []
113-
for current_start_sample in range(0, audio_length_samples, window_size_samples):
114-
progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
115-
116-
chunk = audio[current_start_sample: current_start_sample + window_size_samples]
117-
if len(chunk) < window_size_samples:
118-
chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
119-
speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
120-
speech_probs.append(speech_prob)
112+
padded_audio = np.pad(
113+
audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
114+
)
115+
speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
121116

122117
triggered = False
123118
speeches = []
124119
current_speech = {}
125-
neg_threshold = threshold - 0.15
120+
neg_threshold = vad_options.neg_threshold
126121

127122
# to save potential segment end (and tolerate some silence)
128123
temp_end = 0
@@ -258,8 +253,23 @@ def restore_speech_timestamps(
258253
ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
259254

260255
for segment in segments:
261-
segment.start = ts_map.get_original_time(segment.start)
262-
segment.end = ts_map.get_original_time(segment.end)
256+
if segment.words:
257+
words = []
258+
for word in segment.words:
259+
# Ensure the word start and end times are resolved to the same chunk.
260+
middle = (word.start + word.end) / 2
261+
chunk_index = ts_map.get_chunk_index(middle)
262+
word.start = ts_map.get_original_time(word.start, chunk_index)
263+
word.end = ts_map.get_original_time(word.end, chunk_index)
264+
words.append(word)
265+
266+
segment.start = words[0].start
267+
segment.end = words[-1].end
268+
segment.words = words
269+
270+
else:
271+
segment.start = ts_map.get_original_time(segment.start)
272+
segment.end = ts_map.get_original_time(segment.end)
263273

264274
return segments
265275

modules/whisper/data_classes.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -319,7 +319,7 @@ class WhisperParams(BaseParams):
319319
)
320320
hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
321321
language_detection_threshold: Optional[float] = Field(
322-
default=None,
322+
default=0.5,
323323
description="Threshold for language detection probability"
324324
)
325325
language_detection_segments: int = Field(

notebook/whisper-webui.ipynb

+1-1
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@
5353
"!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
5454
"%cd Whisper-WebUI\n",
5555
"!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
56-
"!pip install faster-whisper==1.0.3\n",
56+
"!pip install git+https://github.com/SYSTRAN/faster-whisper.git\n",
5757
"!pip install ctranslate2==4.4.0\n",
5858
"!pip install gradio\n",
5959
"!pip install gradio-i18n\n",

requirements.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
torch
99
torchaudio
1010
git+https://github.com/jhj0517/jhj0517-whisper.git
11-
faster-whisper==1.0.3
11+
git+https://github.com/SYSTRAN/faster-whisper.git
1212
transformers
1313
gradio
1414
gradio-i18n

0 commit comments

Comments
 (0)