Merge pull request #428 from jhj0517/fix/faster-whisper

jhj0517 · web-flow · commit cd2c897c4ced · 2024-12-18T18:58:13.000+09:00
Install `faster-whisper` directly from repository
diff --git a/backend/requirements-backend.txt b/backend/requirements-backend.txt
@@ -1,17 +1,5 @@
 # Whisper-WebUI dependencies
---extra-index-url https://download.pytorch.org/whl/cu124
-torch
-torchaudio
-git+https://github.com/jhj0517/jhj0517-whisper.git
-faster-whisper==1.0.3
-transformers
-gradio
-gradio-i18n
-pytubefix
-ruamel.yaml==0.18.6
-pyannote.audio==3.3.2
-git+https://github.com/jhj0517/ultimatevocalremover_api.git
-git+https://github.com/jhj0517/pyrubberband.git
+-r ./../requirements.txt
 
 # Backend dependencies
 python-dotenv
diff --git a/configs/default_parameters.yaml b/configs/default_parameters.yaml
@@ -28,7 +28,7 @@ whisper:
   max_new_tokens: null
   hallucination_silence_threshold: null
   hotwords: null
-  language_detection_threshold: null
+  language_detection_threshold: 0.5
   language_detection_segments: 1
   add_timestamp: true
 
diff --git a/modules/vad/silero_vad.py b/modules/vad/silero_vad.py
@@ -4,11 +4,13 @@
 import numpy as np
 from typing import BinaryIO, Union, List, Optional, Tuple
 import warnings
+import bisect
 import faster_whisper
-from modules.whisper.data_classes import *
 from faster_whisper.transcribe import SpeechTimestampsMap
 import gradio as gr
 
+from modules.whisper.data_classes import *
+
 
 class SileroVAD:
     def __init__(self):
@@ -58,6 +60,7 @@ def run(self,
             vad_options=vad_parameters,
             progress=progress
         )
+
         audio = self.collect_chunks(audio, speech_chunks)
         duration_after_vad = audio.shape[0] / sampling_rate
 
@@ -94,35 +97,27 @@ def get_speech_timestamps(
         min_silence_duration_ms = vad_options.min_silence_duration_ms
         window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-        sampling_rate = 16000
-        min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
-        speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        min_speech_samples = self.sampling_rate * min_speech_duration_ms / 1000
+        speech_pad_samples = self.sampling_rate * speech_pad_ms / 1000
         max_speech_samples = (
-                sampling_rate * max_speech_duration_s
+                self.sampling_rate * max_speech_duration_s
                 - window_size_samples
                 - 2 * speech_pad_samples
         )
-        min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
-        min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
+        min_silence_samples = self.sampling_rate * min_silence_duration_ms / 1000
+        min_silence_samples_at_max_speech = self.sampling_rate * 98 / 1000
 
         audio_length_samples = len(audio)
 
-        state, context = self.model.get_initial_states(batch_size=1)
-
-        speech_probs = []
-        for current_start_sample in range(0, audio_length_samples, window_size_samples):
-            progress(current_start_sample/audio_length_samples, desc="Detecting speeches only using VAD...")
-
-            chunk = audio[current_start_sample: current_start_sample + window_size_samples]
-            if len(chunk) < window_size_samples:
-                chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
-            speech_probs.append(speech_prob)
+        padded_audio = np.pad(
+            audio, (0, window_size_samples - audio.shape[0] % window_size_samples)
+        )
+        speech_probs = self.model(padded_audio.reshape(1, -1)).squeeze(0)
 
         triggered = False
         speeches = []
         current_speech = {}
-        neg_threshold = threshold - 0.15
+        neg_threshold = vad_options.neg_threshold
 
         # to save potential segment end (and tolerate some silence)
         temp_end = 0
@@ -258,8 +253,23 @@ def restore_speech_timestamps(
         ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
 
         for segment in segments:
-            segment.start = ts_map.get_original_time(segment.start)
-            segment.end = ts_map.get_original_time(segment.end)
+            if segment.words:
+                words = []
+                for word in segment.words:
+                    # Ensure the word start and end times are resolved to the same chunk.
+                    middle = (word.start + word.end) / 2
+                    chunk_index = ts_map.get_chunk_index(middle)
+                    word.start = ts_map.get_original_time(word.start, chunk_index)
+                    word.end = ts_map.get_original_time(word.end, chunk_index)
+                    words.append(word)
+
+                segment.start = words[0].start
+                segment.end = words[-1].end
+                segment.words = words
+
+            else:
+                segment.start = ts_map.get_original_time(segment.start)
+                segment.end = ts_map.get_original_time(segment.end)
 
         return segments
 
diff --git a/modules/whisper/data_classes.py b/modules/whisper/data_classes.py
@@ -319,7 +319,7 @@ class WhisperParams(BaseParams):
     )
     hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")
     language_detection_threshold: Optional[float] = Field(
-        default=None,
+        default=0.5,
         description="Threshold for language detection probability"
     )
     language_detection_segments: int = Field(
diff --git a/notebook/whisper-webui.ipynb b/notebook/whisper-webui.ipynb
@@ -53,7 +53,7 @@
         "!git clone https://github.com/jhj0517/Whisper-WebUI.git\n",
         "%cd Whisper-WebUI\n",
         "!pip install git+https://github.com/jhj0517/jhj0517-whisper.git\n",
-        "!pip install faster-whisper==1.0.3\n",
+        "!pip install git+https://github.com/SYSTRAN/faster-whisper.git\n",
         "!pip install ctranslate2==4.4.0\n",
         "!pip install gradio\n",
         "!pip install gradio-i18n\n",
diff --git a/requirements.txt b/requirements.txt
@@ -8,7 +8,7 @@
 torch
 torchaudio
 git+https://github.com/jhj0517/jhj0517-whisper.git
-faster-whisper==1.0.3
+git+https://github.com/SYSTRAN/faster-whisper.git
 transformers
 gradio
 gradio-i18n

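Rendered diff table for `modules/whisper/data_classes.py` (the same hunk as shown in the unified diff above):
Original file line number	Diff line number	Diff line change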
`@@ -319,7 +319,7 @@ class WhisperParams(BaseParams):`
`319`	`319`	`)`
`320`	`320`	`hotwords: Optional[str] = Field(default=None, description="Hotwords/hint phrases for the model")`
`321`	`321`	`language_detection_threshold: Optional[float] = Field(`
`322`		`- default=None,`
	`322`	`+ default=0.5,`
`323`	`323`	`description="Threshold for language detection probability"`
`324`	`324`	`)`
`325`	`325`	`language_detection_segments: int = Field(`