examples : update vad support in stream example [no ci]

danbev · danbev · commit b92470639b32 · 2025-05-16T15:28:44.000+02:00
wip
diff --git a/examples/stream/README.md b/examples/stream/README.md
@@ -1,51 +1,41 @@
-# whisper.cpp/examples/stream
-
-This is a naive example of performing real-time inference on audio from your microphone.
-The `whisper-stream` tool samples the audio every half a second and runs the transcription continously.
-More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
-
-```bash
-./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
-```
-
-https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
-
-## Sliding window mode with VAD
-
-Setting the `--step` argument to `0` enables the sliding window mode:
-
-```bash
- ./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 6 --step 0 --length 30000 -vth 0.6
-```
-
-In this mode, the tool will transcribe only after some speech activity is detected. A very
-basic VAD detector is used, but in theory a more sophisticated approach can be added. The
-`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
-It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
-When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
-a transcription block that is suitable for parsing.
-
-## Building
-
-The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
-
-```bash
-# Install SDL2
-# On Debian based linux distributions:
-sudo apt-get install libsdl2-dev
-
-# On Fedora Linux:
-sudo dnf install SDL2 SDL2-devel
-
-# Install SDL2 on Mac OS
-brew install sdl2
-
-cmake -B build -DWHISPER_SDL2=ON
-cmake --build build --config Release
-
-./build/bin/whisper-stream
-```
-
-## Web version
-
-This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
+# whisper.cpp/examples/stream
+
+This is a naive example of performing real-time inference on audio from your microphone.
+The `whisper-stream` tool samples the audio every half a second and runs the transcription continuously.
+More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
+
+```bash
+./build/bin/whisper-stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
+```
+
+https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
+
+## VAD support
+
+VAD support can be enabled by passing the `--vad` flag and, optionally, a `--vad-model` path (by
+default `models/for-tests-silero-v5.1.2-ggml.bin` is used).
+
+## Building
+
+The `whisper-stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2
+# On Debian based linux distributions:
+sudo apt-get install libsdl2-dev
+
+# On Fedora Linux:
+sudo dnf install SDL2 SDL2-devel
+
+# Install SDL2 on Mac OS
+brew install sdl2
+
+cmake -B build -DWHISPER_SDL2=ON
+cmake --build build --config Release
+
+./build/bin/whisper-stream
+```
+
+## Web version
+
+This tool can also run in the browser: [examples/stream.wasm](/examples/stream.wasm)
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
@@ -25,9 +25,6 @@ struct whisper_params {
     int32_t audio_ctx  = 0;
     int32_t beam_size  = -1;
 
-    float vad_thold    = 0.6f;
-    float freq_thold   = 100.0f;
-
     bool translate     = false;
     bool no_fallback   = false;
     bool print_special = false;
@@ -37,10 +34,21 @@ struct whisper_params {
     bool save_audio    = false; // save audio to wav file
     bool use_gpu       = true;
     bool flash_attn    = false;
+    bool no_prints     = false;
 
     std::string language  = "en";
     std::string model     = "models/ggml-base.en.bin";
     std::string fname_out;
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad                         = false;
+    std::string vad_model                   = "models/for-tests-silero-v5.1.2-ggml.bin";
+    float       vad_threshold               = 0.5f;
+    int         vad_min_speech_duration_ms  = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s   = FLT_MAX;
+    int         vad_speech_pad_ms           = 30;
+    float       vad_samples_overlap         = 0.1f;
 };
 
 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -61,8 +69,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-mt"   || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
         else if (arg == "-ac"   || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
         else if (arg == "-bs"   || arg == "--beam-size")     { params.beam_size     = std::stoi(argv[++i]); }
-        else if (arg == "-vth"  || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-        else if (arg == "-fth"  || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
         else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
         else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
         else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
@@ -74,7 +80,16 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-sa"   || arg == "--save-audio")    { params.save_audio    = true; }
         else if (arg == "-ng"   || arg == "--no-gpu")        { params.use_gpu       = false; }
         else if (arg == "-fa"   || arg == "--flash-attn")    { params.flash_attn    = true; }
-
+        else if (arg == "-np"   || arg == "--no-prints")     { params.no_prints     = true; }
+        // Voice Activity Detection (VAD)
+        else if (                  arg == "--vad")                         { params.vad                         = true; }
+        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = argv[++i]; }
+        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(argv[++i]); }
+        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms  = std::stoi(argv[++i]); }
+        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_silence_duration_ms = std::stoi(argv[++i]); }
+        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s   = std::stof(argv[++i]); }
+        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms           = std::stoi(argv[++i]); }
+        else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap         = std::stof(argv[++i]); }
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params);
@@ -99,8 +114,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",       params.max_tokens);
     fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
     fprintf(stderr, "  -bs N,    --beam-size N   [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",           params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                   params.freq_thold);
     fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
     fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
     fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
@@ -112,6 +125,19 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -sa,      --save-audio    [%-7s] save the recorded audio to a file\n",              params.save_audio ? "true" : "false");
     fprintf(stderr, "  -ng,      --no-gpu        [%-7s] disable GPU inference\n",                          params.use_gpu ? "false" : "true");
     fprintf(stderr, "  -fa,      --flash-attn    [%-7s] flash attention during inference\n",               params.flash_attn ? "true" : "false");
+    fprintf(stderr, "  -np,       --no-prints    [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n",            params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n",                                   params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n",           params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms  N [%-7d] VAD min speech duration in ms\n",                    params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n",     params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s   N [%-7s] VAD max speech duration (auto-split longer)\n",      params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                                                                                  std::string("FLT_MAX").c_str() :
+                                                                                                                                  std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms           N [%-7d] VAD speech padding (extend segments)\n",             params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap         N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
     fprintf(stderr, "\n");
 }
 
@@ -122,20 +148,22 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.no_prints) {
+        whisper_log_set([](enum ggml_log_level, const char*, void*) { }, NULL);
+    }
+
     params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
     params.length_ms = std::max(params.length_ms, params.step_ms);
 
+
     const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
     const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
     const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
     const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
 
-    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
-
-    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
-
-    params.no_timestamps  = !use_vad;
-    params.no_context    |= use_vad;
+    const int n_new_line = (!params.vad && params.step_ms > 0) ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line (guarded: step_ms <= 0 would divide by zero)
+    params.no_timestamps  = !params.vad;
+    params.no_context    |= params.vad;
     params.max_tokens     = 0;
 
     // init audio
@@ -189,7 +217,7 @@ int main(int argc, char ** argv) {
                 params.translate ? "translate" : "transcribe",
                 params.no_timestamps ? 0 : 1);
 
-        if (!use_vad) {
+        if (!params.vad) {
             fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
         } else {
             fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
@@ -242,7 +270,7 @@ int main(int argc, char ** argv) {
 
         // process new audio
 
-        if (!use_vad) {
+        if (!params.vad) {
             while (true) {
                 // handle Ctrl + C
                 is_running = sdl_poll_events();
@@ -270,7 +298,7 @@ int main(int argc, char ** argv) {
             // take up to params.length_ms audio from previous iteration
             const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
 
-            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
+            //fprintf(stdout, "processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
 
             pcmf32.resize(n_samples_new + n_samples_take);
 
@@ -285,22 +313,29 @@ int main(int argc, char ** argv) {
             const auto t_now  = std::chrono::high_resolution_clock::now();
             const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
 
-            if (t_diff < 2000) {
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
+            if (t_diff < params.step_ms) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(params.step_ms));
                 continue;
             }
 
-            audio.get(2000, pcmf32_new);
+            // Get new audio for this step
+            audio.get(params.step_ms, pcmf32_new);
 
-            if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
-                audio.get(params.length_ms, pcmf32);
-            } else {
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            // Calculate how much old audio to keep
+            const int n_samples_new = pcmf32_new.size();
+            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
 
-                continue;
+            // Combine old + new audio with overlap
+            pcmf32.resize(n_samples_new + n_samples_take);
+
+            // Copy kept portion from previous iteration
+            for (int i = 0; i < n_samples_take; i++) {
+                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
             }
 
+            // Append new audio
+            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new * sizeof(float));
+
             t_last = t_now;
         }
 
@@ -313,7 +348,6 @@ int main(int argc, char ** argv) {
             wparams.print_realtime   = false;
             wparams.print_timestamps = !params.no_timestamps;
             wparams.translate        = params.translate;
-            wparams.single_segment   = !use_vad;
             wparams.max_tokens       = params.max_tokens;
             wparams.language         = params.language.c_str();
             wparams.n_threads        = params.n_threads;
@@ -330,27 +364,30 @@ int main(int argc, char ** argv) {
             wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
             wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
 
+            wparams.vad            = params.vad;
+            wparams.vad_model_path = params.vad_model.c_str();
+
+            wparams.vad_params.threshold               = params.vad_threshold;
+            wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+            wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+            wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+            wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+            wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
             }
 
             // print result;
             {
-                if (!use_vad) {
+                if (!params.vad) {
                     printf("\33[2K\r");
 
                     // print long empty line to clear the previous line
                     printf("%s", std::string(100, ' ').c_str());
 
                     printf("\33[2K\r");
-                } else {
-                    const int64_t t1 = (t_last - t_start).count()/1000000;
-                    const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
-
-                    printf("\n");
-                    printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
-                    printf("\n");
                 }
 
                 const int n_segments = whisper_full_n_segments(ctx);
@@ -389,15 +426,11 @@ int main(int argc, char ** argv) {
                     fout << std::endl;
                 }
 
-                if (use_vad) {
-                    printf("\n");
-                    printf("### Transcription %d END\n", n_iter);
-                }
             }
 
             ++n_iter;
 
-            if (!use_vad && (n_iter % n_new_line) == 0) {
+            if (!params.vad && (n_iter % n_new_line) == 0) {
                 printf("\n");
 
                 // keep part of the audio for next iteration to try to mitigate word boundary issues